usr/src/uts/common/dtrace/dtrace.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  25  */
  26
  27 /*
  28  * DTrace - Dynamic Tracing for Solaris
  29  *
  30  * This is the implementation of the Solaris Dynamic Tracing framework
  31  * (DTrace).  The user-visible interface to DTrace is described at length in
  32  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
  33  * library, the in-kernel DTrace framework, and the DTrace providers are
  34  * described in the block comments in the <sys/dtrace.h> header file.  The
  35  * internal architecture of DTrace is described in the block comments in the
  36  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
  37  * implementation very much assume mastery of all of these sources; if one has
  38  * an unanswered question about the implementation, one should consult them
  39  * first.
  40  *
  41  * The functions here are ordered roughly as follows:
  42  *
  43  *   - Probe context functions
  44  *   - Probe hashing functions
  45  *   - Non-probe context utility functions
  46  *   - Matching functions
  47  *   - Provider-to-Framework API functions
  48  *   - Probe management functions
  49  *   - DIF object functions
  50  *   - Format functions
  51  *   - Predicate functions
  52  *   - ECB functions
  53  *   - Buffer functions
  54  *   - Enabling functions
  55  *   - DOF functions
  56  *   - Anonymous enabling functions
  57  *   - Consumer state functions
  58  *   - Helper functions
  59  *   - Hook functions
  60  *   - Driver cookbook functions
  61  *
  62  * Each group of functions begins with a block comment labelled the "DTrace
  63  * [Group] Functions", allowing one to find each block by searching forward
  64  * on capital-f functions.
  65  */
  66 #include <sys/errno.h>
  67 #include <sys/stat.h>
  68 #include <sys/modctl.h>
  69 #include <sys/conf.h>
  70 #include <sys/systm.h>
  71 #include <sys/ddi.h>
  72 #include <sys/sunddi.h>
  73 #include <sys/cpuvar.h>
  74 #include <sys/kmem.h>
  75 #include <sys/strsubr.h>
  76 #include <sys/sysmacros.h>
  77 #include <sys/dtrace_impl.h>
  78 #include <sys/atomic.h>
  79 #include <sys/cmn_err.h>
  80 #include <sys/mutex_impl.h>
  81 #include <sys/rwlock_impl.h>
  82 #include <sys/ctf_api.h>
  83 #include <sys/panic.h>
  84 #include <sys/priv_impl.h>
  85 #include <sys/policy.h>
  86 #include <sys/cred_impl.h>
  87 #include <sys/procfs_isa.h>
  88 #include <sys/taskq.h>
  89 #include <sys/mkdev.h>
  90 #include <sys/kdi.h>
  91 #include <sys/zone.h>
  92 #include <sys/socket.h>
  93 #include <netinet/in.h>
  94
  95 /*
  96  * DTrace Tunable Variables
  97  *
  98  * The following variables may be tuned by adding a line to /etc/system that
  99  * includes both the name of the DTrace module ("dtrace") and the name of the
 100  * variable.  For example:
 101  *
 102  *   set dtrace:dtrace_destructive_disallow = 1
 103  *
 104  * In general, the only variables that one should be tuning this way are those
 105  * that affect system-wide DTrace behavior, and for which the default behavior
 106  * is undesirable.  Most of these variables are tunable on a per-consumer
 107  * basis using DTrace options, and need not be tuned on a system-wide basis.
 108  * When tuning these variables, avoid pathological values; while some attempt
 109  * is made to verify the integrity of these variables, they are not considered
 110  * part of the supported interface to DTrace, and they are therefore not
 111  * checked comprehensively.  Further, these variables should not be tuned
 112  * dynamically via "mdb -kw" or other means; they should only be tuned via
 113  * /etc/system.
 114  */
/*
 * Note on units: the *rate* tunables below hold a period rather than a
 * frequency.  The existing annotations pair 9900990 with 101 hz and 200000
 * with 5000 hz, and NANOSEC with 1 hz / 1000 ms, so the values are periods in
 * nanoseconds.  The *size tunables are plain counts; presumably bytes --
 * TODO confirm against the option handling code.
 */
int             dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t          dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (256 * 1024);
size_t          dtrace_global_maxsize = (16 * 1024);
size_t          dtrace_actions_max = (16 * 1024);
size_t          dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 32;
dtrace_optval_t dtrace_helper_providers_max = 32;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t          dtrace_strsize_default = 256;
dtrace_optval_t dtrace_cleanrate_default = 9900990;             /* 101 hz */
dtrace_optval_t dtrace_cleanrate_min = 200000;                  /* 5000 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;  /* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC;               /* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC;            /* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;  /* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC;            /* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int             dtrace_msgdsize_max = 128;
hrtime_t        dtrace_chill_max = 500 * (NANOSEC / MILLISEC);  /* 500 ms */
hrtime_t        dtrace_chill_interval = NANOSEC;                /* 1000 ms */
int             dtrace_devdepth_max = 32;
int             dtrace_err_verbose;     /* no initializer: starts at zero */
hrtime_t        dtrace_deadman_interval = NANOSEC;              /* 1000 ms */
hrtime_t        dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; /* 10 s */
hrtime_t        dtrace_deadman_user = (hrtime_t)30 * NANOSEC;   /* 30 s */
hrtime_t        dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC; /* 60 s */
 148
 149 /*
 150  * DTrace External Variables
 151  *
 152  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 153  * available to DTrace consumers via the backtick (`) syntax.  One of these,
 154  * dtrace_zero, is made deliberately so:  it is provided as a source of
 155  * well-known, zero-filled memory.  While this variable is not documented,
 156  * it is used by some translators as an implementation detail.
 157  */
 158 const char      dtrace_zero[256] = { 0 };       /* zero-filled memory */
 159
 160 /*
 161  * DTrace Internal Variables
 162  */
 163 static dev_info_t       *dtrace_devi;           /* device info */
 164 static vmem_t           *dtrace_arena;          /* probe ID arena */
 165 static vmem_t           *dtrace_minor;          /* minor number arena */
 166 static taskq_t          *dtrace_taskq;          /* task queue */
 167 static dtrace_probe_t   **dtrace_probes;        /* array of all probes */
 168 static int              dtrace_nprobes;         /* number of probes */
 169 static dtrace_provider_t *dtrace_provider;      /* provider list */
 170 static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
 171 static int              dtrace_opens;           /* number of opens */
 172 static int              dtrace_helpers;         /* number of helpers */
 173 static void             *dtrace_softstate;      /* softstate pointer */
 174 static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
 175 static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
 176 static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
 177 static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */
 178 static int              dtrace_toxranges;       /* number of toxic ranges */
 179 static int              dtrace_toxranges_max;   /* size of toxic range array */
 180 static dtrace_anon_t    dtrace_anon;            /* anonymous enabling */
 181 static kmem_cache_t     *dtrace_state_cache;    /* cache for dynamic state */
 182 static uint64_t         dtrace_vtime_references; /* number of vtimestamp refs */
 183 static kthread_t        *dtrace_panicked;       /* panicking thread */
 184 static dtrace_ecb_t     *dtrace_ecb_create_cache; /* cached created ECB */
 185 static dtrace_genid_t   dtrace_probegen;        /* current probe generation */
 186 static dtrace_helpers_t *dtrace_deferred_pid;   /* deferred helper list */
 187 static dtrace_enabling_t *dtrace_retained;      /* list of retained enablings */
 188 static dtrace_genid_t   dtrace_retained_gen;    /* current retained enab gen */
 189 static dtrace_dynvar_t  dtrace_dynhash_sink;    /* end of dynamic hash chains */
 190 static int              dtrace_dynvar_failclean; /* dynvars failed to clean */
 191
 192 /*
 193  * DTrace Locking
 194  * DTrace is protected by three (relatively coarse-grained) locks:
 195  *
 196  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 197  *     including enabling state, probes, ECBs, consumer state, helper state,
 198  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 199  *     probe context is lock-free -- synchronization is handled via the
 200  *     dtrace_sync() cross call mechanism.
 201  *
 202  * (2) dtrace_provider_lock is required when manipulating provider state, or
 203  *     when provider state must be held constant.
 204  *
 205  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 206  *     when meta provider state must be held constant.
 207  *
 208  * The lock ordering between these three locks is dtrace_meta_lock before
 209  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 210  * several places where dtrace_provider_lock is held by the framework as it
 211  * calls into the providers -- which then call back into the framework,
 212  * grabbing dtrace_lock.)
 213  *
 214  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 215  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 216  * role as a coarse-grained lock; it is acquired before both of these locks.
 217  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 218  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 219  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 220  * acquired _between_ dtrace_provider_lock and dtrace_lock.
 221  */
 222 static kmutex_t         dtrace_lock;            /* probe state lock */
 223 static kmutex_t         dtrace_provider_lock;   /* provider state lock */
 224 static kmutex_t         dtrace_meta_lock;       /* meta-provider state lock */
 225
 226 /*
 227  * DTrace Provider Variables
 228  *
 229  * These are the variables relating to DTrace as a provider (that is, the
 230  * provider of the BEGIN, END, and ERROR probes).
 231  */
/*
 * Stability attributes advertised by the framework's own provider.  Each row
 * is a { stability, stability, class } triple.
 * NOTE(review): the five rows presumably map, in order, to the provider,
 * module, function, name and argument attributes of dtrace_pattr_t -- that
 * type is declared outside this file, so confirm before reordering rows.
 */
static dtrace_pattr_t   dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};
 239
 240 static void
 241 dtrace_nullop(void)
 242 {}
 243
 244 static int
 245 dtrace_enable_nullop(void)
 246 {
 247         return (0);
 248 }
 249
/*
 * Operations vector for the framework's own provider.  Every populated slot
 * points at one of the two do-nothing functions, cast to the prototype that
 * slot expects; the single int-returning slot uses dtrace_enable_nullop() so
 * that it yields 0.  Three slots are left NULL.
 *
 * NOTE(review): slot names and order come from dtrace_pops_t, which is
 * declared outside this file -- confirm against its definition before
 * adding or reordering entries.
 * NOTE(review): calling a function through a pointer cast to a different
 * prototype is formally undefined in ISO C; this table relies on the platform
 * calling convention tolerating the ignored arguments.
 */
static dtrace_pops_t    dtrace_provider_ops = {
        (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
        (void (*)(void *, struct modctl *))dtrace_nullop,
        (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
        (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
        (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
        (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
        NULL,
        NULL,
        NULL,
        (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};
 262
 263 static dtrace_id_t      dtrace_probeid_begin;   /* special BEGIN probe */
 264 static dtrace_id_t      dtrace_probeid_end;     /* special END probe */
 265 dtrace_id_t             dtrace_probeid_error;   /* special ERROR probe */
 266
 267 /*
 268  * DTrace Helper Tracing Variables
 269  */
 270 uint32_t dtrace_helptrace_next = 0;
 271 uint32_t dtrace_helptrace_nlocals;
 272 char    *dtrace_helptrace_buffer;
 273 int     dtrace_helptrace_bufsize = 512 * 1024;
 274
 275 #ifdef DEBUG
 276 int     dtrace_helptrace_enabled = 1;
 277 #else
 278 int     dtrace_helptrace_enabled = 0;
 279 #endif
 280
 281 /*
 282  * DTrace Error Hashing
 283  *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table.  This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation.  The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 289  */
 290 #ifdef DEBUG
 291 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
 292 static const char *dtrace_errlast;
 293 static kthread_t *dtrace_errthread;
 294 static kmutex_t dtrace_errlock;
 295 #endif
 296
 297 /*
 298  * DTrace Macros and Constants
 299  *
 300  * These are various macros that are useful in various spots in the
 301  * implementation, along with a few random constants that have no meaning
 302  * outside of the implementation.  There is no real structure to this cpp
 303  * mishmash -- but is there ever?
 304  */
 305 #define DTRACE_HASHSTR(hash, probe)     \
 306         dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
 307
 308 #define DTRACE_HASHNEXT(hash, probe)    \
 309         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
 310
 311 #define DTRACE_HASHPREV(hash, probe)    \
 312         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
 313
 314 #define DTRACE_HASHEQ(hash, lhs, rhs)   \
 315         (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
 316             *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
 317
 318 #define DTRACE_AGGHASHSIZE_SLEW         17
 319
 320 #define DTRACE_V4MAPPED_OFFSET          (sizeof (uint32_t) * 3)
 321
 322 /*
 323  * The key for a thread-local variable consists of the lower 61 bits of the
 324  * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 325  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 326  * equal to a variable identifier.  This is necessary (but not sufficient) to
 327  * assure that global associative arrays never collide with thread-local
 328  * variables.  To guarantee that they cannot collide, we must also define the
 329  * order for keying dynamic variables.  That order is:
 330  *
 331  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 332  *
 333  * Because the variable-key and the tls-key are in orthogonal spaces, there is
 334  * no way for a global variable key signature to match a thread-local key
 335  * signature.
 336  */
/*
 * DTRACE_TLS_THRKEY(where): compute the thread-local key and assign it to the
 * lvalue 'where'.
 *
 * The loop shifts CPU->cpu_intr_actv right by (LOCK_LEVEL + 1) and then counts
 * how many further single-bit shifts are needed to reach zero, so 'intr' ends
 * up as the 1-based position of the highest bit still set (0 if none).  The
 * ASSERT requires that this fits in three bits.  The result is
 * (t_did + DIF_VARIABLE_MAX) masked to its low 61 bits, with 'intr' placed in
 * bits 61-63.
 *
 * The expansion is a bare brace block that declares locals named 'intr' and
 * 'actv'; the 'where' expression must therefore not refer to caller variables
 * with those names.
 */
#define DTRACE_TLS_THRKEY(where) { \
        uint_t intr = 0; \
        uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
        for (; actv; actv >>= 1) \
                intr++; \
        ASSERT(intr < (1 << 3)); \
        (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
            (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
 346
 347 #define DT_BSWAP_8(x)   ((x) & 0xff)
 348 #define DT_BSWAP_16(x)  ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
 349 #define DT_BSWAP_32(x)  ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
 350 #define DT_BSWAP_64(x)  ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
 351
 352 #define DT_MASK_LO 0x00000000FFFFFFFFULL
 353
 354 #define DTRACE_STORE(type, tomax, offset, what) \
 355         *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
 356
 357 #ifndef __i386
 358 #define DTRACE_ALIGNCHECK(addr, size, flags)                            \
 359         if (addr & (size - 1)) {                                        \
 360                 *flags |= CPU_DTRACE_BADALIGN;                          \
 361                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 362                 return (0);                                             \
 363         }
 364 #else
 365 #define DTRACE_ALIGNCHECK(addr, size, flags)
 366 #endif
 367
 368 /*
 369  * Test whether a range of memory starting at testaddr of size testsz falls
 370  * within the range of memory described by addr, sz.  We take care to avoid
 371  * problems with overflow and underflow of the unsigned quantities, and
 372  * disallow all negative sizes.  Ranges of size 0 are allowed.
 373  */
 374 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
 375         ((testaddr) - (baseaddr) < (basesz) && \
 376         (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
 377         (testaddr) + (testsz) >= (testaddr))
 378
 379 /*
 380  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 381  * alloc_sz on the righthand side of the comparison in order to avoid overflow
 382  * or underflow in the comparison with it.  This is simpler than the INRANGE
 383  * check above, because we know that the dtms_scratch_ptr is valid in the
 384  * range.  Allocations of size zero are allowed.
 385  */
 386 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
 387         ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
 388         (mstate)->dtms_scratch_ptr >= (alloc_sz))
 389
/*
 * DTRACE_LOADFUNC(bits) expands to the definition of
 *
 *      uint<bits>_t dtrace_load<bits>(uintptr_t addr)
 *
 * a guarded load of a <bits>-wide value from 'addr'.  The generated function:
 *
 *   1. Runs DTRACE_ALIGNCHECK, which (where enabled) flags
 *      CPU_DTRACE_BADALIGN and returns 0 for a misaligned address.
 *   2. Walks dtrace_toxrange[0 .. dtrace_toxranges).  A range is skipped when
 *      addr is at or beyond its dtt_limit, or when addr + size is at or
 *      below its dtt_base; otherwise the access overlaps the range, so
 *      CPU_DTRACE_BADADDR is flagged, addr is recorded in cpuc_dtrace_illval,
 *      and 0 is returned.
 *   3. Sets CPU_DTRACE_NOFAULT in the per-CPU flags, performs the load through
 *      a volatile pointer, then clears CPU_DTRACE_NOFAULT.
 *   4. Returns the loaded value, or 0 if CPU_DTRACE_FAULT is set in the flags
 *      afterwards.  (Presumably the trap path sets CPU_DTRACE_FAULT while
 *      NOFAULT is asserted -- that code is not in this file section.)
 *
 * A return of 0 is therefore ambiguous on its own; callers distinguish a
 * failed load by inspecting the per-CPU flags.
 */
#define DTRACE_LOADFUNC(bits)                                           \
/*CSTYLED*/                                                             \
uint##bits##_t                                                          \
dtrace_load##bits(uintptr_t addr)                                       \
{                                                                       \
        size_t size = bits / NBBY;                                      \
        /*CSTYLED*/                                                     \
        uint##bits##_t rval;                                            \
        int i;                                                          \
        volatile uint16_t *flags = (volatile uint16_t *)                \
            &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;                   \
                                                                        \
        DTRACE_ALIGNCHECK(addr, size, flags);                           \
                                                                        \
        for (i = 0; i < dtrace_toxranges; i++) {                        \
                if (addr >= dtrace_toxrange[i].dtt_limit)               \
                        continue;                                       \
                                                                        \
                if (addr + size <= dtrace_toxrange[i].dtt_base)         \
                        continue;                                       \
                                                                        \
                /*                                                      \
                 * This address falls within a toxic region; return 0.  \
                 */                                                     \
                *flags |= CPU_DTRACE_BADADDR;                           \
                cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
                return (0);                                             \
        }                                                               \
                                                                        \
        *flags |= CPU_DTRACE_NOFAULT;                                   \
        /*CSTYLED*/                                                     \
        rval = *((volatile uint##bits##_t *)addr);                      \
        *flags &= ~CPU_DTRACE_NOFAULT;                                  \
                                                                        \
        return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);               \
}
 426
 427 #ifdef _LP64
 428 #define dtrace_loadptr  dtrace_load64
 429 #else
 430 #define dtrace_loadptr  dtrace_load32
 431 #endif
 432
 433 #define DTRACE_DYNHASH_FREE     0
 434 #define DTRACE_DYNHASH_SINK     1
 435 #define DTRACE_DYNHASH_VALID    2
 436
 437 #define DTRACE_MATCH_FAIL       -1
 438 #define DTRACE_MATCH_NEXT       0
 439 #define DTRACE_MATCH_DONE       1
 440 #define DTRACE_ANCHORED(probe)  ((probe)->dtpr_func[0] != '\0')
 441 #define DTRACE_STATE_ALIGN      64
 442
 443 #define DTRACE_FLAGS2FLT(flags)                                         \
 444         (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :           \
 445         ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :                \
 446         ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :            \
 447         ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :                \
 448         ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :                \
 449         ((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :         \
 450         ((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :         \
 451         ((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :       \
 452         ((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :         \
 453         DTRACEFLT_UNKNOWN)
 454
/*
 * True when the action is a DIF expression (dta_kind == DTRACEACT_DIFEXPR)
 * whose DIF object declares a return type of DIF_TYPE_STRING.  The kind test
 * is deliberately first: && short-circuits, so dta_difo is dereferenced only
 * for DIF-expression actions.  'act' is evaluated twice.
 */
#define DTRACEACT_ISSTRING(act)                                         \
        ((act)->dta_kind == DTRACEACT_DIFEXPR &&                        \
        (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
 458
 459 static size_t dtrace_strlen(const char *, size_t);
 460 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
 461 static void dtrace_enabling_provide(dtrace_provider_t *);
 462 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
 463 static void dtrace_enabling_matchall(void);
 464 static void dtrace_enabling_reap(void);
 465 static dtrace_state_t *dtrace_anon_grab(void);
 466 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
 467     dtrace_state_t *, uint64_t, uint64_t);
 468 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
 469 static void dtrace_buffer_drop(dtrace_buffer_t *);
 470 static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
 471 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
 472     dtrace_state_t *, dtrace_mstate_t *);
 473 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
 474     dtrace_optval_t);
 475 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
 476 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
 477
 478 /*
 479  * DTrace Probe Context Functions
 480  *
 481  * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 483  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 484  * As a result, functions called from probe context may only call other DTrace
 485  * support functions -- they may not interact at all with the system at large.
 486  * (Note that the ASSERT macro is made probe-context safe by redefining it in
 487  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 488  * loads are to be performed from probe context, they _must_ be in terms of
 489  * the safe dtrace_load*() variants.
 490  *
 491  * Some functions in this block are not actually called from probe context;
 492  * for these functions, there will be a comment above the function reading
 493  * "Note:  not called from probe context."
 494  */
 495 void
 496 dtrace_panic(const char *format, ...)
 497 {
 498         va_list alist;
 499
 500         va_start(alist, format);
 501         dtrace_vpanic(format, alist);
 502         va_end(alist);
 503 }
 504
 505 int
 506 dtrace_assfail(const char *a, const char *f, int l)
 507 {
 508         dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
 509
 510         /*
 511          * We just need something here that even the most clever compiler
 512          * cannot optimize away.
 513          */
 514         return (a[(uintptr_t)f]);
 515 }
 516
 517 /*
 518  * Atomically increment a specified error counter from probe context.
 519  */
 520 static void
 521 dtrace_error(uint32_t *counter)
 522 {
 523         /*
 524          * Most counters stored to in probe context are per-CPU counters.
 525          * However, there are some error conditions that are sufficiently
 526          * arcane that they don't merit per-CPU storage.  If these counters
 527          * are incremented concurrently on different CPUs, scalability will be
 528          * adversely affected -- but we don't expect them to be white-hot in a
 529          * correctly constructed enabling...
 530          */
 531         uint32_t oval, nval;
 532
 533         do {
 534                 oval = *counter;
 535
 536                 if ((nval = oval + 1) == 0) {
 537                         /*
 538                          * If the counter would wrap, set it to 1 -- assuring
 539                          * that the counter is never zero when we have seen
 540                          * errors.  (The counter must be 32-bits because we
 541                          * aren't guaranteed a 64-bit compare&swap operation.)
 542                          * To save this code both the infamy of being fingered
 543                          * by a priggish news story and the indignity of being
 544                          * the target of a neo-puritan witch trial, we're
 545                          * carefully avoiding any colorful description of the
 546                          * likelihood of this condition -- but suffice it to
 547                          * say that it is only slightly more likely than the
 548                          * overflow of predicate cache IDs, as discussed in
 549                          * dtrace_predicate_create().
 550                          */
 551                         nval = 1;
 552                 }
 553         } while (dtrace_cas32(counter, oval, nval) != oval);
 554 }
 555
 556 /*
 557  * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 558  * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 559  */
 560 DTRACE_LOADFUNC(8)
 561 DTRACE_LOADFUNC(16)
 562 DTRACE_LOADFUNC(32)
 563 DTRACE_LOADFUNC(64)
 564
 565 static int
 566 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
 567 {
 568         if (dest < mstate->dtms_scratch_base)
 569                 return (0);
 570
 571         if (dest + size < dest)
 572                 return (0);
 573
 574         if (dest + size > mstate->dtms_scratch_ptr)
 575                 return (0);
 576
 577         return (1);
 578 }
 579
 580 static int
 581 dtrace_canstore_statvar(uint64_t addr, size_t sz,
 582     dtrace_statvar_t **svars, int nsvars)
 583 {
 584         int i;
 585
 586         for (i = 0; i < nsvars; i++) {
 587                 dtrace_statvar_t *svar = svars[i];
 588
 589                 if (svar == NULL || svar->dtsv_size == 0)
 590                         continue;
 591
 592                 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
 593                         return (1);
 594         }
 595
 596         return (0);
 597 }
 598
 599 /*
 600  * Check to see if the address is within a memory region to which a store may
 601  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 602  * region.  The caller of dtrace_canstore() is responsible for performing any
 603  * alignment checks that are needed before stores are actually executed.
 604  */
 605 static int
 606 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 607     dtrace_vstate_t *vstate)
 608 {
 609         /*
 610          * First, check to see if the address is in scratch space...
 611          */
 612         if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
 613             mstate->dtms_scratch_size))
 614                 return (1);
 615
 616         /*
 617          * Now check to see if it's a dynamic variable.  This check will pick
 618          * up both thread-local variables and any global dynamically-allocated
 619          * variables.
 620          */
 621         if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
 622             vstate->dtvs_dynvars.dtds_size)) {
 623                 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
 624                 uintptr_t base = (uintptr_t)dstate->dtds_base +
 625                     (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
 626                 uintptr_t chunkoffs;
 627
 628                 /*
 629                  * Before we assume that we can store here, we need to make
 630                  * sure that it isn't in our metadata -- storing to our
 631                  * dynamic variable metadata would corrupt our state.  For
 632                  * the range to not include any dynamic variable metadata,
 633                  * it must:
 634                  *
 635                  *      (1) Start above the hash table that is at the base of
 636                  *      the dynamic variable space
 637                  *
 638                  *      (2) Have a starting chunk offset that is beyond the
 639                  *      dtrace_dynvar_t that is at the base of every chunk
 640                  *
 641                  *      (3) Not span a chunk boundary
 642                  *
 643                  */
 644                 if (addr < base)
 645                         return (0);
 646
 647                 chunkoffs = (addr - base) % dstate->dtds_chunksize;
 648
 649                 if (chunkoffs < sizeof (dtrace_dynvar_t))
 650                         return (0);
 651
 652                 if (chunkoffs + sz > dstate->dtds_chunksize)
 653                         return (0);
 654
 655                 return (1);
 656         }
 657
 658         /*
 659          * Finally, check the static local and global variables.  These checks
 660          * take the longest, so we perform them last.
 661          */
 662         if (dtrace_canstore_statvar(addr, sz,
 663             vstate->dtvs_locals, vstate->dtvs_nlocals))
 664                 return (1);
 665
 666         if (dtrace_canstore_statvar(addr, sz,
 667             vstate->dtvs_globals, vstate->dtvs_nglobals))
 668                 return (1);
 669
 670         return (0);
 671 }
 672
 673
 674 /*
 675  * Convenience routine to check to see if the address is within a memory
 676  * region in which a load may be issued given the user's privilege level;
 677  * if not, it sets the appropriate error flags and loads 'addr' into the
 678  * illegal value slot.
 679  *
 680  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 681  * appropriate memory access protection.
 682  */
 683 static int
 684 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 685     dtrace_vstate_t *vstate)
 686 {
 687         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
 688
 689         /*
 690          * If we hold the privilege to read from kernel memory, then
 691          * everything is readable.
 692          */
 693         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 694                 return (1);
 695
 696         /*
 697          * You can obviously read that which you can store.
 698          */
 699         if (dtrace_canstore(addr, sz, mstate, vstate))
 700                 return (1);
 701
 702         /*
 703          * We're allowed to read from our own string table.
 704          */
 705         if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
 706             mstate->dtms_difo->dtdo_strlen))
 707                 return (1);
 708
 709         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
 710         *illval = addr;
 711         return (0);
 712 }
 713
 714 /*
 715  * Convenience routine to check to see if a given string is within a memory
 716  * region in which a load may be issued given the user's privilege level;
 717  * this exists so that we don't need to issue unnecessary dtrace_strlen()
 718  * calls in the event that the user has all privileges.
 719  */
 720 static int
 721 dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 722     dtrace_vstate_t *vstate)
 723 {
 724         size_t strsz;
 725
 726         /*
 727          * If we hold the privilege to read from kernel memory, then
 728          * everything is readable.
 729          */
 730         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 731                 return (1);
 732
 733         strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
 734         if (dtrace_canload(addr, strsz, mstate, vstate))
 735                 return (1);
 736
 737         return (0);
 738 }
 739
 740 /*
 741  * Convenience routine to check to see if a given variable is within a memory
 742  * region in which a load may be issued given the user's privilege level.
 743  */
 744 static int
 745 dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
 746     dtrace_vstate_t *vstate)
 747 {
 748         size_t sz;
 749         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 750
 751         /*
 752          * If we hold the privilege to read from kernel memory, then
 753          * everything is readable.
 754          */
 755         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 756                 return (1);
 757
 758         if (type->dtdt_kind == DIF_TYPE_STRING)
 759                 sz = dtrace_strlen(src,
 760                     vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
 761         else
 762                 sz = type->dtdt_size;
 763
 764         return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
 765 }
 766
 767 /*
 768  * Compare two strings using safe loads.
 769  */
 770 static int
 771 dtrace_strncmp(char *s1, char *s2, size_t limit)
 772 {
 773         uint8_t c1, c2;
 774         volatile uint16_t *flags;
 775
 776         if (s1 == s2 || limit == 0)
 777                 return (0);
 778
 779         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
 780
 781         do {
 782                 if (s1 == NULL) {
 783                         c1 = '\0';
 784                 } else {
 785                         c1 = dtrace_load8((uintptr_t)s1++);
 786                 }
 787
 788                 if (s2 == NULL) {
 789                         c2 = '\0';
 790                 } else {
 791                         c2 = dtrace_load8((uintptr_t)s2++);
 792                 }
 793
 794                 if (c1 != c2)
 795                         return (c1 - c2);
 796         } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
 797
 798         return (0);
 799 }
 800
 801 /*
 802  * Compute strlen(s) for a string using safe memory accesses.  The additional
 803  * len parameter is used to specify a maximum length to ensure completion.
 804  */
 805 static size_t
 806 dtrace_strlen(const char *s, size_t lim)
 807 {
 808         uint_t len;
 809
 810         for (len = 0; len != lim; len++) {
 811                 if (dtrace_load8((uintptr_t)s++) == '\0')
 812                         break;
 813         }
 814
 815         return (len);
 816 }
 817
 818 /*
 819  * Check if an address falls within a toxic region.
 820  */
 821 static int
 822 dtrace_istoxic(uintptr_t kaddr, size_t size)
 823 {
 824         uintptr_t taddr, tsize;
 825         int i;
 826
 827         for (i = 0; i < dtrace_toxranges; i++) {
 828                 taddr = dtrace_toxrange[i].dtt_base;
 829                 tsize = dtrace_toxrange[i].dtt_limit - taddr;
 830
 831                 if (kaddr - taddr < tsize) {
 832                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
 833                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
 834                         return (1);
 835                 }
 836
 837                 if (taddr - kaddr < size) {
 838                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
 839                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
 840                         return (1);
 841                 }
 842         }
 843
 844         return (0);
 845 }
 846
 847 /*
 848  * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 849  * memory specified by the DIF program.  The dst is assumed to be safe memory
 850  * that we can store to directly because it is managed by DTrace.  As with
 851  * standard bcopy, overlapping copies are handled properly.
 852  */
 853 static void
 854 dtrace_bcopy(const void *src, void *dst, size_t len)
 855 {
 856         if (len != 0) {
 857                 uint8_t *s1 = dst;
 858                 const uint8_t *s2 = src;
 859
 860                 if (s1 <= s2) {
 861                         do {
 862                                 *s1++ = dtrace_load8((uintptr_t)s2++);
 863                         } while (--len != 0);
 864                 } else {
 865                         s2 += len;
 866                         s1 += len;
 867
 868                         do {
 869                                 *--s1 = dtrace_load8((uintptr_t)--s2);
 870                         } while (--len != 0);
 871                 }
 872         }
 873 }
 874
 875 /*
 876  * Copy src to dst using safe memory accesses, up to either the specified
 877  * length, or the point that a nul byte is encountered.  The src is assumed to
 878  * be unsafe memory specified by the DIF program.  The dst is assumed to be
 879  * safe memory that we can store to directly because it is managed by DTrace.
 880  * Unlike dtrace_bcopy(), overlapping regions are not handled.
 881  */
 882 static void
 883 dtrace_strcpy(const void *src, void *dst, size_t len)
 884 {
 885         if (len != 0) {
 886                 uint8_t *s1 = dst, c;
 887                 const uint8_t *s2 = src;
 888
 889                 do {
 890                         *s1++ = c = dtrace_load8((uintptr_t)s2++);
 891                 } while (--len != 0 && c != '\0');
 892         }
 893 }
 894
 895 /*
 896  * Copy src to dst, deriving the size and type from the specified (BYREF)
 897  * variable type.  The src is assumed to be unsafe memory specified by the DIF
 898  * program.  The dst is assumed to be DTrace variable memory that is of the
 899  * specified type; we assume that we can store to directly.
 900  */
 901 static void
 902 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
 903 {
 904         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 905
 906         if (type->dtdt_kind == DIF_TYPE_STRING) {
 907                 dtrace_strcpy(src, dst, type->dtdt_size);
 908         } else {
 909                 dtrace_bcopy(src, dst, type->dtdt_size);
 910         }
 911 }
 912
 913 /*
 914  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 915  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 916  * safe memory that we can access directly because it is managed by DTrace.
 917  */
 918 static int
 919 dtrace_bcmp(const void *s1, const void *s2, size_t len)
 920 {
 921         volatile uint16_t *flags;
 922
 923         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
 924
 925         if (s1 == s2)
 926                 return (0);
 927
 928         if (s1 == NULL || s2 == NULL)
 929                 return (1);
 930
 931         if (s1 != s2 && len != 0) {
 932                 const uint8_t *ps1 = s1;
 933                 const uint8_t *ps2 = s2;
 934
 935                 do {
 936                         if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
 937                                 return (1);
 938                 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
 939         }
 940         return (0);
 941 }
 942
 943 /*
 944  * Zero the specified region using a simple byte-by-byte loop.  Note that this
 945  * is for safe DTrace-managed memory only.
 946  */
 947 static void
 948 dtrace_bzero(void *dst, size_t len)
 949 {
 950         uchar_t *cp;
 951
 952         for (cp = dst; len != 0; len--)
 953                 *cp++ = 0;
 954 }
 955
 956 static void
 957 dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
 958 {
 959         uint64_t result[2];
 960
 961         result[0] = addend1[0] + addend2[0];
 962         result[1] = addend1[1] + addend2[1] +
 963             (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
 964
 965         sum[0] = result[0];
 966         sum[1] = result[1];
 967 }
 968
 969 /*
 970  * Shift the 128-bit value in a by b. If b is positive, shift left.
 971  * If b is negative, shift right.
 972  */
 973 static void
 974 dtrace_shift_128(uint64_t *a, int b)
 975 {
 976         uint64_t mask;
 977
 978         if (b == 0)
 979                 return;
 980
 981         if (b < 0) {
 982                 b = -b;
 983                 if (b >= 64) {
 984                         a[0] = a[1] >> (b - 64);
 985                         a[1] = 0;
 986                 } else {
 987                         a[0] >>= b;
 988                         mask = 1LL << (64 - b);
 989                         mask -= 1;
 990                         a[0] |= ((a[1] & mask) << (64 - b));
 991                         a[1] >>= b;
 992                 }
 993         } else {
 994                 if (b >= 64) {
 995                         a[1] = a[0] << (b - 64);
 996                         a[0] = 0;
 997                 } else {
 998                         a[1] <<= b;
 999                         mask = a[0] >> (64 - b);
1000                         a[1] |= mask;
1001                         a[0] <<= b;
1002                 }
1003         }
1004 }
1005
1006 /*
1007  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1008  * use native multiplication on those, and then re-combine into the
1009  * resulting 128-bit value.
1010  *
1011  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1012  *     hi1 * hi2 << 64 +
1013  *     hi1 * lo2 << 32 +
1014  *     hi2 * lo1 << 32 +
1015  *     lo1 * lo2
1016  */
1017 static void
1018 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1019 {
1020         uint64_t hi1, hi2, lo1, lo2;
1021         uint64_t tmp[2];
1022
1023         hi1 = factor1 >> 32;
1024         hi2 = factor2 >> 32;
1025
1026         lo1 = factor1 & DT_MASK_LO;
1027         lo2 = factor2 & DT_MASK_LO;
1028
1029         product[0] = lo1 * lo2;
1030         product[1] = hi1 * hi2;
1031
1032         tmp[0] = hi1 * lo2;
1033         tmp[1] = 0;
1034         dtrace_shift_128(tmp, 32);
1035         dtrace_add_128(product, tmp, product);
1036
1037         tmp[0] = hi2 * lo1;
1038         tmp[1] = 0;
1039         dtrace_shift_128(tmp, 32);
1040         dtrace_add_128(product, tmp, product);
1041 }
1042
1043 /*
1044  * This privilege check should be used by actions and subroutines to
1045  * verify that the user credentials of the process that enabled the
1046  * invoking ECB match the target credentials
1047  */
1048 static int
1049 dtrace_priv_proc_common_user(dtrace_state_t *state)
1050 {
1051         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1052
1053         /*
1054          * We should always have a non-NULL state cred here, since if cred
1055          * is null (anonymous tracing), we fast-path bypass this routine.
1056          */
1057         ASSERT(s_cr != NULL);
1058
1059         if ((cr = CRED()) != NULL &&
1060             s_cr->cr_uid == cr->cr_uid &&
1061             s_cr->cr_uid == cr->cr_ruid &&
1062             s_cr->cr_uid == cr->cr_suid &&
1063             s_cr->cr_gid == cr->cr_gid &&
1064             s_cr->cr_gid == cr->cr_rgid &&
1065             s_cr->cr_gid == cr->cr_sgid)
1066                 return (1);
1067
1068         return (0);
1069 }
1070
1071 /*
1072  * This privilege check should be used by actions and subroutines to
1073  * verify that the zone of the process that enabled the invoking ECB
1074  * matches the target credentials
1075  */
1076 static int
1077 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1078 {
1079         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1080
1081         /*
1082          * We should always have a non-NULL state cred here, since if cred
1083          * is null (anonymous tracing), we fast-path bypass this routine.
1084          */
1085         ASSERT(s_cr != NULL);
1086
1087         if ((cr = CRED()) != NULL &&
1088             s_cr->cr_zone == cr->cr_zone)
1089                 return (1);
1090
1091         return (0);
1092 }
1093
1094 /*
1095  * This privilege check should be used by actions and subroutines to
1096  * verify that the process has not setuid or changed credentials.
1097  */
1098 static int
1099 dtrace_priv_proc_common_nocd()
1100 {
1101         proc_t *proc;
1102
1103         if ((proc = ttoproc(curthread)) != NULL &&
1104             !(proc->p_flag & SNOCD))
1105                 return (1);
1106
1107         return (0);
1108 }
1109
1110 static int
1111 dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate)
1112 {
1113         int action = state->dts_cred.dcr_action;
1114
1115         if (!(mstate->dtms_access & DTRACE_ACCESS_PROC))
1116                 goto bad;
1117
1118         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1119             dtrace_priv_proc_common_zone(state) == 0)
1120                 goto bad;
1121
1122         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1123             dtrace_priv_proc_common_user(state) == 0)
1124                 goto bad;
1125
1126         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1127             dtrace_priv_proc_common_nocd() == 0)
1128                 goto bad;
1129
1130         return (1);
1131
1132 bad:
1133         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1134
1135         return (0);
1136 }
1137
1138 static int
1139 dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate)
1140 {
1141         if (mstate->dtms_access & DTRACE_ACCESS_PROC) {
1142                 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1143                         return (1);
1144
1145                 if (dtrace_priv_proc_common_zone(state) &&
1146                     dtrace_priv_proc_common_user(state) &&
1147                     dtrace_priv_proc_common_nocd())
1148                         return (1);
1149         }
1150
1151         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1152
1153         return (0);
1154 }
1155
1156 static int
1157 dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate)
1158 {
1159         if ((mstate->dtms_access & DTRACE_ACCESS_PROC) &&
1160             (state->dts_cred.dcr_action & DTRACE_CRA_PROC))
1161                 return (1);
1162
1163         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1164
1165         return (0);
1166 }
1167
1168 static int
1169 dtrace_priv_kernel(dtrace_state_t *state)
1170 {
1171         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1172                 return (1);
1173
1174         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1175
1176         return (0);
1177 }
1178
1179 static int
1180 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1181 {
1182         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1183                 return (1);
1184
1185         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1186
1187         return (0);
1188 }
1189
1190 /*
1191  * Determine if the dte_cond of the specified ECB allows for processing of
1192  * the current probe to continue.  Note that this routine may allow continued
1193  * processing, but with access(es) stripped from the mstate's dtms_access
1194  * field.
1195  */
1196 static int
1197 dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1198     dtrace_ecb_t *ecb)
1199 {
1200         dtrace_probe_t *probe = ecb->dte_probe;
1201         dtrace_provider_t *prov = probe->dtpr_provider;
1202         dtrace_pops_t *pops = &prov->dtpv_pops;
1203         int mode = DTRACE_MODE_NOPRIV_DROP;
1204
1205         ASSERT(ecb->dte_cond);
1206
1207         if (pops->dtps_mode != NULL) {
1208                 mode = pops->dtps_mode(prov->dtpv_arg,
1209                     probe->dtpr_id, probe->dtpr_arg);
1210
1211                 ASSERT((mode & DTRACE_MODE_USER) ||
1212                     (mode & DTRACE_MODE_KERNEL));
1213                 ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
1214                     (mode & DTRACE_MODE_NOPRIV_DROP));
1215         }
1216
1217         /*
1218          * If the dte_cond bits indicate that this consumer is only allowed to
1219          * see user-mode firings of this probe, call the provider's dtps_mode()
1220          * entry point to check that the probe was fired while in a user
1221          * context.  If that's not the case, use the policy specified by the
1222          * provider to determine if we drop the probe or merely restrict
1223          * operation.
1224          */
1225         if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1226                 ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1227
1228                 if (!(mode & DTRACE_MODE_USER)) {
1229                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1230                                 return (0);
1231
1232                         mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1233                 }
1234         }
1235
1236         /*
1237          * This is more subtle than it looks. We have to be absolutely certain
1238          * that CRED() isn't going to change out from under us so it's only
1239          * legit to examine that structure if we're in constrained situations.
1240          * Currently, the only times we'll this check is if a non-super-user
1241          * has enabled the profile or syscall providers -- providers that
1242          * allow visibility of all processes. For the profile case, the check
1243          * above will ensure that we're examining a user context.
1244          */
1245         if (ecb->dte_cond & DTRACE_COND_OWNER) {
1246                 cred_t *cr;
1247                 cred_t *s_cr = state->dts_cred.dcr_cred;
1248                 proc_t *proc;
1249
1250                 ASSERT(s_cr != NULL);
1251
1252                 if ((cr = CRED()) == NULL ||
1253                     s_cr->cr_uid != cr->cr_uid ||
1254                     s_cr->cr_uid != cr->cr_ruid ||
1255                     s_cr->cr_uid != cr->cr_suid ||
1256                     s_cr->cr_gid != cr->cr_gid ||
1257                     s_cr->cr_gid != cr->cr_rgid ||
1258                     s_cr->cr_gid != cr->cr_sgid ||
1259                     (proc = ttoproc(curthread)) == NULL ||
1260                     (proc->p_flag & SNOCD)) {
1261                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1262                                 return (0);
1263
1264                         mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1265                 }
1266         }
1267
1268         /*
1269          * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1270          * in our zone, check to see if our mode policy is to restrict rather
1271          * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1272          * and DTRACE_ACCESS_ARGS
1273          */
1274         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1275                 cred_t *cr;
1276                 cred_t *s_cr = state->dts_cred.dcr_cred;
1277
1278                 ASSERT(s_cr != NULL);
1279
1280                 if ((cr = CRED()) == NULL ||
1281                     s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1282                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1283                                 return (0);
1284
1285                         mstate->dtms_access &=
1286                             ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1287                 }
1288         }
1289
1290         return (1);
1291 }
1292
1293 /*
1294  * Note:  not called from probe context.  This function is called
1295  * asynchronously (and at a regular interval) from outside of probe context to
1296  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1297  * cleaning is explained in detail in <sys/dtrace_impl.h>.
1298  */
1299 void
1300 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1301 {
1302         dtrace_dynvar_t *dirty;
1303         dtrace_dstate_percpu_t *dcpu;
1304         dtrace_dynvar_t **rinsep;
1305         int i, j, work = 0;
1306
1307         for (i = 0; i < NCPU; i++) {
1308                 dcpu = &dstate->dtds_percpu[i];
1309                 rinsep = &dcpu->dtdsc_rinsing;
1310
1311                 /*
1312                  * If the dirty list is NULL, there is no dirty work to do.
1313                  */
1314                 if (dcpu->dtdsc_dirty == NULL)
1315                         continue;
1316
1317                 if (dcpu->dtdsc_rinsing != NULL) {
1318                         /*
1319                          * If the rinsing list is non-NULL, then it is because
1320                          * this CPU was selected to accept another CPU's
1321                          * dirty list -- and since that time, dirty buffers
1322                          * have accumulated.  This is a highly unlikely
1323                          * condition, but we choose to ignore the dirty
1324                          * buffers -- they'll be picked up a future cleanse.
1325                          */
1326                         continue;
1327                 }
1328
1329                 if (dcpu->dtdsc_clean != NULL) {
1330                         /*
1331                          * If the clean list is non-NULL, then we're in a
1332                          * situation where a CPU has done deallocations (we
1333                          * have a non-NULL dirty list) but no allocations (we
1334                          * also have a non-NULL clean list).  We can't simply
1335                          * move the dirty list into the clean list on this
1336                          * CPU, yet we also don't want to allow this condition
1337                          * to persist, lest a short clean list prevent a
1338                          * massive dirty list from being cleaned (which in
1339                          * turn could lead to otherwise avoidable dynamic
1340                          * drops).  To deal with this, we look for some CPU
1341                          * with a NULL clean list, NULL dirty list, and NULL
1342                          * rinsing list -- and then we borrow this CPU to
1343                          * rinse our dirty list.
1344                          */
1345                         for (j = 0; j < NCPU; j++) {
1346                                 dtrace_dstate_percpu_t *rinser;
1347
1348                                 rinser = &dstate->dtds_percpu[j];
1349
1350                                 if (rinser->dtdsc_rinsing != NULL)
1351                                         continue;
1352
1353                                 if (rinser->dtdsc_dirty != NULL)
1354                                         continue;
1355
1356                                 if (rinser->dtdsc_clean != NULL)
1357                                         continue;
1358
1359                                 rinsep = &rinser->dtdsc_rinsing;
1360                                 break;
1361                         }
1362
1363                         if (j == NCPU) {
1364                                 /*
1365                                  * We were unable to find another CPU that
1366                                  * could accept this dirty list -- we are
1367                                  * therefore unable to clean it now.
1368                                  */
1369                                 dtrace_dynvar_failclean++;
1370                                 continue;
1371                         }
1372                 }
1373
1374                 work = 1;
1375
1376                 /*
1377                  * Atomically move the dirty list aside.
1378                  */
1379                 do {
1380                         dirty = dcpu->dtdsc_dirty;
1381
1382                         /*
1383                          * Before we zap the dirty list, set the rinsing list.
1384                          * (This allows for a potential assertion in
1385                          * dtrace_dynvar():  if a free dynamic variable appears
1386                          * on a hash chain, either the dirty list or the
1387                          * rinsing list for some CPU must be non-NULL.)
1388                          */
1389                         *rinsep = dirty;
1390                         dtrace_membar_producer();
1391                 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1392                     dirty, NULL) != dirty);
1393         }
1394
1395         if (!work) {
1396                 /*
1397                  * We have no work to do; we can simply return.
1398                  */
1399                 return;
1400         }
1401
1402         dtrace_sync();
1403
1404         for (i = 0; i < NCPU; i++) {
1405                 dcpu = &dstate->dtds_percpu[i];
1406
1407                 if (dcpu->dtdsc_rinsing == NULL)
1408                         continue;
1409
1410                 /*
1411                  * We are now guaranteed that no hash chain contains a pointer
1412                  * into this dirty list; we can make it clean.
1413                  */
1414                 ASSERT(dcpu->dtdsc_clean == NULL);
1415                 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1416                 dcpu->dtdsc_rinsing = NULL;
1417         }
1418
1419         /*
1420          * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1421          * sure that all CPUs have seen all of the dtdsc_clean pointers.
1422          * This prevents a race whereby a CPU incorrectly decides that
1423          * the state should be something other than DTRACE_DSTATE_CLEAN
1424          * after dtrace_dynvar_clean() has completed.
1425          */
1426         dtrace_sync();
1427
1428         dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1429 }
1430
1431 /*
1432  * Depending on the value of the op parameter, this function looks-up,
1433  * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1434  * allocation is requested, this function will return a pointer to a
1435  * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1436  * variable can be allocated.  If NULL is returned, the appropriate counter
1437  * will be incremented.
1438  */
1439 dtrace_dynvar_t *
1440 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1441     dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1442     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1443 {
1444         uint64_t hashval = DTRACE_DYNHASH_VALID;
1445         dtrace_dynhash_t *hash = dstate->dtds_hash;
1446         dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1447         processorid_t me = CPU->cpu_id, cpu = me;
1448         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1449         size_t bucket, ksize;
1450         size_t chunksize = dstate->dtds_chunksize;
1451         uintptr_t kdata, lock, nstate;
1452         uint_t i;
1453
1454         ASSERT(nkeys != 0);
1455
1456         /*
1457          * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1458          * algorithm.  For the by-value portions, we perform the algorithm in
1459          * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1460          * bit, and seems to have only a minute effect on distribution.  For
1461          * the by-reference data, we perform "One-at-a-time" iterating (safely)
1462          * over each referenced byte.  It's painful to do this, but it's much
1463          * better than pathological hash distribution.  The efficacy of the
1464          * hashing algorithm (and a comparison with other algorithms) may be
1465          * found by running the ::dtrace_dynstat MDB dcmd.
1466          */
1467         for (i = 0; i < nkeys; i++) {
1468                 if (key[i].dttk_size == 0) {
1469                         uint64_t val = key[i].dttk_value;
1470
1471                         hashval += (val >> 48) & 0xffff;
1472                         hashval += (hashval << 10);
1473                         hashval ^= (hashval >> 6);
1474
1475                         hashval += (val >> 32) & 0xffff;
1476                         hashval += (hashval << 10);
1477                         hashval ^= (hashval >> 6);
1478
1479                         hashval += (val >> 16) & 0xffff;
1480                         hashval += (hashval << 10);
1481                         hashval ^= (hashval >> 6);
1482
1483                         hashval += val & 0xffff;
1484                         hashval += (hashval << 10);
1485                         hashval ^= (hashval >> 6);
1486                 } else {
1487                         /*
1488                          * This is incredibly painful, but it beats the hell
1489                          * out of the alternative.
1490                          */
1491                         uint64_t j, size = key[i].dttk_size;
1492                         uintptr_t base = (uintptr_t)key[i].dttk_value;
1493
1494                         if (!dtrace_canload(base, size, mstate, vstate))
1495                                 break;
1496
1497                         for (j = 0; j < size; j++) {
1498                                 hashval += dtrace_load8(base + j);
1499                                 hashval += (hashval << 10);
1500                                 hashval ^= (hashval >> 6);
1501                         }
1502                 }
1503         }
1504
1505         if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1506                 return (NULL);
1507
1508         hashval += (hashval << 3);
1509         hashval ^= (hashval >> 11);
1510         hashval += (hashval << 15);
1511
1512         /*
1513          * There is a remote chance (ideally, 1 in 2^31) that our hashval
1514          * comes out to be one of our two sentinel hash values.  If this
1515          * actually happens, we set the hashval to be a value known to be a
1516          * non-sentinel value.
1517          */
1518         if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1519                 hashval = DTRACE_DYNHASH_VALID;
1520
1521         /*
1522          * Yes, it's painful to do a divide here.  If the cycle count becomes
1523          * important here, tricks can be pulled to reduce it.  (However, it's
1524          * critical that hash collisions be kept to an absolute minimum;
1525          * they're much more painful than a divide.)  It's better to have a
1526          * solution that generates few collisions and still keeps things
1527          * relatively simple.
1528          */
1529         bucket = hashval % dstate->dtds_hashsize;
1530
1531         if (op == DTRACE_DYNVAR_DEALLOC) {
1532                 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1533
1534                 for (;;) {
1535                         while ((lock = *lockp) & 1)
1536                                 continue;
1537
1538                         if (dtrace_casptr((void *)lockp,
1539                             (void *)lock, (void *)(lock + 1)) == (void *)lock)
1540                                 break;
1541                 }
1542
1543                 dtrace_membar_producer();
1544         }
1545
1546 top:
1547         prev = NULL;
1548         lock = hash[bucket].dtdh_lock;
1549
1550         dtrace_membar_consumer();
1551
1552         start = hash[bucket].dtdh_chain;
1553         ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1554             start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1555             op != DTRACE_DYNVAR_DEALLOC));
1556
1557         for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1558                 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1559                 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1560
1561                 if (dvar->dtdv_hashval != hashval) {
1562                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1563                                 /*
1564                                  * We've reached the sink, and therefore the
1565                                  * end of the hash chain; we can kick out of
1566                                  * the loop knowing that we have seen a valid
1567                                  * snapshot of state.
1568                                  */
1569                                 ASSERT(dvar->dtdv_next == NULL);
1570                                 ASSERT(dvar == &dtrace_dynhash_sink);
1571                                 break;
1572                         }
1573
1574                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1575                                 /*
1576                                  * We've gone off the rails:  somewhere along
1577                                  * the line, one of the members of this hash
1578                                  * chain was deleted.  Note that we could also
1579                                  * detect this by simply letting this loop run
1580                                  * to completion, as we would eventually hit
1581                                  * the end of the dirty list.  However, we
1582                                  * want to avoid running the length of the
1583                                  * dirty list unnecessarily (it might be quite
1584                                  * long), so we catch this as early as
1585                                  * possible by detecting the hash marker.  In
1586                                  * this case, we simply set dvar to NULL and
1587                                  * break; the conditional after the loop will
1588                                  * send us back to top.
1589                                  */
1590                                 dvar = NULL;
1591                                 break;
1592                         }
1593
1594                         goto next;
1595                 }
1596
1597                 if (dtuple->dtt_nkeys != nkeys)
1598                         goto next;
1599
1600                 for (i = 0; i < nkeys; i++, dkey++) {
1601                         if (dkey->dttk_size != key[i].dttk_size)
1602                                 goto next; /* size or type mismatch */
1603
1604                         if (dkey->dttk_size != 0) {
1605                                 if (dtrace_bcmp(
1606                                     (void *)(uintptr_t)key[i].dttk_value,
1607                                     (void *)(uintptr_t)dkey->dttk_value,
1608                                     dkey->dttk_size))
1609                                         goto next;
1610                         } else {
1611                                 if (dkey->dttk_value != key[i].dttk_value)
1612                                         goto next;
1613                         }
1614                 }
1615
1616                 if (op != DTRACE_DYNVAR_DEALLOC)
1617                         return (dvar);
1618
1619                 ASSERT(dvar->dtdv_next == NULL ||
1620                     dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1621
1622                 if (prev != NULL) {
1623                         ASSERT(hash[bucket].dtdh_chain != dvar);
1624                         ASSERT(start != dvar);
1625                         ASSERT(prev->dtdv_next == dvar);
1626                         prev->dtdv_next = dvar->dtdv_next;
1627                 } else {
1628                         if (dtrace_casptr(&hash[bucket].dtdh_chain,
1629                             start, dvar->dtdv_next) != start) {
1630                                 /*
1631                                  * We have failed to atomically swing the
1632                                  * hash table head pointer, presumably because
1633                                  * of a conflicting allocation on another CPU.
1634                                  * We need to reread the hash chain and try
1635                                  * again.
1636                                  */
1637                                 goto top;
1638                         }
1639                 }
1640
1641                 dtrace_membar_producer();
1642
1643                 /*
1644                  * Now set the hash value to indicate that it's free.
1645                  */
1646                 ASSERT(hash[bucket].dtdh_chain != dvar);
1647                 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1648
1649                 dtrace_membar_producer();
1650
1651                 /*
1652                  * Set the next pointer to point at the dirty list, and
1653                  * atomically swing the dirty pointer to the newly freed dvar.
1654                  */
1655                 do {
1656                         next = dcpu->dtdsc_dirty;
1657                         dvar->dtdv_next = next;
1658                 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1659
1660                 /*
1661                  * Finally, unlock this hash bucket.
1662                  */
1663                 ASSERT(hash[bucket].dtdh_lock == lock);
1664                 ASSERT(lock & 1);
1665                 hash[bucket].dtdh_lock++;
1666
1667                 return (NULL);
1668 next:
1669                 prev = dvar;
1670                 continue;
1671         }
1672
1673         if (dvar == NULL) {
1674                 /*
1675                  * If dvar is NULL, it is because we went off the rails:
1676                  * one of the elements that we traversed in the hash chain
1677                  * was deleted while we were traversing it.  In this case,
1678                  * we assert that we aren't doing a dealloc (deallocs lock
1679                  * the hash bucket to prevent themselves from racing with
1680                  * one another), and retry the hash chain traversal.
1681                  */
1682                 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1683                 goto top;
1684         }
1685
1686         if (op != DTRACE_DYNVAR_ALLOC) {
1687                 /*
1688                  * If we are not to allocate a new variable, we want to
1689                  * return NULL now.  Before we return, check that the value
1690                  * of the lock word hasn't changed.  If it has, we may have
1691                  * seen an inconsistent snapshot.
1692                  */
1693                 if (op == DTRACE_DYNVAR_NOALLOC) {
1694                         if (hash[bucket].dtdh_lock != lock)
1695                                 goto top;
1696                 } else {
1697                         ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1698                         ASSERT(hash[bucket].dtdh_lock == lock);
1699                         ASSERT(lock & 1);
1700                         hash[bucket].dtdh_lock++;
1701                 }
1702
1703                 return (NULL);
1704         }
1705
1706         /*
1707          * We need to allocate a new dynamic variable.  The size we need is the
1708          * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1709          * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1710          * the size of any referred-to data (dsize).  We then round the final
1711          * size up to the chunksize for allocation.
1712          */
1713         for (ksize = 0, i = 0; i < nkeys; i++)
1714                 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1715
1716         /*
1717          * This should be pretty much impossible, but could happen if, say,
1718          * strange DIF specified the tuple.  Ideally, this should be an
1719          * assertion and not an error condition -- but that requires that the
1720          * chunksize calculation in dtrace_difo_chunksize() be absolutely
1721          * bullet-proof.  (That is, it must not be able to be fooled by
1722          * malicious DIF.)  Given the lack of backwards branches in DIF,
1723          * solving this would presumably not amount to solving the Halting
1724          * Problem -- but it still seems awfully hard.
1725          */
1726         if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1727             ksize + dsize > chunksize) {
1728                 dcpu->dtdsc_drops++;
1729                 return (NULL);
1730         }
1731
1732         nstate = DTRACE_DSTATE_EMPTY;
1733
1734         do {
1735 retry:
1736                 free = dcpu->dtdsc_free;
1737
1738                 if (free == NULL) {
1739                         dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1740                         void *rval;
1741
1742                         if (clean == NULL) {
1743                                 /*
1744                                  * We're out of dynamic variable space on
1745                                  * this CPU.  Unless we have tried all CPUs,
1746                                  * we'll try to allocate from a different
1747                                  * CPU.
1748                                  */
1749                                 switch (dstate->dtds_state) {
1750                                 case DTRACE_DSTATE_CLEAN: {
1751                                         void *sp = &dstate->dtds_state;
1752
1753                                         if (++cpu >= NCPU)
1754                                                 cpu = 0;
1755
1756                                         if (dcpu->dtdsc_dirty != NULL &&
1757                                             nstate == DTRACE_DSTATE_EMPTY)
1758                                                 nstate = DTRACE_DSTATE_DIRTY;
1759
1760                                         if (dcpu->dtdsc_rinsing != NULL)
1761                                                 nstate = DTRACE_DSTATE_RINSING;
1762
1763                                         dcpu = &dstate->dtds_percpu[cpu];
1764
1765                                         if (cpu != me)
1766                                                 goto retry;
1767
1768                                         (void) dtrace_cas32(sp,
1769                                             DTRACE_DSTATE_CLEAN, nstate);
1770
1771                                         /*
1772                                          * To increment the correct bean
1773                                          * counter, take another lap.
1774                                          */
1775                                         goto retry;
1776                                 }
1777
1778                                 case DTRACE_DSTATE_DIRTY:
1779                                         dcpu->dtdsc_dirty_drops++;
1780                                         break;
1781
1782                                 case DTRACE_DSTATE_RINSING:
1783                                         dcpu->dtdsc_rinsing_drops++;
1784                                         break;
1785
1786                                 case DTRACE_DSTATE_EMPTY:
1787                                         dcpu->dtdsc_drops++;
1788                                         break;
1789                                 }
1790
1791                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1792                                 return (NULL);
1793                         }
1794
1795                         /*
1796                          * The clean list appears to be non-empty.  We want to
1797                          * move the clean list to the free list; we start by
1798                          * moving the clean pointer aside.
1799                          */
1800                         if (dtrace_casptr(&dcpu->dtdsc_clean,
1801                             clean, NULL) != clean) {
1802                                 /*
1803                                  * We are in one of two situations:
1804                                  *
1805                                  *  (a) The clean list was switched to the
1806                                  *      free list by another CPU.
1807                                  *
1808                                  *  (b) The clean list was added to by the
1809                                  *      cleansing cyclic.
1810                                  *
1811                                  * In either of these situations, we can
1812                                  * just reattempt the free list allocation.
1813                                  */
1814                                 goto retry;
1815                         }
1816
1817                         ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1818
1819                         /*
1820                          * Now we'll move the clean list to our free list.
1821                          * It's impossible for this to fail:  the only way
1822                          * the free list can be updated is through this
1823                          * code path, and only one CPU can own the clean list.
1824                          * Thus, it would only be possible for this to fail if
1825                          * this code were racing with dtrace_dynvar_clean().
1826                          * (That is, if dtrace_dynvar_clean() updated the clean
1827                          * list, and we ended up racing to update the free
1828                          * list.)  This race is prevented by the dtrace_sync()
1829                          * in dtrace_dynvar_clean() -- which flushes the
1830                          * owners of the clean lists out before resetting
1831                          * the clean lists.
1832                          */
1833                         dcpu = &dstate->dtds_percpu[me];
1834                         rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1835                         ASSERT(rval == NULL);
1836                         goto retry;
1837                 }
1838
1839                 dvar = free;
1840                 new_free = dvar->dtdv_next;
1841         } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
1842
1843         /*
1844          * We have now allocated a new chunk.  We copy the tuple keys into the
1845          * tuple array and copy any referenced key data into the data space
1846          * following the tuple array.  As we do this, we relocate dttk_value
1847          * in the final tuple to point to the key data address in the chunk.
1848          */
1849         kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
1850         dvar->dtdv_data = (void *)(kdata + ksize);
1851         dvar->dtdv_tuple.dtt_nkeys = nkeys;
1852
1853         for (i = 0; i < nkeys; i++) {
1854                 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
1855                 size_t kesize = key[i].dttk_size;
1856
1857                 if (kesize != 0) {
1858                         dtrace_bcopy(
1859                             (const void *)(uintptr_t)key[i].dttk_value,
1860                             (void *)kdata, kesize);
1861                         dkey->dttk_value = kdata;
1862                         kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
1863                 } else {
1864                         dkey->dttk_value = key[i].dttk_value;
1865                 }
1866
1867                 dkey->dttk_size = kesize;
1868         }
1869
1870         ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
1871         dvar->dtdv_hashval = hashval;
1872         dvar->dtdv_next = start;
1873
1874         if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
1875                 return (dvar);
1876
1877         /*
1878          * The cas has failed.  Either another CPU is adding an element to
1879          * this hash chain, or another CPU is deleting an element from this
1880          * hash chain.  The simplest way to deal with both of these cases
1881          * (though not necessarily the most efficient) is to free our
1882          * allocated block and tail-call ourselves.  Note that the free is
1883          * to the dirty list and _not_ to the free list.  This is to prevent
1884          * races with allocators, above.
1885          */
1886         dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1887
1888         dtrace_membar_producer();
1889
1890         do {
1891                 free = dcpu->dtdsc_dirty;
1892                 dvar->dtdv_next = free;
1893         } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
1894
1895         return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
1896 }
1897
/*
 * Minimum aggregator:  both the stored value and the incoming value are
 * interpreted as signed 64-bit quantities, and *oval is overwritten only
 * when the incoming value is strictly smaller.  The arg parameter is not
 * used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        int64_t current = (int64_t)*oval;
        int64_t candidate = (int64_t)nval;

        /*
         * Nothing to do unless the candidate beats the current minimum.
         */
        if (candidate >= current)
                return;

        *oval = nval;
}
1905
/*
 * Maximum aggregator:  both the stored value and the incoming value are
 * interpreted as signed 64-bit quantities, and *oval is overwritten only
 * when the incoming value is strictly larger.  The arg parameter is not
 * used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        int64_t current = (int64_t)*oval;
        int64_t candidate = (int64_t)nval;

        /*
         * Nothing to do unless the candidate beats the current maximum.
         */
        if (candidate <= current)
                return;

        *oval = nval;
}
1913
1914 static void
1915 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
1916 {
1917         int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
1918         int64_t val = (int64_t)nval;
1919
1920         if (val < 0) {
1921                 for (i = 0; i < zero; i++) {
1922                         if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
1923                                 quanta[i] += incr;
1924                                 return;
1925                         }
1926                 }
1927         } else {
1928                 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
1929                         if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
1930                                 quanta[i - 1] += incr;
1931                                 return;
1932                         }
1933                 }
1934
1935                 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
1936                 return;
1937         }
1938
1939         ASSERT(0);
1940 }
1941
/*
 * Linear quantization aggregator.  The first word of lquanta holds the
 * encoded parameters (base, step and number of levels); the buckets follow
 * it:  one underflow bucket, then 'levels' linear buckets of width 'step'
 * starting at 'base', then one overflow bucket.  The bucket that nval falls
 * into is incremented by incr.
 *
 * The value is treated as a signed 64-bit quantity.  (Truncating it to 32
 * bits would bucket large values by their low 32 bits only, and would allow
 * the subtraction of the base to overflow.)
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
        uint64_t arg = *lquanta++;
        int32_t base = DTRACE_LQUANTIZE_BASE(arg);
        uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
        uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
        int64_t val = (int64_t)nval;
        uint64_t level;

        ASSERT(step != 0);
        ASSERT(levels != 0);

        if (val < base) {
                /*
                 * This is an underflow.
                 */
                lquanta[0] += incr;
                return;
        }

        /*
         * We know that val >= base, so the true difference is non-negative
         * and no larger than 2^63 - 1 + 2^31; computing it in unsigned
         * 64-bit arithmetic therefore yields the exact offset without any
         * possibility of signed overflow.
         */
        level = ((uint64_t)val - (uint64_t)(int64_t)base) / step;

        if (level < levels) {
                lquanta[level + 1] += incr;
                return;
        }

        /*
         * This is an overflow.
         */
        lquanta[levels + 1] += incr;
}
1974
/*
 * Determine the index of the log/linear quantization bucket into which the
 * specified value falls.  Bucket zero holds every value below factor^low.
 * Each subsequent order of magnitude, up to and including 'high', is split
 * into at most nsteps linear buckets.  Values beyond the last order of
 * magnitude map to the top bucket, whose index is returned when the loop
 * runs to completion.
 */
static int
dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
    uint16_t high, uint16_t nsteps, int64_t value)
{
        int64_t this = 1, last, next;
        int base = 1, order;

        ASSERT(factor <= nsteps);
        ASSERT(nsteps % factor == 0);

        for (order = 0; order < low; order++)
                this *= factor;

        /*
         * If our value is less than our factor taken to the power of the
         * low order of magnitude, it goes into the zeroth bucket.
         */
        if (value < (last = this))
                return (0);

        for (this *= factor; order <= high; order++) {
                int nbuckets = this > nsteps ? nsteps : this;

                if (this > INT64_MAX / factor) {
                        /*
                         * We should not generally get log/linear quantizations
                         * with a high magnitude that allows 64-bits to
                         * overflow, but we nonetheless protect against this
                         * by explicitly checking for overflow, and clamping
                         * our value accordingly.  Note that we check _before_
                         * multiplying:  signed overflow is undefined, so
                         * inspecting the product after the fact cannot be
                         * relied upon.  The clamp guarantees that we return
                         * below, so next is set only to keep it defined.
                         */
                        value = this - 1;
                        next = this;
                } else {
                        next = this * factor;
                }

                if (value < this) {
                        /*
                         * If our value lies within this order of magnitude,
                         * determine its position by taking the offset within
                         * the order of magnitude, dividing by the bucket
                         * width, and adding to our (accumulated) base.
                         */
                        return (base + (value - last) / (this / nbuckets));
                }

                base += nbuckets - (nbuckets / factor);
                last = this;
                this = next;
        }

        /*
         * Our value is greater than or equal to our factor taken to the
         * power of one plus the high magnitude -- return the top bucket.
         */
        return (base);
}
2030
/*
 * Log/linear quantization aggregator.  The first word of llquanta holds the
 * encoded parameters (factor, low and high orders of magnitude, and number
 * of steps); the buckets follow it.  The bucket index is computed by
 * dtrace_aggregate_llquantize_bucket(), and that bucket is incremented by
 * incr.
 */
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
        const uint64_t encoded = llquanta[0];
        uint64_t *buckets = llquanta + 1;
        uint16_t fac, lo, hi, steps;
        int ndx;

        /*
         * Unpack the quantization parameters from the leading word.
         */
        fac = DTRACE_LLQUANTIZE_FACTOR(encoded);
        lo = DTRACE_LLQUANTIZE_LOW(encoded);
        hi = DTRACE_LLQUANTIZE_HIGH(encoded);
        steps = DTRACE_LLQUANTIZE_NSTEP(encoded);

        ndx = dtrace_aggregate_llquantize_bucket(fac, lo, hi, steps, nval);
        buckets[ndx] += incr;
}
2043
/*
 * Average aggregator:  data[0] counts the values seen and data[1]
 * accumulates their sum.  The arg parameter is not used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
        uint64_t *count = &data[0];
        uint64_t *total = &data[1];

        *count += 1;
        *total = *total + nval;
}
2051
/*
 * Standard deviation aggregator:  data[0] counts the values seen, data[1]
 * accumulates their sum, and the two words starting at data[2] accumulate
 * the sum of their squares as a 128-bit quantity.  The arg parameter is not
 * used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
        uint64_t magnitude;
        uint64_t tmp[2];

        data[0]++;
        data[1] += nval;

        /*
         * What we want to say here is:
         *
         * data[2] += nval * nval;
         *
         * But given that nval is 64-bit, we could easily overflow, so
         * we do this as 128-bit arithmetic.  We square the magnitude of
         * the (signed) value; the negation is performed in unsigned
         * arithmetic so that it remains well-defined even when the value
         * is the most negative 64-bit integer.
         */
        if ((int64_t)nval < 0)
                magnitude = (uint64_t)0 - nval;
        else
                magnitude = nval;

        dtrace_multiply_128(magnitude, magnitude, tmp);
        dtrace_add_128(data + 2, tmp, data + 2);
}
2076
/*
 * Count aggregator:  bump the stored counter by one.  Neither nval nor arg
 * is used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        (*oval)++;
}
2083
/*
 * Sum aggregator:  add the incoming value to the stored total.  The arg
 * parameter is not used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        uint64_t total = *oval;

        total = total + nval;
        *oval = total;
}
2090
2091 /*
2092  * Aggregate given the tuple in the principal data buffer, and the aggregating
2093  * action denoted by the specified dtrace_aggregation_t.  The aggregation
2094  * buffer is specified as the buf parameter.  This routine does not return
2095  * failure; if there is no space in the aggregation buffer, the data will be
2096  * dropped, and a corresponding counter incremented.
2097  */
2098 static void
2099 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2100     intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2101 {
2102         dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2103         uint32_t i, ndx, size, fsize;
2104         uint32_t align = sizeof (uint64_t) - 1;
2105         dtrace_aggbuffer_t *agb;
2106         dtrace_aggkey_t *key;
2107         uint32_t hashval = 0, limit, isstr;
2108         caddr_t tomax, data, kdata;
2109         dtrace_actkind_t action;
2110         dtrace_action_t *act;
2111         uintptr_t offs;
2112
2113         if (buf == NULL)
2114                 return;
2115
2116         if (!agg->dtag_hasarg) {
2117                 /*
2118                  * Currently, only quantize() and lquantize() take additional
2119                  * arguments, and they have the same semantics:  an increment
2120                  * value that defaults to 1 when not present.  If additional
2121                  * aggregating actions take arguments, the setting of the
2122                  * default argument value will presumably have to become more
2123                  * sophisticated...
2124                  */
2125                 arg = 1;
2126         }
2127
2128         action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2129         size = rec->dtrd_offset - agg->dtag_base;
2130         fsize = size + rec->dtrd_size;
2131
2132         ASSERT(dbuf->dtb_tomax != NULL);
2133         data = dbuf->dtb_tomax + offset + agg->dtag_base;
2134
2135         if ((tomax = buf->dtb_tomax) == NULL) {
2136                 dtrace_buffer_drop(buf);
2137                 return;
2138         }
2139
2140         /*
2141          * The metastructure is always at the bottom of the buffer.
2142          */
2143         agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2144             sizeof (dtrace_aggbuffer_t));
2145
2146         if (buf->dtb_offset == 0) {
2147                 /*
2148                  * We just kludge up approximately 1/8th of the size to be
2149                  * buckets.  If this guess ends up being routinely
2150                  * off-the-mark, we may need to dynamically readjust this
2151                  * based on past performance.
2152                  */
2153                 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2154
2155                 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2156                     (uintptr_t)tomax || hashsize == 0) {
2157                         /*
2158                          * We've been given a ludicrously small buffer;
2159                          * increment our drop count and leave.
2160                          */
2161                         dtrace_buffer_drop(buf);
2162                         return;
2163                 }
2164
2165                 /*
2166                  * And now, a pathetic attempt to try to get an odd (or
2167                  * perchance, a prime) hash size for better hash distribution.
2168                  */
2169                 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2170                         hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2171
2172                 agb->dtagb_hashsize = hashsize;
2173                 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2174                     agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2175                 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2176
2177                 for (i = 0; i < agb->dtagb_hashsize; i++)
2178                         agb->dtagb_hash[i] = NULL;
2179         }
2180
2181         ASSERT(agg->dtag_first != NULL);
2182         ASSERT(agg->dtag_first->dta_intuple);
2183
2184         /*
2185          * Calculate the hash value based on the key.  Note that we _don't_
2186          * include the aggid in the hashing (but we will store it as part of
2187          * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2188          * algorithm: a simple, quick algorithm that has no known funnels, and
2189          * gets good distribution in practice.  The efficacy of the hashing
2190          * algorithm (and a comparison with other algorithms) may be found by
2191          * running the ::dtrace_aggstat MDB dcmd.
2192          */
2193         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2194                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2195                 limit = i + act->dta_rec.dtrd_size;
2196                 ASSERT(limit <= size);
2197                 isstr = DTRACEACT_ISSTRING(act);
2198
2199                 for (; i < limit; i++) {
2200                         hashval += data[i];
2201                         hashval += (hashval << 10);
2202                         hashval ^= (hashval >> 6);
2203
2204                         if (isstr && data[i] == '\0')
2205                                 break;
2206                 }
2207         }
2208
2209         hashval += (hashval << 3);
2210         hashval ^= (hashval >> 11);
2211         hashval += (hashval << 15);
2212
2213         /*
2214          * Yes, the divide here is expensive -- but it's generally the least
2215          * of the performance issues given the amount of data that we iterate
2216          * over to compute hash values, compare data, etc.
2217          */
2218         ndx = hashval % agb->dtagb_hashsize;
2219
2220         for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2221                 ASSERT((caddr_t)key >= tomax);
2222                 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2223
2224                 if (hashval != key->dtak_hashval || key->dtak_size != size)
2225                         continue;
2226
2227                 kdata = key->dtak_data;
2228                 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2229
2230                 for (act = agg->dtag_first; act->dta_intuple;
2231                     act = act->dta_next) {
2232                         i = act->dta_rec.dtrd_offset - agg->dtag_base;
2233                         limit = i + act->dta_rec.dtrd_size;
2234                         ASSERT(limit <= size);
2235                         isstr = DTRACEACT_ISSTRING(act);
2236
2237                         for (; i < limit; i++) {
2238                                 if (kdata[i] != data[i])
2239                                         goto next;
2240
2241                                 if (isstr && data[i] == '\0')
2242                                         break;
2243                         }
2244                 }
2245
2246                 if (action != key->dtak_action) {
2247                         /*
2248                          * We are aggregating on the same value in the same
2249                          * aggregation with two different aggregating actions.
2250                          * (This should have been picked up in the compiler,
2251                          * so we may be dealing with errant or devious DIF.)
2252                          * This is an error condition; we indicate as much,
2253                          * and return.
2254                          */
2255                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2256                         return;
2257                 }
2258
2259                 /*
2260                  * This is a hit:  we need to apply the aggregator to
2261                  * the value at this key.
2262                  */
2263                 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2264                 return;
2265 next:
2266                 continue;
2267         }
2268
2269         /*
2270          * We didn't find it.  We need to allocate some zero-filled space,
2271          * link it into the hash table appropriately, and apply the aggregator
2272          * to the (zero-filled) value.
2273          */
2274         offs = buf->dtb_offset;
2275         while (offs & (align - 1))
2276                 offs += sizeof (uint32_t);
2277
2278         /*
2279          * If we don't have enough room to both allocate a new key _and_
2280          * its associated data, increment the drop count and return.
2281          */
2282         if ((uintptr_t)tomax + offs + fsize >
2283             agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2284                 dtrace_buffer_drop(buf);
2285                 return;
2286         }
2287
2288         /*CONSTCOND*/
2289         ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2290         key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2291         agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2292
2293         key->dtak_data = kdata = tomax + offs;
2294         buf->dtb_offset = offs + fsize;
2295
2296         /*
2297          * Now copy the data across.
2298          */
2299         *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2300
2301         for (i = sizeof (dtrace_aggid_t); i < size; i++)
2302                 kdata[i] = data[i];
2303
2304         /*
2305          * Because strings are not zeroed out by default, we need to iterate
2306          * looking for actions that store strings, and we need to explicitly
2307          * pad these strings out with zeroes.
2308          */
2309         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2310                 int nul;
2311
2312                 if (!DTRACEACT_ISSTRING(act))
2313                         continue;
2314
2315                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2316                 limit = i + act->dta_rec.dtrd_size;
2317                 ASSERT(limit <= size);
2318
2319                 for (nul = 0; i < limit; i++) {
2320                         if (nul) {
2321                                 kdata[i] = '\0';
2322                                 continue;
2323                         }
2324
2325                         if (data[i] != '\0')
2326                                 continue;
2327
2328                         nul = 1;
2329                 }
2330         }
2331
2332         for (i = size; i < fsize; i++)
2333                 kdata[i] = 0;
2334
2335         key->dtak_hashval = hashval;
2336         key->dtak_size = size;
2337         key->dtak_action = action;
2338         key->dtak_next = agb->dtagb_hash[ndx];
2339         agb->dtagb_hash[ndx] = key;
2340
2341         /*
2342          * Finally, apply the aggregator.
2343          */
2344         *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2345         agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2346 }
2347
2348 /*
2349  * Given consumer state, this routine finds a speculation in the INACTIVE
2350  * state and transitions it into the ACTIVE state.  If there is no speculation
2351  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2352  * incremented -- it is up to the caller to take appropriate action.
2353  */
2354 static int
2355 dtrace_speculation(dtrace_state_t *state)
2356 {
2357         int i = 0;
2358         dtrace_speculation_state_t current;
2359         uint32_t *stat = &state->dts_speculations_unavail, count;
2360
2361         while (i < state->dts_nspeculations) {
2362                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2363
2364                 current = spec->dtsp_state;
2365
2366                 if (current != DTRACESPEC_INACTIVE) {
2367                         if (current == DTRACESPEC_COMMITTINGMANY ||
2368                             current == DTRACESPEC_COMMITTING ||
2369                             current == DTRACESPEC_DISCARDING)
2370                                 stat = &state->dts_speculations_busy;
2371                         i++;
2372                         continue;
2373                 }
2374
2375                 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2376                     current, DTRACESPEC_ACTIVE) == current)
2377                         return (i + 1);
2378         }
2379
2380         /*
2381          * We couldn't find a speculation.  If we found as much as a single
2382          * busy speculation buffer, we'll attribute this failure as "busy"
2383          * instead of "unavail".
2384          */
2385         do {
2386                 count = *stat;
2387         } while (dtrace_cas32(stat, count, count + 1) != count);
2388
2389         return (0);
2390 }
2391
/*
 * This routine commits an active speculation.  If the specified speculation
 * is not in a valid state to perform a commit(), this routine will silently do
 * nothing.  The state of the specified speculation is transitioned according
 * to the state transition diagram outlined in <sys/dtrace_impl.h>
 *
 * 'state' is the consumer state that owns the speculation, 'cpu' selects
 * both the per-CPU speculative buffer that is the source of the copy and the
 * per-CPU principal buffer (dts_buffer) that is its destination, and 'which'
 * is the one-based speculation identifier.  Nothing is returned; an
 * out-of-range identifier is reported by setting CPU_DTRACE_ILLOP in the
 * flags of the specified CPU.
 */
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
        dtrace_speculation_t *spec;
        dtrace_buffer_t *src, *dest;
        uintptr_t daddr, saddr, dlimit;
        dtrace_speculation_state_t current, new;
        intptr_t offs;

        /*
         * An identifier of 0 names no speculation (it is what
         * dtrace_speculation() returns when it fails to find one), so there
         * is nothing to commit.
         */
        if (which == 0)
                return;

        if (which > state->dts_nspeculations) {
                cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
                return;
        }

        /*
         * Identifiers are one-based; the array of speculations is not.
         */
        spec = &state->dts_speculations[which - 1];
        src = &spec->dtsp_buffer[cpu];
        dest = &state->dts_buffer[cpu];

        /*
         * Pick the successor state based on the state we observe, and retry
         * until the compare-and-swap installs it without the state having
         * changed underneath us.
         */
        do {
                current = spec->dtsp_state;

                /*
                 * If the speculation is already in COMMITTINGMANY, we leave
                 * the state alone and go straight to copying this CPU's
                 * data.  Note that 'new' is not assigned on this path.
                 */
                if (current == DTRACESPEC_COMMITTINGMANY)
                        break;

                switch (current) {
                case DTRACESPEC_INACTIVE:
                case DTRACESPEC_DISCARDING:
                        return;

                case DTRACESPEC_COMMITTING:
                        /*
                         * This is only possible if we are (a) commit()'ing
                         * without having done a prior speculate() on this CPU
                         * and (b) racing with another commit() on a different
                         * CPU.  There's nothing to do -- we just assert that
                         * our offset is 0.
                         */
                        ASSERT(src->dtb_offset == 0);
                        return;

                case DTRACESPEC_ACTIVE:
                        new = DTRACESPEC_COMMITTING;
                        break;

                case DTRACESPEC_ACTIVEONE:
                        /*
                         * This speculation is active on one CPU.  If our
                         * buffer offset is non-zero, we know that the one CPU
                         * must be us.  Otherwise, we are committing on a
                         * different CPU from the speculate(), and we must
                         * rely on being asynchronously cleaned.
                         */
                        if (src->dtb_offset != 0) {
                                new = DTRACESPEC_COMMITTING;
                                break;
                        }
                        /*FALLTHROUGH*/

                case DTRACESPEC_ACTIVEMANY:
                        new = DTRACESPEC_COMMITTINGMANY;
                        break;

                default:
                        ASSERT(0);
                }
        } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
            current, new) != current);

        /*
         * We have set the state to indicate that we are committing this
         * speculation.  Now reserve the necessary space in the destination
         * buffer.  (A negative return from dtrace_buffer_reserve() is
         * treated as failure:  we record a drop against the destination
         * and skip the copy.)
         */
        if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
            sizeof (uint64_t), state, NULL)) < 0) {
                dtrace_buffer_drop(dest);
                goto out;
        }

        /*
         * We have the space; copy the buffer across.  (Note that this is a
         * highly suboptimal bcopy(); in the unlikely event that this becomes
         * a serious performance issue, a high-performance DTrace-specific
         * bcopy() should obviously be invented.)
         */
        daddr = (uintptr_t)dest->dtb_tomax + offs;
        dlimit = daddr + src->dtb_offset;
        saddr = (uintptr_t)src->dtb_tomax;

        /*
         * First, the aligned portion.  (That is, as many whole 64-bit words
         * as fit before dlimit.)
         */
        while (dlimit - daddr >= sizeof (uint64_t)) {
                *((uint64_t *)daddr) = *((uint64_t *)saddr);

                daddr += sizeof (uint64_t);
                saddr += sizeof (uint64_t);
        }

        /*
         * Now any left-over bit...  (These are the trailing bytes, fewer
         * than sizeof (uint64_t) of them, copied one at a time.)
         */
        while (dlimit - daddr)
                *((uint8_t *)daddr++) = *((uint8_t *)saddr++);

        /*
         * Finally, commit the reserved space in the destination buffer.
         */
        dest->dtb_offset = offs + src->dtb_offset;

out:
        /*
         * If we're lucky enough to be the only active CPU on this speculation
         * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
         * (Here 'current' still holds the state that we observed _before_
         * our own transition.  When it is not DTRACESPEC_ACTIVEONE, the
         * second operand -- and with it 'new' -- is never evaluated.)
         */
        if (current == DTRACESPEC_ACTIVE ||
            (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
                uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
                    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);

                ASSERT(rval == DTRACESPEC_COMMITTING);
        }

        /*
         * Whether or not the copy took place, this CPU's speculative buffer
         * is now emptied:  its offset is reset, and its drop count is folded
         * into dtb_xamot_drops before being zeroed.
         */
        src->dtb_offset = 0;
        src->dtb_xamot_drops += src->dtb_drops;
        src->dtb_drops = 0;
}
2529
2530 /*
2531  * This routine discards an active speculation.  If the specified speculation
2532  * is not in a valid state to perform a discard(), this routine will silently
2533  * do nothing.  The state of the specified speculation is transitioned
2534  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2535  */
2536 static void
2537 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2538     dtrace_specid_t which)
2539 {
2540         dtrace_speculation_t *spec;
2541         dtrace_speculation_state_t current, new;
2542         dtrace_buffer_t *buf;
2543
2544         if (which == 0)
2545                 return;
2546
2547         if (which > state->dts_nspeculations) {
2548                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2549                 return;
2550         }
2551
2552         spec = &state->dts_speculations[which - 1];
2553         buf = &spec->dtsp_buffer[cpu];
2554
2555         do {
2556                 current = spec->dtsp_state;
2557
2558                 switch (current) {
2559                 case DTRACESPEC_INACTIVE:
2560                 case DTRACESPEC_COMMITTINGMANY:
2561                 case DTRACESPEC_COMMITTING:
2562                 case DTRACESPEC_DISCARDING:
2563                         return;
2564
2565                 case DTRACESPEC_ACTIVE:
2566                 case DTRACESPEC_ACTIVEMANY:
2567                         new = DTRACESPEC_DISCARDING;
2568                         break;
2569
2570                 case DTRACESPEC_ACTIVEONE:
2571                         if (buf->dtb_offset != 0) {
2572                                 new = DTRACESPEC_INACTIVE;
2573                         } else {
2574                                 new = DTRACESPEC_DISCARDING;
2575                         }
2576                         break;
2577
2578                 default:
2579                         ASSERT(0);
2580                 }
2581         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2582             current, new) != current);
2583
2584         buf->dtb_offset = 0;
2585         buf->dtb_drops = 0;
2586 }
2587
2588 /*
2589  * Note:  not called from probe context.  This function is called
2590  * asynchronously from cross call context to clean any speculations that are
2591  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2592  * transitioned back to the INACTIVE state until all CPUs have cleaned the
2593  * speculation.
2594  */
2595 static void
2596 dtrace_speculation_clean_here(dtrace_state_t *state)
2597 {
2598         dtrace_icookie_t cookie;
2599         processorid_t cpu = CPU->cpu_id;
2600         dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2601         dtrace_specid_t i;
2602
2603         cookie = dtrace_interrupt_disable();
2604
2605         if (dest->dtb_tomax == NULL) {
2606                 dtrace_interrupt_enable(cookie);
2607                 return;
2608         }
2609
2610         for (i = 0; i < state->dts_nspeculations; i++) {
2611                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2612                 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2613
2614                 if (src->dtb_tomax == NULL)
2615                         continue;
2616
2617                 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2618                         src->dtb_offset = 0;
2619                         continue;
2620                 }
2621
2622                 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2623                         continue;
2624
2625                 if (src->dtb_offset == 0)
2626                         continue;
2627
2628                 dtrace_speculation_commit(state, cpu, i + 1);
2629         }
2630
2631         dtrace_interrupt_enable(cookie);
2632 }
2633
2634 /*
2635  * Note:  not called from probe context.  This function is called
2636  * asynchronously (and at a regular interval) to clean any speculations that
2637  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2638  * is work to be done, it cross calls all CPUs to perform that work;
2639  * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2640  * INACTIVE state until they have been cleaned by all CPUs.
2641  */
2642 static void
2643 dtrace_speculation_clean(dtrace_state_t *state)
2644 {
2645         int work = 0, rv;
2646         dtrace_specid_t i;
2647
2648         for (i = 0; i < state->dts_nspeculations; i++) {
2649                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2650
2651                 ASSERT(!spec->dtsp_cleaning);
2652
2653                 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2654                     spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2655                         continue;
2656
2657                 work++;
2658                 spec->dtsp_cleaning = 1;
2659         }
2660
2661         if (!work)
2662                 return;
2663
2664         dtrace_xcall(DTRACE_CPUALL,
2665             (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2666
2667         /*
2668          * We now know that all CPUs have committed or discarded their
2669          * speculation buffers, as appropriate.  We can now set the state
2670          * to inactive.
2671          */
2672         for (i = 0; i < state->dts_nspeculations; i++) {
2673                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2674                 dtrace_speculation_state_t current, new;
2675
2676                 if (!spec->dtsp_cleaning)
2677                         continue;
2678
2679                 current = spec->dtsp_state;
2680                 ASSERT(current == DTRACESPEC_DISCARDING ||
2681                     current == DTRACESPEC_COMMITTINGMANY);
2682
2683                 new = DTRACESPEC_INACTIVE;
2684
2685                 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2686                 ASSERT(rv == current);
2687                 spec->dtsp_cleaning = 0;
2688         }
2689 }
2690
2691 /*
2692  * Called as part of a speculate() to get the speculative buffer associated
2693  * with a given speculation.  Returns NULL if the specified speculation is not
2694  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2695  * the active CPU is not the specified CPU -- the speculation will be
2696  * atomically transitioned into the ACTIVEMANY state.
2697  */
2698 static dtrace_buffer_t *
2699 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2700     dtrace_specid_t which)
2701 {
2702         dtrace_speculation_t *spec;
2703         dtrace_speculation_state_t current, new;
2704         dtrace_buffer_t *buf;
2705
2706         if (which == 0)
2707                 return (NULL);
2708
2709         if (which > state->dts_nspeculations) {
2710                 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2711                 return (NULL);
2712         }
2713
2714         spec = &state->dts_speculations[which - 1];
2715         buf = &spec->dtsp_buffer[cpuid];
2716
2717         do {
2718                 current = spec->dtsp_state;
2719
2720                 switch (current) {
2721                 case DTRACESPEC_INACTIVE:
2722                 case DTRACESPEC_COMMITTINGMANY:
2723                 case DTRACESPEC_DISCARDING:
2724                         return (NULL);
2725
2726                 case DTRACESPEC_COMMITTING:
2727                         ASSERT(buf->dtb_offset == 0);
2728                         return (NULL);
2729
2730                 case DTRACESPEC_ACTIVEONE:
2731                         /*
2732                          * This speculation is currently active on one CPU.
2733                          * Check the offset in the buffer; if it's non-zero,
2734                          * that CPU must be us (and we leave the state alone).
2735                          * If it's zero, assume that we're starting on a new
2736                          * CPU -- and change the state to indicate that the
2737                          * speculation is active on more than one CPU.
2738                          */
2739                         if (buf->dtb_offset != 0)
2740                                 return (buf);
2741
2742                         new = DTRACESPEC_ACTIVEMANY;
2743                         break;
2744
2745                 case DTRACESPEC_ACTIVEMANY:
2746                         return (buf);
2747
2748                 case DTRACESPEC_ACTIVE:
2749                         new = DTRACESPEC_ACTIVEONE;
2750                         break;
2751
2752                 default:
2753                         ASSERT(0);
2754                 }
2755         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2756             current, new) != current);
2757
2758         ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2759         return (buf);
2760 }
2761
2762 /*
2763  * Return a string.  In the event that the user lacks the privilege to access
2764  * arbitrary kernel memory, we copy the string out to scratch memory so that we
2765  * don't fail access checking.
2766  *
2767  * dtrace_dif_variable() uses this routine as a helper for various
2768  * builtin values such as 'execname' and 'probefunc.'
2769  */
2770 uintptr_t
2771 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2772     dtrace_mstate_t *mstate)
2773 {
2774         uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2775         uintptr_t ret;
2776         size_t strsz;
2777
2778         /*
2779          * The easy case: this probe is allowed to read all of memory, so
2780          * we can just return this as a vanilla pointer.
2781          */
2782         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2783                 return (addr);
2784
2785         /*
2786          * This is the tougher case: we copy the string in question from
2787          * kernel memory into scratch memory and return it that way: this
2788          * ensures that we won't trip up when access checking tests the
2789          * BYREF return value.
2790          */
2791         strsz = dtrace_strlen((char *)addr, size) + 1;
2792
2793         if (mstate->dtms_scratch_ptr + strsz >
2794             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2795                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2796                 return (NULL);
2797         }
2798
2799         dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2800             strsz);
2801         ret = mstate->dtms_scratch_ptr;
2802         mstate->dtms_scratch_ptr += strsz;
2803         return (ret);
2804 }
2805
2806 /*
2807  * This function implements the DIF emulator's variable lookups.  The emulator
2808  * passes a reserved variable identifier and optional built-in array index.
2809  */
2810 static uint64_t
2811 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2812     uint64_t ndx)
2813 {
2814         /*
2815          * If we're accessing one of the uncached arguments, we'll turn this
2816          * into a reference in the args array.
2817          */
2818         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2819                 ndx = v - DIF_VAR_ARG0;
2820                 v = DIF_VAR_ARGS;
2821         }
2822
2823         switch (v) {
2824         case DIF_VAR_ARGS:
2825                 if (!(mstate->dtms_access & DTRACE_ACCESS_ARGS)) {
2826                         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |=
2827                             CPU_DTRACE_KPRIV;
2828                         return (0);
2829                 }
2830
2831                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2832                 if (ndx >= sizeof (mstate->dtms_arg) /
2833                     sizeof (mstate->dtms_arg[0])) {
2834                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2835                         dtrace_provider_t *pv;
2836                         uint64_t val;
2837
2838                         pv = mstate->dtms_probe->dtpr_provider;
2839                         if (pv->dtpv_pops.dtps_getargval != NULL)
2840                                 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2841                                     mstate->dtms_probe->dtpr_id,
2842                                     mstate->dtms_probe->dtpr_arg, ndx, aframes);
2843                         else
2844                                 val = dtrace_getarg(ndx, aframes);
2845
2846                         /*
2847                          * This is regrettably required to keep the compiler
2848                          * from tail-optimizing the call to dtrace_getarg().
2849                          * The condition always evaluates to true, but the
2850                          * compiler has no way of figuring that out a priori.
2851                          * (None of this would be necessary if the compiler
2852                          * could be relied upon to _always_ tail-optimize
2853                          * the call to dtrace_getarg() -- but it can't.)
2854                          */
2855                         if (mstate->dtms_probe != NULL)
2856                                 return (val);
2857
2858                         ASSERT(0);
2859                 }
2860
2861                 return (mstate->dtms_arg[ndx]);
2862
2863         case DIF_VAR_UREGS: {
2864                 klwp_t *lwp;
2865
2866                 if (!dtrace_priv_proc(state, mstate))
2867                         return (0);
2868
2869                 if ((lwp = curthread->t_lwp) == NULL) {
2870                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
2871                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL;
2872                         return (0);
2873                 }
2874
2875                 return (dtrace_getreg(lwp->lwp_regs, ndx));
2876         }
2877
2878         case DIF_VAR_VMREGS: {
2879                 uint64_t rval;
2880
2881                 if (!dtrace_priv_kernel(state))
2882                         return (0);
2883
2884                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2885
2886                 rval = dtrace_getvmreg(ndx,
2887                     &cpu_core[CPU->cpu_id].cpuc_dtrace_flags);
2888
2889                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2890
2891                 return (rval);
2892         }
2893
2894         case DIF_VAR_CURTHREAD:
2895                 if (!dtrace_priv_kernel(state))
2896                         return (0);
2897                 return ((uint64_t)(uintptr_t)curthread);
2898
2899         case DIF_VAR_TIMESTAMP:
2900                 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
2901                         mstate->dtms_timestamp = dtrace_gethrtime();
2902                         mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
2903                 }
2904                 return (mstate->dtms_timestamp);
2905
2906         case DIF_VAR_VTIMESTAMP:
2907                 ASSERT(dtrace_vtime_references != 0);
2908                 return (curthread->t_dtrace_vtime);
2909
2910         case DIF_VAR_WALLTIMESTAMP:
2911                 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
2912                         mstate->dtms_walltimestamp = dtrace_gethrestime();
2913                         mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
2914                 }
2915                 return (mstate->dtms_walltimestamp);
2916
2917         case DIF_VAR_IPL:
2918                 if (!dtrace_priv_kernel(state))
2919                         return (0);
2920                 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
2921                         mstate->dtms_ipl = dtrace_getipl();
2922                         mstate->dtms_present |= DTRACE_MSTATE_IPL;
2923                 }
2924                 return (mstate->dtms_ipl);
2925
2926         case DIF_VAR_EPID:
2927                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
2928                 return (mstate->dtms_epid);
2929
2930         case DIF_VAR_ID:
2931                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
2932                 return (mstate->dtms_probe->dtpr_id);
2933
2934         case DIF_VAR_STACKDEPTH:
2935                 if (!dtrace_priv_kernel(state))
2936                         return (0);
2937                 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
2938                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2939
2940                         mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
2941                         mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
2942                 }
2943                 return (mstate->dtms_stackdepth);
2944
2945         case DIF_VAR_USTACKDEPTH:
2946                 if (!dtrace_priv_proc(state, mstate))
2947                         return (0);
2948                 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
2949                         /*
2950                          * See comment in DIF_VAR_PID.
2951                          */
2952                         if (DTRACE_ANCHORED(mstate->dtms_probe) &&
2953                             CPU_ON_INTR(CPU)) {
2954                                 mstate->dtms_ustackdepth = 0;
2955                         } else {
2956                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
2957                                 mstate->dtms_ustackdepth =
2958                                     dtrace_getustackdepth();
2959                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
2960                         }
2961                         mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
2962                 }
2963                 return (mstate->dtms_ustackdepth);
2964
2965         case DIF_VAR_CALLER:
2966                 if (!dtrace_priv_kernel(state))
2967                         return (0);
2968                 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
2969                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2970
2971                         if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
2972                                 /*
2973                                  * If this is an unanchored probe, we are
2974                                  * required to go through the slow path:
2975                                  * dtrace_caller() only guarantees correct
2976                                  * results for anchored probes.
2977                                  */
2978                                 pc_t caller[2];
2979
2980                                 dtrace_getpcstack(caller, 2, aframes,
2981                                     (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
2982                                 mstate->dtms_caller = caller[1];
2983                         } else if ((mstate->dtms_caller =
2984                             dtrace_caller(aframes)) == -1) {
2985                                 /*
2986                                  * We have failed to do this the quick way;
2987                                  * we must resort to the slower approach of
2988                                  * calling dtrace_getpcstack().
2989                                  */
2990                                 pc_t caller;
2991
2992                                 dtrace_getpcstack(&caller, 1, aframes, NULL);
2993                                 mstate->dtms_caller = caller;
2994                         }
2995
2996                         mstate->dtms_present |= DTRACE_MSTATE_CALLER;
2997                 }
2998                 return (mstate->dtms_caller);
2999
3000         case DIF_VAR_UCALLER:
3001                 if (!dtrace_priv_proc(state, mstate))
3002                         return (0);
3003
3004                 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3005                         uint64_t ustack[3];
3006
3007                         /*
3008                          * dtrace_getupcstack() fills in the first uint64_t
3009                          * with the current PID.  The second uint64_t will
3010                          * be the program counter at user-level.  The third
3011                          * uint64_t will contain the caller, which is what
3012                          * we're after.
3013                          */
3014                         ustack[2] = NULL;
3015                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3016                         dtrace_getupcstack(ustack, 3);
3017                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3018                         mstate->dtms_ucaller = ustack[2];
3019                         mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3020                 }
3021
3022                 return (mstate->dtms_ucaller);
3023
3024         case DIF_VAR_PROBEPROV:
3025                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3026                 return (dtrace_dif_varstr(
3027                     (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3028                     state, mstate));
3029
3030         case DIF_VAR_PROBEMOD:
3031                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3032                 return (dtrace_dif_varstr(
3033                     (uintptr_t)mstate->dtms_probe->dtpr_mod,
3034                     state, mstate));
3035
3036         case DIF_VAR_PROBEFUNC:
3037                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3038                 return (dtrace_dif_varstr(
3039                     (uintptr_t)mstate->dtms_probe->dtpr_func,
3040                     state, mstate));
3041
3042         case DIF_VAR_PROBENAME:
3043                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3044                 return (dtrace_dif_varstr(
3045                     (uintptr_t)mstate->dtms_probe->dtpr_name,
3046                     state, mstate));
3047
3048         case DIF_VAR_PID:
3049                 if (!dtrace_priv_proc(state, mstate))
3050                         return (0);
3051
3052                 /*
3053                  * Note that we are assuming that an unanchored probe is
3054                  * always due to a high-level interrupt.  (And we're assuming
3055                  * that there is only a single high-level interrupt.)
3056                  */
3057                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3058                         return (pid0.pid_id);
3059
3060                 /*
3061                  * It is always safe to dereference one's own t_procp pointer:
3062                  * it always points to a valid, allocated proc structure.
3063                  * Further, it is always safe to dereference the p_pidp member
3064                  * of one's own proc structure.  (These are truisms because
3065                  * threads and processes don't clean up their own state --
3066                  * they leave that task to whomever reaps them.)
3067                  */
3068                 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3069
3070         case DIF_VAR_PPID:
3071                 if (!dtrace_priv_proc(state, mstate))
3072                         return (0);
3073
3074                 /*
3075                  * See comment in DIF_VAR_PID.
3076                  */
3077                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3078                         return (pid0.pid_id);
3079
3080                 /*
3081                  * It is always safe to dereference one's own t_procp pointer:
3082                  * it always points to a valid, allocated proc structure.
3083                  * (This is true because threads don't clean up their own
3084                  * state -- they leave that task to whomever reaps them.)
3085                  */
3086                 return ((uint64_t)curthread->t_procp->p_ppid);
3087
3088         case DIF_VAR_TID:
3089                 /*
3090                  * See comment in DIF_VAR_PID.
3091                  */
3092                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3093                         return (0);
3094
3095                 return ((uint64_t)curthread->t_tid);
3096
3097         case DIF_VAR_EXECNAME:
3098                 if (!dtrace_priv_proc(state, mstate))
3099                         return (0);
3100
3101                 /*
3102                  * See comment in DIF_VAR_PID.
3103                  */
3104                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3105                         return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3106
3107                 /*
3108                  * It is always safe to dereference one's own t_procp pointer:
3109                  * it always points to a valid, allocated proc structure.
3110                  * (This is true because threads don't clean up their own
3111                  * state -- they leave that task to whomever reaps them.)
3112                  */
3113                 return (dtrace_dif_varstr(
3114                     (uintptr_t)curthread->t_procp->p_user.u_comm,
3115                     state, mstate));
3116
3117         case DIF_VAR_ZONENAME:
3118                 if (!dtrace_priv_proc(state, mstate))
3119                         return (0);
3120
3121                 /*
3122                  * See comment in DIF_VAR_PID.
3123                  */
3124                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3125                         return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3126
3127                 /*
3128                  * It is always safe to dereference one's own t_procp pointer:
3129                  * it always points to a valid, allocated proc structure.
3130                  * (This is true because threads don't clean up their own
3131                  * state -- they leave that task to whomever reaps them.)
3132                  */
3133                 return (dtrace_dif_varstr(
3134                     (uintptr_t)curthread->t_procp->p_zone->zone_name,
3135                     state, mstate));
3136
3137         case DIF_VAR_UID:
3138                 if (!dtrace_priv_proc(state, mstate))
3139                         return (0);
3140
3141                 /*
3142                  * See comment in DIF_VAR_PID.
3143                  */
3144                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3145                         return ((uint64_t)p0.p_cred->cr_uid);
3146
3147                 /*
3148                  * It is always safe to dereference one's own t_procp pointer:
3149                  * it always points to a valid, allocated proc structure.
3150                  * (This is true because threads don't clean up their own
3151                  * state -- they leave that task to whomever reaps them.)
3152                  *
3153                  * Additionally, it is safe to dereference one's own process
3154                  * credential, since this is never NULL after process birth.
3155                  */
3156                 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3157
3158         case DIF_VAR_GID:
3159                 if (!dtrace_priv_proc(state, mstate))
3160                         return (0);
3161
3162                 /*
3163                  * See comment in DIF_VAR_PID.
3164                  */
3165                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3166                         return ((uint64_t)p0.p_cred->cr_gid);
3167
3168                 /*
3169                  * It is always safe to dereference one's own t_procp pointer:
3170                  * it always points to a valid, allocated proc structure.
3171                  * (This is true because threads don't clean up their own
3172                  * state -- they leave that task to whomever reaps them.)
3173                  *
3174                  * Additionally, it is safe to dereference one's own process
3175                  * credential, since this is never NULL after process birth.
3176                  */
3177                 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3178
3179         case DIF_VAR_ERRNO: {
3180                 klwp_t *lwp;
3181                 if (!dtrace_priv_proc(state, mstate))
3182                         return (0);
3183
3184                 /*
3185                  * See comment in DIF_VAR_PID.
3186                  */
3187                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3188                         return (0);
3189
3190                 /*
3191                  * It is always safe to dereference one's own t_lwp pointer in
3192                  * the event that this pointer is non-NULL.  (This is true
3193                  * because threads and lwps don't clean up their own state --
3194                  * they leave that task to whomever reaps them.)
3195                  */
3196                 if ((lwp = curthread->t_lwp) == NULL)
3197                         return (0);
3198
3199                 return ((uint64_t)lwp->lwp_errno);
3200         }
3201         default:
3202                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3203                 return (0);
3204         }
3205 }
3206
3207 /*
3208  * Emulate the execution of DTrace DIF subroutines invoked by the call opcode.
3209  * Notice that we don't bother validating the proper number of arguments or
3210  * their types in the tuple stack.  This isn't needed:  all argument
3211  * interpretation is made safe by our load safety -- the worst that can
3212  * happen is that a bogus program can obtain bogus results.
3213  */
3214 static void
3215 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3216     dtrace_key_t *tupregs, int nargs,
3217     dtrace_mstate_t *mstate, dtrace_state_t *state)
3218 {
3219         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3220         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3221         dtrace_vstate_t *vstate = &state->dts_vstate;
3222
3223         union {
3224                 mutex_impl_t mi;
3225                 uint64_t mx;
3226         } m;
3227
3228         union {
3229                 krwlock_t ri;
3230                 uintptr_t rw;
3231         } r;
3232
3233         switch (subr) {
3234         case DIF_SUBR_RAND:
3235                 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3236                 break;
3237
3238         case DIF_SUBR_MUTEX_OWNED:
3239                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3240                     mstate, vstate)) {
3241                         regs[rd] = NULL;
3242                         break;
3243                 }
3244
3245                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3246                 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3247                         regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3248                 else
3249                         regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3250                 break;
3251
3252         case DIF_SUBR_MUTEX_OWNER:
3253                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3254                     mstate, vstate)) {
3255                         regs[rd] = NULL;
3256                         break;
3257                 }
3258
3259                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3260                 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3261                     MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3262                         regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3263                 else
3264                         regs[rd] = 0;
3265                 break;
3266
3267         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3268                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3269                     mstate, vstate)) {
3270                         regs[rd] = NULL;
3271                         break;
3272                 }
3273
3274                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3275                 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3276                 break;
3277
3278         case DIF_SUBR_MUTEX_TYPE_SPIN:
3279                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3280                     mstate, vstate)) {
3281                         regs[rd] = NULL;
3282                         break;
3283                 }
3284
3285                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3286                 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3287                 break;
3288
3289         case DIF_SUBR_RW_READ_HELD: {
3290                 uintptr_t tmp;
3291
3292                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3293                     mstate, vstate)) {
3294                         regs[rd] = NULL;
3295                         break;
3296                 }
3297
3298                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3299                 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3300                 break;
3301         }
3302
3303         case DIF_SUBR_RW_WRITE_HELD:
3304                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3305                     mstate, vstate)) {
3306                         regs[rd] = NULL;
3307                         break;
3308                 }
3309
3310                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3311                 regs[rd] = _RW_WRITE_HELD(&r.ri);
3312                 break;
3313
3314         case DIF_SUBR_RW_ISWRITER:
3315                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3316                     mstate, vstate)) {
3317                         regs[rd] = NULL;
3318                         break;
3319                 }
3320
3321                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3322                 regs[rd] = _RW_ISWRITER(&r.ri);
3323                 break;
3324
3325         case DIF_SUBR_BCOPY: {
3326                 /*
3327                  * We need to be sure that the destination is in the scratch
3328                  * region -- no other region is allowed.
3329                  */
3330                 uintptr_t src = tupregs[0].dttk_value;
3331                 uintptr_t dest = tupregs[1].dttk_value;
3332                 size_t size = tupregs[2].dttk_value;
3333
3334                 if (!dtrace_inscratch(dest, size, mstate)) {
3335                         *flags |= CPU_DTRACE_BADADDR;
3336                         *illval = regs[rd];
3337                         break;
3338                 }
3339
3340                 if (!dtrace_canload(src, size, mstate, vstate)) {
3341                         regs[rd] = NULL;
3342                         break;
3343                 }
3344
3345                 dtrace_bcopy((void *)src, (void *)dest, size);
3346                 break;
3347         }
3348
3349         case DIF_SUBR_ALLOCA:
3350         case DIF_SUBR_COPYIN: {
3351                 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3352                 uint64_t size =
3353                     tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3354                 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3355
3356                 /*
3357                  * This action doesn't require any credential checks since
3358                  * probes will not activate in user contexts to which the
3359                  * enabling user does not have permissions.
3360                  */
3361
3362                 /*
3363                  * Rounding up the user allocation size could have overflowed
3364                  * a large, bogus allocation (like -1ULL) to 0.
3365                  */
3366                 if (scratch_size < size ||
3367                     !DTRACE_INSCRATCH(mstate, scratch_size)) {
3368                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3369                         regs[rd] = NULL;
3370                         break;
3371                 }
3372
3373                 if (subr == DIF_SUBR_COPYIN) {
3374                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3375                         dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3376                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3377                 }
3378
3379                 mstate->dtms_scratch_ptr += scratch_size;
3380                 regs[rd] = dest;
3381                 break;
3382         }
3383
3384         case DIF_SUBR_COPYINTO: {
3385                 uint64_t size = tupregs[1].dttk_value;
3386                 uintptr_t dest = tupregs[2].dttk_value;
3387
3388                 /*
3389                  * This action doesn't require any credential checks since
3390                  * probes will not activate in user contexts to which the
3391                  * enabling user does not have permissions.
3392                  */
3393                 if (!dtrace_inscratch(dest, size, mstate)) {
3394                         *flags |= CPU_DTRACE_BADADDR;
3395                         *illval = regs[rd];
3396                         break;
3397                 }
3398
3399                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3400                 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3401                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3402                 break;
3403         }
3404
3405         case DIF_SUBR_COPYINSTR: {
3406                 uintptr_t dest = mstate->dtms_scratch_ptr;
3407                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3408
3409                 if (nargs > 1 && tupregs[1].dttk_value < size)
3410                         size = tupregs[1].dttk_value + 1;
3411
3412                 /*
3413                  * This action doesn't require any credential checks since
3414                  * probes will not activate in user contexts to which the
3415                  * enabling user does not have permissions.
3416                  */
3417                 if (!DTRACE_INSCRATCH(mstate, size)) {
3418                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3419                         regs[rd] = NULL;
3420                         break;
3421                 }
3422
3423                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3424                 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3425                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3426
3427                 ((char *)dest)[size - 1] = '\0';
3428                 mstate->dtms_scratch_ptr += size;
3429                 regs[rd] = dest;
3430                 break;
3431         }
3432
3433         case DIF_SUBR_MSGSIZE:
3434         case DIF_SUBR_MSGDSIZE: {
3435                 uintptr_t baddr = tupregs[0].dttk_value, daddr;
3436                 uintptr_t wptr, rptr;
3437                 size_t count = 0;
3438                 int cont = 0;
3439
3440                 while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3441
3442                         if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3443                             vstate)) {
3444                                 regs[rd] = NULL;
3445                                 break;
3446                         }
3447
3448                         wptr = dtrace_loadptr(baddr +
3449                             offsetof(mblk_t, b_wptr));
3450
3451                         rptr = dtrace_loadptr(baddr +
3452                             offsetof(mblk_t, b_rptr));
3453
3454                         if (wptr < rptr) {
3455                                 *flags |= CPU_DTRACE_BADADDR;
3456                                 *illval = tupregs[0].dttk_value;
3457                                 break;
3458                         }
3459
3460                         daddr = dtrace_loadptr(baddr +
3461                             offsetof(mblk_t, b_datap));
3462
3463                         baddr = dtrace_loadptr(baddr +
3464                             offsetof(mblk_t, b_cont));
3465
3466                         /*
3467                          * We want to protect against denial-of-service here,
3468                          * so we're only going to search the list for
3469                          * dtrace_msgdsize_max mblks.
3470                          */
3471                         if (cont++ > dtrace_msgdsize_max) {
3472                                 *flags |= CPU_DTRACE_ILLOP;
3473                                 break;
3474                         }
3475
3476                         if (subr == DIF_SUBR_MSGDSIZE) {
3477                                 if (dtrace_load8(daddr +
3478                                     offsetof(dblk_t, db_type)) != M_DATA)
3479                                         continue;
3480                         }
3481
3482                         count += wptr - rptr;
3483                 }
3484
3485                 if (!(*flags & CPU_DTRACE_FAULT))
3486                         regs[rd] = count;
3487
3488                 break;
3489         }
3490
3491         case DIF_SUBR_PROGENYOF: {
3492                 pid_t pid = tupregs[0].dttk_value;
3493                 proc_t *p;
3494                 int rval = 0;
3495
3496                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3497
3498                 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3499                         if (p->p_pidp->pid_id == pid) {
3500                                 rval = 1;
3501                                 break;
3502                         }
3503                 }
3504
3505                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3506
3507                 regs[rd] = rval;
3508                 break;
3509         }
3510
3511         case DIF_SUBR_SPECULATION:
3512                 regs[rd] = dtrace_speculation(state);
3513                 break;
3514
3515         case DIF_SUBR_COPYOUT: {
3516                 uintptr_t kaddr = tupregs[0].dttk_value;
3517                 uintptr_t uaddr = tupregs[1].dttk_value;
3518                 uint64_t size = tupregs[2].dttk_value;
3519
3520                 if (!dtrace_destructive_disallow &&
3521                     dtrace_priv_proc_control(state, mstate) &&
3522                     !dtrace_istoxic(kaddr, size)) {
3523                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3524                         dtrace_copyout(kaddr, uaddr, size, flags);
3525                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3526                 }
3527                 break;
3528         }
3529
3530         case DIF_SUBR_COPYOUTSTR: {
3531                 uintptr_t kaddr = tupregs[0].dttk_value;
3532                 uintptr_t uaddr = tupregs[1].dttk_value;
3533                 uint64_t size = tupregs[2].dttk_value;
3534
3535                 if (!dtrace_destructive_disallow &&
3536                     dtrace_priv_proc_control(state, mstate) &&
3537                     !dtrace_istoxic(kaddr, size)) {
3538                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3539                         dtrace_copyoutstr(kaddr, uaddr, size, flags);
3540                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3541                 }
3542                 break;
3543         }
3544
3545         case DIF_SUBR_STRLEN: {
3546                 size_t sz;
3547                 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3548                 sz = dtrace_strlen((char *)addr,
3549                     state->dts_options[DTRACEOPT_STRSIZE]);
3550
3551                 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3552                         regs[rd] = NULL;
3553                         break;
3554                 }
3555
3556                 regs[rd] = sz;
3557
3558                 break;
3559         }
3560
3561         case DIF_SUBR_STRCHR:
3562         case DIF_SUBR_STRRCHR: {
3563                 /*
3564                  * We're going to iterate over the string looking for the
3565                  * specified character.  We will iterate until we have reached
3566                  * the string length or we have found the character.  If this
3567                  * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3568                  * of the specified character instead of the first.
3569                  */
3570                 uintptr_t saddr = tupregs[0].dttk_value;
3571                 uintptr_t addr = tupregs[0].dttk_value;
3572                 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3573                 char c, target = (char)tupregs[1].dttk_value;
3574
3575                 for (regs[rd] = NULL; addr < limit; addr++) {
3576                         if ((c = dtrace_load8(addr)) == target) {
3577                                 regs[rd] = addr;
3578
3579                                 if (subr == DIF_SUBR_STRCHR)
3580                                         break;
3581                         }
3582
3583                         if (c == '\0')
3584                                 break;
3585                 }
3586
3587                 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3588                         regs[rd] = NULL;
3589                         break;
3590                 }
3591
3592                 break;
3593         }
3594
3595         case DIF_SUBR_STRSTR:
3596         case DIF_SUBR_INDEX:
3597         case DIF_SUBR_RINDEX: {
3598                 /*
3599                  * We're going to iterate over the string looking for the
3600                  * specified string.  We will iterate until we have reached
3601                  * the string length or we have found the string.  (Yes, this
3602                  * is done in the most naive way possible -- but considering
3603                  * that the string we're searching for is likely to be
3604                  * relatively short, the complexity of Rabin-Karp or similar
3605                  * hardly seems merited.)
3606                  */
3607                 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3608                 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3609                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3610                 size_t len = dtrace_strlen(addr, size);
3611                 size_t sublen = dtrace_strlen(substr, size);
3612                 char *limit = addr + len, *orig = addr;
3613                 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3614                 int inc = 1;
3615
3616                 regs[rd] = notfound;
3617
3618                 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3619                         regs[rd] = NULL;
3620                         break;
3621                 }
3622
3623                 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3624                     vstate)) {
3625                         regs[rd] = NULL;
3626                         break;
3627                 }
3628
3629                 /*
3630                  * strstr() and index()/rindex() have similar semantics if
3631                  * both strings are the empty string: strstr() returns a
3632                  * pointer to the (empty) string, and index() and rindex()
3633                  * both return index 0 (regardless of any position argument).
3634                  */
3635                 if (sublen == 0 && len == 0) {
3636                         if (subr == DIF_SUBR_STRSTR)
3637                                 regs[rd] = (uintptr_t)addr;
3638                         else
3639                                 regs[rd] = 0;
3640                         break;
3641                 }
3642
3643                 if (subr != DIF_SUBR_STRSTR) {
3644                         if (subr == DIF_SUBR_RINDEX) {
3645                                 limit = orig - 1;
3646                                 addr += len;
3647                                 inc = -1;
3648                         }
3649
3650                         /*
3651                          * Both index() and rindex() take an optional position
3652                          * argument that denotes the starting position.
3653                          */
3654                         if (nargs == 3) {
3655                                 int64_t pos = (int64_t)tupregs[2].dttk_value;
3656
3657                                 /*
3658                                  * If the position argument to index() is
3659                                  * negative, Perl implicitly clamps it at
3660                                  * zero.  This semantic is a little surprising
3661                                  * given the special meaning of negative
3662                                  * positions to similar Perl functions like
3663                                  * substr(), but it appears to reflect a
3664                                  * notion that index() can start from a
3665                                  * negative index and increment its way up to
3666                                  * the string.  Given this notion, Perl's
3667                                  * rindex() is at least self-consistent in
3668                                  * that it implicitly clamps positions greater
3669                                  * than the string length to be the string
3670                                  * length.  Where Perl completely loses
3671                                  * coherence, however, is when the specified
3672                                  * substring is the empty string ("").  In
3673                                  * this case, even if the position is
3674                                  * negative, rindex() returns 0 -- and even if
3675                                  * the position is greater than the length,
3676                                  * index() returns the string length.  These
3677                                  * semantics violate the notion that index()
3678                                  * should never return a value less than the
3679                                  * specified position and that rindex() should
3680                                  * never return a value greater than the
3681                                  * specified position.  (One assumes that
3682                                  * these semantics are artifacts of Perl's
3683                                  * implementation and not the results of
3684                                  * deliberate design -- it beggars belief that
3685                                  * even Larry Wall could desire such oddness.)
3686                                  * While in the abstract one would wish for
3687                                  * consistent position semantics across
3688                                  * substr(), index() and rindex() -- or at the
3689                                  * very least self-consistent position
3690                                  * semantics for index() and rindex() -- we
3691                                  * instead opt to keep with the extant Perl
3692                                  * semantics, in all their broken glory.  (Do
3693                                  * we have more desire to maintain Perl's
3694                                  * semantics than Perl does?  Probably.)
3695                                  */
3696                                 if (subr == DIF_SUBR_RINDEX) {
3697                                         if (pos < 0) {
3698                                                 if (sublen == 0)
3699                                                         regs[rd] = 0;
3700                                                 break;
3701                                         }
3702
3703                                         if (pos > len)
3704                                                 pos = len;
3705                                 } else {
3706                                         if (pos < 0)
3707                                                 pos = 0;
3708
3709                                         if (pos >= len) {
3710                                                 if (sublen == 0)
3711                                                         regs[rd] = len;
3712                                                 break;
3713                                         }
3714                                 }
3715
3716                                 addr = orig + pos;
3717                         }
3718                 }
3719
3720                 for (regs[rd] = notfound; addr != limit; addr += inc) {
3721                         if (dtrace_strncmp(addr, substr, sublen) == 0) {
3722                                 if (subr != DIF_SUBR_STRSTR) {
3723                                         /*
3724                                          * As D index() and rindex() are
3725                                          * modeled on Perl (and not on awk),
3726                                          * we return a zero-based (and not a
3727                                          * one-based) index.  (For you Perl
3728                                          * weenies: no, we're not going to add
3729                                          * $[ -- and shouldn't you be at a con
3730                                          * or something?)
3731                                          */
3732                                         regs[rd] = (uintptr_t)(addr - orig);
3733                                         break;
3734                                 }
3735
3736                                 ASSERT(subr == DIF_SUBR_STRSTR);
3737                                 regs[rd] = (uintptr_t)addr;
3738                                 break;
3739                         }
3740                 }
3741
3742                 break;
3743         }
3744
3745         case DIF_SUBR_STRTOK: {
                     /*
                      * strtok(): copy the next token of the string in tupregs[0]
                      * (or, when that argument is NULL, of the position saved in
                      * mstate->dtms_strtok) into scratch.  The bytes of the
                      * string in tupregs[1] are the delimiters.  The result
                      * register receives the scratch copy, or NULL when no
                      * token remains or one of the checks below fails.
                      */
3746                 uintptr_t addr = tupregs[0].dttk_value;
3747                 uintptr_t tokaddr = tupregs[1].dttk_value;
3748                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3749                 uintptr_t limit, toklimit = tokaddr + size;
3750                 uint8_t c, tokmap[32];   /* 256 / 8: one bit per byte value */
3751                 char *dest = (char *)mstate->dtms_scratch_ptr;
3752                 int i;
3753
3754                 /*
3755                  * Check both the token buffer and (later) the input buffer,
3756                  * since both could be non-scratch addresses.
3757                  */
3758                 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3759                         regs[rd] = NULL;
3760                         break;
3761                 }
3762
3763                 if (!DTRACE_INSCRATCH(mstate, size)) {
3764                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3765                         regs[rd] = NULL;
3766                         break;
3767                 }
3768
3769                 if (addr == NULL) {
3770                         /*
3771                          * If the address specified is NULL, we use our saved
3772                          * strtok pointer from the mstate.  Note that this
3773                          * means that the saved strtok pointer is _only_
3774                          * valid within multiple enablings of the same probe --
3775                          * it behaves like an implicit clause-local variable.
3776                          */
3777                         addr = mstate->dtms_strtok;
3778                 } else {
3779                         /*
3780                          * If the user-specified address is non-NULL we must
3781                          * access check it.  This is the only time we have
3782                          * a chance to do so, since this address may reside
3783                          * in the string table of this clause-- future calls
3784                          * (when we fetch addr from mstate->dtms_strtok)
3785                          * would fail this access check.
3786                          */
3787                         if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3788                                 regs[rd] = NULL;
3789                                 break;
3790                         }
3791                 }
3792
3793                 /*
3794                  * First, zero the token map, and then process the token
3795                  * string -- setting a bit in the map for every character
3796                  * found in the token string.
3797                  */
3798                 for (i = 0; i < sizeof (tokmap); i++)
3799                         tokmap[i] = 0;
3800
3801                 for (; tokaddr < toklimit; tokaddr++) {
3802                         if ((c = dtrace_load8(tokaddr)) == '\0')
3803                                 break;
3804
3805                         ASSERT((c >> 3) < sizeof (tokmap));
3806                         tokmap[c >> 3] |= (1 << (c & 0x7));
3807                 }
3808
                     /*
                      * Skip any leading delimiters in the input, looking at no
                      * more than size bytes from the starting address.
                      */
3809                 for (limit = addr + size; addr < limit; addr++) {
3810                         /*
3811                          * We're looking for a character that is _not_ contained
3812                          * in the token string.
3813                          */
3814                         if ((c = dtrace_load8(addr)) == '\0')
3815                                 break;
3816
3817                         if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3818                                 break;
3819                 }
3820
3821                 if (c == '\0') {
3822                         /*
3823                          * We reached the end of the string without finding
3824                          * any character that was not in the token string.
3825                          * We return NULL in this case, and we set the saved
3826                          * address to NULL as well.
3827                          */
3828                         regs[rd] = NULL;
3829                         mstate->dtms_strtok = NULL;
3830                         break;
3831                 }
3832
3833                 /*
3834                  * From here on, we're copying into the destination string.
                      * The copy stops at the next delimiter or NUL, and the
                      * i < size - 1 bound keeps one byte for the terminator.
3835                  */
3836                 for (i = 0; addr < limit && i < size - 1; addr++) {
3837                         if ((c = dtrace_load8(addr)) == '\0')
3838                                 break;
3839
3840                         if (tokmap[c >> 3] & (1 << (c & 0x7)))
3841                                 break;
3842
3843                         ASSERT(i < size);
3844                         dest[i++] = c;
3845                 }
3846
3847                 ASSERT(i < size);
3848                 dest[i] = '\0';
3849                 regs[rd] = (uintptr_t)dest;
3850                 mstate->dtms_scratch_ptr += size;
                     /* Save the stopping point for a later call with a NULL string. */
3851                 mstate->dtms_strtok = addr;
3852                 break;
3853         }
3854
3855         case DIF_SUBR_SUBSTR: {
                     /*
                      * substr(): copy part of the string in tupregs[0] into
                      * scratch.  tupregs[1] is the starting index and tupregs[2]
                      * the length; when two or fewer arguments are passed the
                      * length defaults to the string size option.  A negative
                      * index is taken relative to the end of the string, and a
                      * negative length is taken relative to the number of
                      * characters that follow the index.  The result register
                      * receives the scratch copy, or NULL if a check fails.
                      */
3856                 uintptr_t s = tupregs[0].dttk_value;
3857                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3858                 char *d = (char *)mstate->dtms_scratch_ptr;
3859                 int64_t index = (int64_t)tupregs[1].dttk_value;
3860                 int64_t remaining = (int64_t)tupregs[2].dttk_value;
3861                 size_t len = dtrace_strlen((char *)s, size);
3862                 int64_t i;
3863
3864                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
3865                         regs[rd] = NULL;
3866                         break;
3867                 }
3868
3869                 if (!DTRACE_INSCRATCH(mstate, size)) {
3870                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3871                         regs[rd] = NULL;
3872                         break;
3873                 }
3874
3875                 if (nargs <= 2)
3876                         remaining = (int64_t)size;
3877
3878                 if (index < 0) {
3879                         index += len;
3880
                             /*
                              * The index lies before the start of the string, but
                              * the requested span reaches into it: drop the part
                              * that falls before the string and start at 0.
                              */
3881                         if (index < 0 && index + remaining > 0) {
3882                                 remaining += index;
3883                                 index = 0;
3884                         }
3885                 }
3886
3887                 if (index >= len || index < 0) {
3888                         remaining = 0;
3889                 } else if (remaining < 0) {
3890                         remaining += len - index;
3891                 } else if (index + remaining > size) {
3892                         remaining = size - index;
3893                 }

                     /*
                      * Only size bytes of scratch were checked above, and the
                      * terminating NUL is stored at d[i] after the loop.  Cap
                      * the copy at size - 1 characters so that the terminator
                      * cannot land at d[size] when the source has no NUL within
                      * its first size bytes.
                      */
                     if (remaining > (int64_t)size - 1)
                             remaining = (int64_t)size - 1;
3894
3895                 for (i = 0; i < remaining; i++) {
3896                         if ((d[i] = dtrace_load8(s + index + i)) == '\0')
3897                                 break;
3898                 }
3899
3900                 d[i] = '\0';
3901
3902                 mstate->dtms_scratch_ptr += size;
3903                 regs[rd] = (uintptr_t)d;
3904                 break;
3905         }
3906
3907         case DIF_SUBR_GETMAJOR:
                     /*
                      * getmajor(): shift the minor-number bits out of the value
                      * in tupregs[0] (presumably a dev_t) and mask the result to
                      * the width of a major number.  _LP64 builds use the
                      * 64-bit shift and mask constants.
                      */
3908 #ifdef _LP64
3909                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
3910 #else
3911                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
3912 #endif
3913                 break;
3914
3915         case DIF_SUBR_GETMINOR:
                     /*
                      * getminor(): mask the value in tupregs[0] (presumably a
                      * dev_t) down to its minor-number bits.  _LP64 builds use
                      * the 64-bit mask constant.
                      */
3916 #ifdef _LP64
3917                 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
3918 #else
3919                 regs[rd] = tupregs[0].dttk_value & MAXMIN;
3920 #endif
3921                 break;
3922
3923         case DIF_SUBR_DDI_PATHNAME: {
3924                 /*
3925                  * This one is a galactic mess.  We are going to roughly
3926                  * emulate ddi_pathname(), but it's made more complicated
3927                  * by the fact that we (a) want to include the minor name and
3928                  * (b) must proceed iteratively instead of recursively.
                      *
                      * tupregs[0] is the address of the dev_info node and
                      * tupregs[1] the minor number (-1 to omit the minor name).
                      * The path is assembled right to left: 'end' starts at the
                      * last byte of the scratch region and is moved toward
                      * 'start' as each component is prepended.  On success the
                      * result register points at the first character of the
                      * assembled path.
3929                  */
3930                 uintptr_t dest = mstate->dtms_scratch_ptr;
3931                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3932                 char *start = (char *)dest, *end = start + size - 1;
3933                 uintptr_t daddr = tupregs[0].dttk_value;
3934                 int64_t minor = (int64_t)tupregs[1].dttk_value;
3935                 char *s;
3936                 int i, len, depth = 0;
3937
3938                 /*
3939                  * Due to all the pointer jumping we do and context we must
3940                  * rely upon, we just mandate that the user must have kernel
3941                  * read privileges to use this routine.
3942                  */
3943                 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
3944                         *flags |= CPU_DTRACE_KPRIV;
3945                         *illval = daddr;
3946                         regs[rd] = NULL;
                             /*
                              * Stop here: without this break we would go on to
                              * write scratch and issue loads on the address the
                              * caller supplied, despite the failed check.
                              */
                             break;
3947                 }
3948
3949                 if (!DTRACE_INSCRATCH(mstate, size)) {
3950                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3951                         regs[rd] = NULL;
3952                         break;
3953                 }
3954
3955                 *end = '\0';
3956
3957                 /*
3958                  * We want to have a name for the minor.  In order to do this,
3959                  * we need to walk the minor list from the devinfo.  We want
3960                  * to be sure that we don't infinitely walk a circular list,
3961                  * so we check for circularity by sending a scout pointer
3962                  * ahead two elements for every element that we iterate over;
3963                  * if the list is circular, these will ultimately point to the
3964                  * same element.  You may recognize this little trick as the
3965                  * answer to a stupid interview question -- one that always
3966                  * seems to be asked by those who had to have it laboriously
3967                  * explained to them, and who can't even concisely describe
3968                  * the conditions under which one would be forced to resort to
3969                  * this technique.  Needless to say, those conditions are
3970                  * found here -- and probably only here.  Is this the only use
3971                  * of this infamous trick in shipping, production code?  If it
3972                  * isn't, it probably should be...
3973                  */
3974                 if (minor != -1) {
3975                         uintptr_t maddr = dtrace_loadptr(daddr +
3976                             offsetof(struct dev_info, devi_minor));
3977
3978                         uintptr_t next = offsetof(struct ddi_minor_data, next);
3979                         uintptr_t name = offsetof(struct ddi_minor_data,
3980                             d_minor) + offsetof(struct ddi_minor, name);
3981                         uintptr_t dev = offsetof(struct ddi_minor_data,
3982                             d_minor) + offsetof(struct ddi_minor, dev);
3983                         uintptr_t scout;
3984
                             /*
                              * scout is only read inside the loop below, which is
                              * entered only when maddr is non-NULL.
                              */
3985                         if (maddr != NULL)
3986                                 scout = dtrace_loadptr(maddr + next);
3987
3988                         while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3989                                 uint64_t m;
3990 #ifdef _LP64
3991                                 m = dtrace_load64(maddr + dev) & MAXMIN64;
3992 #else
3993                                 m = dtrace_load32(maddr + dev) & MAXMIN;
3994 #endif
3995                                 if (m != minor) {
3996                                         maddr = dtrace_loadptr(maddr + next);
3997
3998                                         if (scout == NULL)
3999                                                 continue;
4000
4001                                         scout = dtrace_loadptr(scout + next);
4002
4003                                         if (scout == NULL)
4004                                                 continue;
4005
4006                                         scout = dtrace_loadptr(scout + next);
4007
4008                                         if (scout == NULL)
4009                                                 continue;
4010
4011                                         if (scout == maddr) {
4012                                                 *flags |= CPU_DTRACE_ILLOP;
4013                                                 break;
4014                                         }
4015
4016                                         continue;
4017                                 }
4018
4019                                 /*
4020                                  * We have the minor data.  Now we need to
4021                                  * copy the minor's name into the end of the
4022                                  * pathname.
4023                                  */
4024                                 s = (char *)dtrace_loadptr(maddr + name);
4025                                 len = dtrace_strlen(s, size);
4026
4027                                 if (*flags & CPU_DTRACE_FAULT)
4028                                         break;
4029
4030                                 if (len != 0) {
4031                                         if ((end -= (len + 1)) < start)
4032                                                 break;
4033
4034                                         *end = ':';
4035                                 }
4036
4037                                 for (i = 1; i <= len; i++)
4038                                         end[i] = dtrace_load8((uintptr_t)s++);
4039                                 break;
4040                         }
4041                 }
4042
                     /*
                      * Walk from the node up through its parents, prepending
                      * "/name" (and "@addr" for nodes that are at least
                      * DS_INITIALIZED) at each level.
                      */
4043                 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4044                         ddi_node_state_t devi_state;
4045
4046                         devi_state = dtrace_load32(daddr +
4047                             offsetof(struct dev_info, devi_node_state));
4048
4049                         if (*flags & CPU_DTRACE_FAULT)
4050                                 break;
4051
4052                         if (devi_state >= DS_INITIALIZED) {
4053                                 s = (char *)dtrace_loadptr(daddr +
4054                                     offsetof(struct dev_info, devi_addr));
4055                                 len = dtrace_strlen(s, size);
4056
4057                                 if (*flags & CPU_DTRACE_FAULT)
4058                                         break;
4059
4060                                 if (len != 0) {
4061                                         if ((end -= (len + 1)) < start)
4062                                                 break;
4063
4064                                         *end = '@';
4065                                 }
4066
4067                                 for (i = 1; i <= len; i++)
4068                                         end[i] = dtrace_load8((uintptr_t)s++);
4069                         }
4070
4071                         /*
4072                          * Now for the node name...
4073                          */
4074                         s = (char *)dtrace_loadptr(daddr +
4075                             offsetof(struct dev_info, devi_node_name));
4076
4077                         daddr = dtrace_loadptr(daddr +
4078                             offsetof(struct dev_info, devi_parent));
4079
4080                         /*
4081                          * If our parent is NULL (that is, if we're the root
4082                          * node), we're going to use the special path
4083                          * "devices".
4084                          */
4085                         if (daddr == NULL)
4086                                 s = "devices";
4087
4088                         len = dtrace_strlen(s, size);
4089                         if (*flags & CPU_DTRACE_FAULT)
4090                                 break;
4091
4092                         if ((end -= (len + 1)) < start)
4093                                 break;
4094
4095                         for (i = 1; i <= len; i++)
4096                                 end[i] = dtrace_load8((uintptr_t)s++);
4097                         *end = '/';
4098
                             /* Bound the walk in case the parent chain loops. */
4099                         if (depth++ > dtrace_devdepth_max) {
4100                                 *flags |= CPU_DTRACE_ILLOP;
4101                                 break;
4102                         }
4103                 }
4104
                     /* A component did not fit in the scratch region. */
4105                 if (end < start)
4106                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4107
                     /*
                      * daddr is NULL once the walk has gone past the root; only
                      * then is the path returned and the scratch consumed.
                      */
4108                 if (daddr == NULL) {
4109                         regs[rd] = (uintptr_t)end;
4110                         mstate->dtms_scratch_ptr += size;
4111                 }
4112
4113                 break;
4114         }
4115
4116         case DIF_SUBR_STRJOIN: {
                     /*
                      * strjoin(): concatenate the strings in tupregs[0] and
                      * tupregs[1] into scratch.  If the result, including its
                      * terminating NUL, does not fit in size bytes,
                      * CPU_DTRACE_NOSCRATCH is raised and NULL is returned.
                      * Otherwise the result register points at the joined
                      * string and only the bytes actually written are consumed
                      * from scratch.
                      */
4117                 char *d = (char *)mstate->dtms_scratch_ptr;
4118                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4119                 uintptr_t s1 = tupregs[0].dttk_value;
4120                 uintptr_t s2 = tupregs[1].dttk_value;
4121                 int i = 0;
4122
4123                 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4124                     !dtrace_strcanload(s2, size, mstate, vstate)) {
4125                         regs[rd] = NULL;
4126                         break;
4127                 }
4128
4129                 if (!DTRACE_INSCRATCH(mstate, size)) {
4130                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4131                         regs[rd] = NULL;
4132                         break;
4133                 }
4134
                     /* Copy the first string, stopping at its NUL. */
4135                 for (;;) {
4136                         if (i >= size) {
4137                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4138                                 regs[rd] = NULL;
4139                                 break;
4140                         }
4141
4142                         if ((d[i++] = dtrace_load8(s1++)) == '\0') {
                                     /*
                                      * Step back over the NUL so that the second
                                      * string overwrites it.
                                      */
4143                                 i--;
4144                                 break;
4145                         }
4146                 }
4147
                     /*
                      * Append the second string, including its NUL.  If the
                      * first loop already ran out of room, i >= size here too
                      * and this loop exits on its first check.
                      */
4148                 for (;;) {
4149                         if (i >= size) {
4150                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4151                                 regs[rd] = NULL;
4152                                 break;
4153                         }
4154
4155                         if ((d[i++] = dtrace_load8(s2++)) == '\0')
4156                                 break;
4157                 }
4158
                     /* i counts the bytes written, terminating NUL included. */
4159                 if (i < size) {
4160                         mstate->dtms_scratch_ptr += i;
4161                         regs[rd] = (uintptr_t)d;
4162                 }
4163
4164                 break;
4165         }
4166
4167         case DIF_SUBR_LLTOSTR: {
                     /*
                      * lltostr(): render the signed 64-bit value in tupregs[0]
                      * as a decimal string.  The digits are produced least
                      * significant first, so the string is built backwards from
                      * the last byte of a 22-byte scratch region; the result
                      * register points at its first character.
                      *
                      * The magnitude is taken in unsigned arithmetic: negating
                      * the most negative int64_t as a signed value overflows,
                      * which would leave a negative value whose remainders do
                      * not map to digit characters.
                      */
4168                 int64_t i = (int64_t)tupregs[0].dttk_value;
4169                 uint64_t val = i < 0 ? -(uint64_t)i : (uint64_t)i;
4170                 uint64_t size = 22;     /* enough room for 2^64 in decimal */
4171                 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4172
4173                 if (!DTRACE_INSCRATCH(mstate, size)) {
4174                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4175                         regs[rd] = NULL;
4176                         break;
4177                 }
4178
4179                 for (*end-- = '\0'; val; val /= 10)
4180                         *end-- = '0' + (val % 10);
4181
                     /* The loop above emits nothing for zero. */
4182                 if (i == 0)
4183                         *end-- = '0';
4184
4185                 if (i < 0)
4186                         *end-- = '-';
4187
                     /* end was moved one byte past the first character written. */
4188                 regs[rd] = (uintptr_t)end + 1;
4189                 mstate->dtms_scratch_ptr += size;
4190                 break;
4191         }
4192
4193         case DIF_SUBR_HTONS:
4194         case DIF_SUBR_NTOHS:
                     /*
                      * htons()/ntohs(): both directions share one implementation.
                      * The argument is truncated to 16 bits; it is returned
                      * unchanged on big-endian builds and byte-swapped otherwise.
                      */
4195 #ifdef _BIG_ENDIAN
4196                 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4197 #else
4198                 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4199 #endif
4200                 break;
4201
4202
4203         case DIF_SUBR_HTONL:
4204         case DIF_SUBR_NTOHL:
                     /*
                      * htonl()/ntohl(): both directions share one implementation.
                      * The argument is truncated to 32 bits; it is returned
                      * unchanged on big-endian builds and byte-swapped otherwise.
                      */
4205 #ifdef _BIG_ENDIAN
4206                 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4207 #else
4208                 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4209 #endif
4210                 break;
4211
4212
4213         case DIF_SUBR_HTONLL:
4214         case DIF_SUBR_NTOHLL:
                     /*
                      * htonll()/ntohll(): both directions share one
                      * implementation.  The 64-bit argument is returned unchanged
                      * on big-endian builds and byte-swapped otherwise.
                      */
4215 #ifdef _BIG_ENDIAN
4216                 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4217 #else
4218                 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4219 #endif
4220                 break;
4221
4222
4223         case DIF_SUBR_DIRNAME:
4224         case DIF_SUBR_BASENAME: {
                     /*
                      * dirname()/basename(): both share a single backwards scan
                      * over the path in tupregs[0].  The scan records the index
                      * of the last and first characters of the final component
                      * (lastbase, firstbase) and of the last character of the
                      * directory part (lastdir); -1 means "not found".  The
                      * range selected for the requested subroutine is then
                      * copied into scratch and returned.
                      */
4225                 char *dest = (char *)mstate->dtms_scratch_ptr;
4226                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4227                 uintptr_t src = tupregs[0].dttk_value;
4228                 int i, j, len = dtrace_strlen((char *)src, size);
4229                 int lastbase = -1, firstbase = -1, lastdir = -1;
4230                 int start, end;
4231
4232                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4233                         regs[rd] = NULL;
4234                         break;
4235                 }
4236
4237                 if (!DTRACE_INSCRATCH(mstate, size)) {
4238                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4239                         regs[rd] = NULL;
4240                         break;
4241                 }
4242
4243                 /*
4244                  * The basename and dirname for a zero-length string is
4245                  * defined to be "."
4246                  */
4247                 if (len == 0) {
4248                         len = 1;
4249                         src = (uintptr_t)".";
4250                 }
4251
4252                 /*
4253                  * Start from the back of the string, moving back toward the
4254                  * front until we see a character that isn't a slash.  That
4255                  * character is the last character in the basename.
4256                  */
4257                 for (i = len - 1; i >= 0; i--) {
4258                         if (dtrace_load8(src + i) != '/')
4259                                 break;
4260                 }
4261
4262                 if (i >= 0)
4263                         lastbase = i;
4264
4265                 /*
4266                  * Starting from the last character in the basename, move
4267                  * towards the front until we find a slash.  The character
4268                  * that we processed immediately before that is the first
4269                  * character in the basename.
4270                  */
4271                 for (; i >= 0; i--) {
4272                         if (dtrace_load8(src + i) == '/')
4273                                 break;
4274                 }
4275
4276                 if (i >= 0)
4277                         firstbase = i + 1;
4278
4279                 /*
4280                  * Now keep going until we find a non-slash character.  That
4281                  * character is the last character in the dirname.
4282                  */
4283                 for (; i >= 0; i--) {
4284                         if (dtrace_load8(src + i) != '/')
4285                                 break;
4286                 }
4287
4288                 if (i >= 0)
4289                         lastdir = i;
4290
4291                 ASSERT(!(lastbase == -1 && firstbase != -1));
4292                 ASSERT(!(firstbase == -1 && lastdir != -1));
4293
4294                 if (lastbase == -1) {
4295                         /*
4296                          * We didn't find a non-slash character.  We know that
4297                          * the length is non-zero, so the whole string must be
4298                          * slashes.  In either the dirname or the basename
4299                          * case, we return '/'.
4300                          */
4301                         ASSERT(firstbase == -1);
4302                         firstbase = lastbase = lastdir = 0;
4303                 }
4304
4305                 if (firstbase == -1) {
4306                         /*
4307                          * The entire string consists only of a basename
4308                          * component.  If we're looking for dirname, we need
4309                          * to change our string to be just "."; if we're
4310                          * looking for a basename, we'll just set the first
4311                          * character of the basename to be 0.
4312                          */
4313                         if (subr == DIF_SUBR_DIRNAME) {
4314                                 ASSERT(lastdir == -1);
4315                                 src = (uintptr_t)".";
4316                                 lastdir = 0;
4317                         } else {
4318                                 firstbase = 0;
4319                         }
4320                 }
4321
4322                 if (subr == DIF_SUBR_DIRNAME) {
4323                         if (lastdir == -1) {
4324                                 /*
4325                                  * We know that we have a slash in the name --
4326                                  * or lastdir would be set to 0, above.  And
4327                                  * because lastdir is -1, we know that this
4328                                  * slash must be the first character.  (That
4329                                  * is, the full string must be of the form
4330                                  * "/basename".)  In this case, the last
4331                                  * character of the directory name is 0.
4332                                  */
4333                                 lastdir = 0;
4334                         }
4335
4336                         start = 0;
4337                         end = lastdir;
4338                 } else {
4339                         ASSERT(subr == DIF_SUBR_BASENAME);
4340                         ASSERT(firstbase != -1 && lastbase != -1);
4341                         start = firstbase;
4342                         end = lastbase;
4343                 }
4344
                     /*
                      * Copy the inclusive range [start, end]; the j < size - 1
                      * bound keeps one byte for the terminating NUL.
                      */
4345                 for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
4346                         dest[j] = dtrace_load8(src + i);
4347
4348                 dest[j] = '\0';
4349                 regs[rd] = (uintptr_t)dest;
4350                 mstate->dtms_scratch_ptr += size;
4351                 break;
4352         }
4353
4354         case DIF_SUBR_CLEANPATH: {
                     /*
                      * cleanpath(): copy the path in tupregs[0] into scratch
                      * while collapsing runs of slashes, dropping "/./"
                      * components, and backing the output up over the previous
                      * component when a "/../" or trailing "/.." is seen.  i
                      * indexes the source and j the destination.
                      */
4355                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4356                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4357                 uintptr_t src = tupregs[0].dttk_value;
4358                 int i = 0, j = 0;
4359
4360                 if (!dtrace_strcanload(src, size, mstate, vstate)) {
4361                         regs[rd] = NULL;
4362                         break;
4363                 }
4364
4365                 if (!DTRACE_INSCRATCH(mstate, size)) {
4366                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4367                         regs[rd] = NULL;
4368                         break;
4369                 }
4370
4371                 /*
4372                  * Move forward, loading each character.
4373                  */
4374                 do {
4375                         c = dtrace_load8(src + i++);
4376 next:
                             /*
                              * A single pass stores at most four characters, so
                              * stop while there is still room for them plus the
                              * terminating NUL.
                              */
4377                         if (j + 5 >= size)      /* 5 = strlen("/..c\0") */
4378                                 break;
4379
4380                         if (c != '/') {
4381                                 dest[j++] = c;
4382                                 continue;
4383                         }
4384
4385                         c = dtrace_load8(src + i++);
4386
4387                         if (c == '/') {
4388                                 /*
4389                                  * We have two slashes -- we can just advance
4390                                  * to the next character.
4391                                  */
4392                                 goto next;
4393                         }
4394
4395                         if (c != '.') {
4396                                 /*
4397                                  * This is not "." and it's not ".." -- we can
4398                                  * just store the "/" and this character and
4399                                  * drive on.
4400                                  */
4401                                 dest[j++] = '/';
4402                                 dest[j++] = c;
4403                                 continue;
4404                         }
4405
4406                         c = dtrace_load8(src + i++);
4407
4408                         if (c == '/') {
4409                                 /*
4410                                  * This is a "/./" component.  We're not going
4411                                  * to store anything in the destination buffer;
4412                                  * we're just going to go to the next component.
4413                                  */
4414                                 goto next;
4415                         }
4416
4417                         if (c != '.') {
4418                                 /*
4419                                  * This is not ".." -- we can just store the
4420                                  * "/." and this character and continue
4421                                  * processing.
4422                                  */
4423                                 dest[j++] = '/';
4424                                 dest[j++] = '.';
4425                                 dest[j++] = c;
4426                                 continue;
4427                         }
4428
4429                         c = dtrace_load8(src + i++);
4430
4431                         if (c != '/' && c != '\0') {
4432                                 /*
4433                                  * This is not ".." -- it's "..[mumble]".
4434                                  * We'll store the "/.." and this character
4435                                  * and continue processing.
4436                                  */
4437                                 dest[j++] = '/';
4438                                 dest[j++] = '.';
4439                                 dest[j++] = '.';
4440                                 dest[j++] = c;
4441                                 continue;
4442                         }
4443
4444                         /*
4445                          * This is "/../" or "/..\0".  We need to back up
4446                          * our destination pointer until we find a "/".
                              * The source index is stepped back as well, so that
                              * the character just read is loaded again on the
                              * next pass.
4447                          */
4448                         i--;
4449                         while (j != 0 && dest[--j] != '/')
4450                                 continue;
4451
4452                         if (c == '\0')
4453                                 dest[++j] = '/';
4454                 } while (c != '\0');
4455
4456                 dest[j] = '\0';
4457                 regs[rd] = (uintptr_t)dest;
4458                 mstate->dtms_scratch_ptr += size;
4459                 break;
4460         }
4461
4462         case DIF_SUBR_INET_NTOA:
4463         case DIF_SUBR_INET_NTOA6:
4464         case DIF_SUBR_INET_NTOP: {
4465                 size_t size;
4466                 int af, argi, i;
4467                 char *base, *end;
4468
4469                 if (subr == DIF_SUBR_INET_NTOP) {
4470                         af = (int)tupregs[0].dttk_value;
4471                         argi = 1;
4472                 } else {
4473                         af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4474                         argi = 0;
4475                 }
4476
4477                 if (af == AF_INET) {
4478                         ipaddr_t ip4;
4479                         uint8_t *ptr8, val;
4480
4481                         /*
4482                          * Safely load the IPv4 address.
4483                          */
4484                         ip4 = dtrace_load32(tupregs[argi].dttk_value);
4485
4486                         /*
4487                          * Check an IPv4 string will fit in scratch.
4488                          */
4489                         size = INET_ADDRSTRLEN;
4490                         if (!DTRACE_INSCRATCH(mstate, size)) {
4491                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4492                                 regs[rd] = NULL;
4493                                 break;
4494                         }
4495                         base = (char *)mstate->dtms_scratch_ptr;
4496                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
4497
4498                         /*
4499                          * Stringify as a dotted decimal quad.
4500                          */
4501                         *end-- = '\0';
4502                         ptr8 = (uint8_t *)&ip4;
4503                         for (i = 3; i >= 0; i--) {
4504                                 val = ptr8[i];
4505
4506                                 if (val == 0) {
4507                                         *end-- = '0';
4508                                 } else {
4509                                         for (; val; val /= 10) {
4510                                                 *end-- = '0' + (val % 10);
4511                                         }
4512                                 }
4513
4514                                 if (i > 0)
4515                                         *end-- = '.';
4516                         }
4517                         ASSERT(end + 1 >= base);
4518
4519                 } else if (af == AF_INET6) {
4520                         struct in6_addr ip6;
4521                         int firstzero, tryzero, numzero, v6end;
4522                         uint16_t val;
4523                         const char digits[] = "0123456789abcdef";
4524
4525                         /*
4526                          * Stringify using RFC 1884 convention 2 - 16 bit
4527                          * hexadecimal values with a zero-run compression.
4528                          * Lower case hexadecimal digits are used.
4529                          *      e.g., fe80::214:4fff:fe0b:76c8.
4530                          * The IPv4 embedded form is returned for inet_ntop,
4531                          * just the IPv4 string is returned for inet_ntoa6.
4532                          */
4533
4534                         /*
4535                          * Safely load the IPv6 address.
4536                          */
4537                         dtrace_bcopy(
4538                             (void *)(uintptr_t)tupregs[argi].dttk_value,
4539                             (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4540
4541                         /*
4542                          * Check an IPv6 string will fit in scratch.
4543                          */
4544                         size = INET6_ADDRSTRLEN;
4545                         if (!DTRACE_INSCRATCH(mstate, size)) {
4546                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4547                                 regs[rd] = NULL;
4548                                 break;
4549                         }
4550                         base = (char *)mstate->dtms_scratch_ptr;
4551                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
4552                         *end-- = '\0';
4553
4554                         /*
4555                          * Find the longest run of 16 bit zero values
4556                          * for the single allowed zero compression - "::".
4557                          */
4558                         firstzero = -1;
4559                         tryzero = -1;
4560                         numzero = 1;
4561                         for (i = 0; i < sizeof (struct in6_addr); i++) {
4562                                 if (ip6._S6_un._S6_u8[i] == 0 &&
4563                                     tryzero == -1 && i % 2 == 0) {
4564                                         tryzero = i;
4565                                         continue;
4566                                 }
4567
4568                                 if (tryzero != -1 &&
4569                                     (ip6._S6_un._S6_u8[i] != 0 ||
4570                                     i == sizeof (struct in6_addr) - 1)) {
4571
4572                                         if (i - tryzero <= numzero) {
4573                                                 tryzero = -1;
4574                                                 continue;
4575                                         }
4576
4577                                         firstzero = tryzero;
4578                                         numzero = i - i % 2 - tryzero;
4579                                         tryzero = -1;
4580
4581                                         if (ip6._S6_un._S6_u8[i] == 0 &&
4582                                             i == sizeof (struct in6_addr) - 1)
4583                                                 numzero += 2;
4584                                 }
4585                         }
4586                         ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
4587
4588                         /*
4589                          * Check for an IPv4 embedded address.
4590                          */
4591                         v6end = sizeof (struct in6_addr) - 2;
4592                         if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4593                             IN6_IS_ADDR_V4COMPAT(&ip6)) {
4594                                 for (i = sizeof (struct in6_addr) - 1;
4595                                     i >= DTRACE_V4MAPPED_OFFSET; i--) {
4596                                         ASSERT(end >= base);
4597
4598                                         val = ip6._S6_un._S6_u8[i];
4599
4600                                         if (val == 0) {
4601                                                 *end-- = '0';
4602                                         } else {
4603                                                 for (; val; val /= 10) {
4604                                                         *end-- = '0' + val % 10;
4605                                                 }
4606                                         }
4607
4608                                         if (i > DTRACE_V4MAPPED_OFFSET)
4609                                                 *end-- = '.';
4610                                 }
4611
4612                                 if (subr == DIF_SUBR_INET_NTOA6)
4613                                         goto inetout;
4614
4615                                 /*
4616                                  * Set v6end to skip the IPv4 address that
4617                                  * we have already stringified.
4618                                  */
4619                                 v6end = 10;
4620                         }
4621
4622                         /*
4623                          * Build the IPv6 string by working through the
4624                          * address in reverse.
4625                          */
4626                         for (i = v6end; i >= 0; i -= 2) {
4627                                 ASSERT(end >= base);
4628
4629                                 if (i == firstzero + numzero - 2) {
4630                                         *end-- = ':';
4631                                         *end-- = ':';
4632                                         i -= numzero - 2;
4633                                         continue;
4634                                 }
4635
4636                                 if (i < 14 && i != firstzero - 2)
4637                                         *end-- = ':';
4638
4639                                 val = (ip6._S6_un._S6_u8[i] << 8) +
4640                                     ip6._S6_un._S6_u8[i + 1];
4641
4642                                 if (val == 0) {
4643                                         *end-- = '0';
4644                                 } else {
4645                                         for (; val; val /= 16) {
4646                                                 *end-- = digits[val % 16];
4647                                         }
4648                                 }
4649                         }
4650                         ASSERT(end + 1 >= base);
4651
4652                 } else {
4653                         /*
4654                          * The user didn't use AF_INET or AF_INET6.
4655                          */
4656                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4657                         regs[rd] = NULL;
4658                         break;
4659                 }
4660
4661 inetout:        regs[rd] = (uintptr_t)end + 1;
4662                 mstate->dtms_scratch_ptr += size;
4663                 break;
4664         }
4665
4666         }
4667 }
4668
4669 /*
4670  * Emulate the execution of DTrace IR instructions specified by the given
4671  * DIF object.  This function is deliberately void of assertions as all of
4672  * the necessary checks are handled by a call to dtrace_difo_validate().
4673  */
4674 static uint64_t
4675 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4676     dtrace_vstate_t *vstate, dtrace_state_t *state)
4677 {
4678         const dif_instr_t *text = difo->dtdo_buf;
4679         const uint_t textlen = difo->dtdo_len;
4680         const char *strtab = difo->dtdo_strtab;
4681         const uint64_t *inttab = difo->dtdo_inttab;
4682
4683         uint64_t rval = 0;
4684         dtrace_statvar_t *svar;
4685         dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4686         dtrace_difv_t *v;
4687         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4688         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4689
4690         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4691         uint64_t regs[DIF_DIR_NREGS];
4692         uint64_t *tmp;
4693
4694         uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4695         int64_t cc_r;
4696         uint_t pc = 0, id, opc;
4697         uint8_t ttop = 0;
4698         dif_instr_t instr;
4699         uint_t r1, r2, rd;
4700
4701         /*
4702          * We stash the current DIF object into the machine state: we need it
4703          * for subsequent access checking.
4704          */
4705         mstate->dtms_difo = difo;
4706
4707         regs[DIF_REG_R0] = 0;           /* %r0 is fixed at zero */
4708
4709         while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4710                 opc = pc;
4711
4712                 instr = text[pc++];
4713                 r1 = DIF_INSTR_R1(instr);
4714                 r2 = DIF_INSTR_R2(instr);
4715                 rd = DIF_INSTR_RD(instr);
4716
4717                 switch (DIF_INSTR_OP(instr)) {
4718                 case DIF_OP_OR:
4719                         regs[rd] = regs[r1] | regs[r2];
4720                         break;
4721                 case DIF_OP_XOR:
4722                         regs[rd] = regs[r1] ^ regs[r2];
4723                         break;
4724                 case DIF_OP_AND:
4725                         regs[rd] = regs[r1] & regs[r2];
4726                         break;
4727                 case DIF_OP_SLL:
4728                         regs[rd] = regs[r1] << regs[r2];
4729                         break;
4730                 case DIF_OP_SRL:
4731                         regs[rd] = regs[r1] >> regs[r2];
4732                         break;
4733                 case DIF_OP_SUB:
4734                         regs[rd] = regs[r1] - regs[r2];
4735                         break;
4736                 case DIF_OP_ADD:
4737                         regs[rd] = regs[r1] + regs[r2];
4738                         break;
4739                 case DIF_OP_MUL:
4740                         regs[rd] = regs[r1] * regs[r2];
4741                         break;
4742                 case DIF_OP_SDIV:
4743                         if (regs[r2] == 0) {
4744                                 regs[rd] = 0;
4745                                 *flags |= CPU_DTRACE_DIVZERO;
4746                         } else {
4747                                 regs[rd] = (int64_t)regs[r1] /
4748                                     (int64_t)regs[r2];
4749                         }
4750                         break;
4751
4752                 case DIF_OP_UDIV:
4753                         if (regs[r2] == 0) {
4754                                 regs[rd] = 0;
4755                                 *flags |= CPU_DTRACE_DIVZERO;
4756                         } else {
4757                                 regs[rd] = regs[r1] / regs[r2];
4758                         }
4759                         break;
4760
4761                 case DIF_OP_SREM:
4762                         if (regs[r2] == 0) {
4763                                 regs[rd] = 0;
4764                                 *flags |= CPU_DTRACE_DIVZERO;
4765                         } else {
4766                                 regs[rd] = (int64_t)regs[r1] %
4767                                     (int64_t)regs[r2];
4768                         }
4769                         break;
4770
4771                 case DIF_OP_UREM:
4772                         if (regs[r2] == 0) {
4773                                 regs[rd] = 0;
4774                                 *flags |= CPU_DTRACE_DIVZERO;
4775                         } else {
4776                                 regs[rd] = regs[r1] % regs[r2];
4777                         }
4778                         break;
4779
4780                 case DIF_OP_NOT:
4781                         regs[rd] = ~regs[r1];
4782                         break;
4783                 case DIF_OP_MOV:
4784                         regs[rd] = regs[r1];
4785                         break;
4786                 case DIF_OP_CMP:
4787                         cc_r = regs[r1] - regs[r2];
4788                         cc_n = cc_r < 0;
4789                         cc_z = cc_r == 0;
4790                         cc_v = 0;
4791                         cc_c = regs[r1] < regs[r2];
4792                         break;
4793                 case DIF_OP_TST:
4794                         cc_n = cc_v = cc_c = 0;
4795                         cc_z = regs[r1] == 0;
4796                         break;
4797                 case DIF_OP_BA:
4798                         pc = DIF_INSTR_LABEL(instr);
4799                         break;
4800                 case DIF_OP_BE:
4801                         if (cc_z)
4802                                 pc = DIF_INSTR_LABEL(instr);
4803                         break;
4804                 case DIF_OP_BNE:
4805                         if (cc_z == 0)
4806                                 pc = DIF_INSTR_LABEL(instr);
4807                         break;
4808                 case DIF_OP_BG:
4809                         if ((cc_z | (cc_n ^ cc_v)) == 0)
4810                                 pc = DIF_INSTR_LABEL(instr);
4811                         break;
4812                 case DIF_OP_BGU:
4813                         if ((cc_c | cc_z) == 0)
4814                                 pc = DIF_INSTR_LABEL(instr);
4815                         break;
4816                 case DIF_OP_BGE:
4817                         if ((cc_n ^ cc_v) == 0)
4818                                 pc = DIF_INSTR_LABEL(instr);
4819                         break;
4820                 case DIF_OP_BGEU:
4821                         if (cc_c == 0)
4822                                 pc = DIF_INSTR_LABEL(instr);
4823                         break;
4824                 case DIF_OP_BL:
4825                         if (cc_n ^ cc_v)
4826                                 pc = DIF_INSTR_LABEL(instr);
4827                         break;
4828                 case DIF_OP_BLU:
4829                         if (cc_c)
4830                                 pc = DIF_INSTR_LABEL(instr);
4831                         break;
4832                 case DIF_OP_BLE:
4833                         if (cc_z | (cc_n ^ cc_v))
4834                                 pc = DIF_INSTR_LABEL(instr);
4835                         break;
4836                 case DIF_OP_BLEU:
4837                         if (cc_c | cc_z)
4838                                 pc = DIF_INSTR_LABEL(instr);
4839                         break;
4840                 case DIF_OP_RLDSB:
4841                         if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4842                                 *flags |= CPU_DTRACE_KPRIV;
4843                                 *illval = regs[r1];
4844                                 break;
4845                         }
4846                         /*FALLTHROUGH*/
4847                 case DIF_OP_LDSB:
4848                         regs[rd] = (int8_t)dtrace_load8(regs[r1]);
4849                         break;
4850                 case DIF_OP_RLDSH:
4851                         if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4852                                 *flags |= CPU_DTRACE_KPRIV;
4853                                 *illval = regs[r1];
4854                                 break;
4855                         }
4856                         /*FALLTHROUGH*/
4857                 case DIF_OP_LDSH:
4858                         regs[rd] = (int16_t)dtrace_load16(regs[r1]);
4859                         break;
4860                 case DIF_OP_RLDSW:
4861                         if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4862                                 *flags |= CPU_DTRACE_KPRIV;
4863                                 *illval = regs[r1];
4864                                 break;
4865                         }
4866                         /*FALLTHROUGH*/
4867                 case DIF_OP_LDSW:
4868                         regs[rd] = (int32_t)dtrace_load32(regs[r1]);
4869                         break;
4870                 case DIF_OP_RLDUB:
4871                         if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4872                                 *flags |= CPU_DTRACE_KPRIV;
4873                                 *illval = regs[r1];
4874                                 break;
4875                         }
4876                         /*FALLTHROUGH*/
4877                 case DIF_OP_LDUB:
4878                         regs[rd] = dtrace_load8(regs[r1]);
4879                         break;
4880                 case DIF_OP_RLDUH:
4881                         if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4882                                 *flags |= CPU_DTRACE_KPRIV;
4883                                 *illval = regs[r1];
4884                                 break;
4885                         }
4886                         /*FALLTHROUGH*/
4887                 case DIF_OP_LDUH:
4888                         regs[rd] = dtrace_load16(regs[r1]);
4889                         break;
4890                 case DIF_OP_RLDUW:
4891                         if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4892                                 *flags |= CPU_DTRACE_KPRIV;
4893                                 *illval = regs[r1];
4894                                 break;
4895                         }
4896                         /*FALLTHROUGH*/
4897                 case DIF_OP_LDUW:
4898                         regs[rd] = dtrace_load32(regs[r1]);
4899                         break;
4900                 case DIF_OP_RLDX:
4901                         if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
4902                                 *flags |= CPU_DTRACE_KPRIV;
4903                                 *illval = regs[r1];
4904                                 break;
4905                         }
4906                         /*FALLTHROUGH*/
4907                 case DIF_OP_LDX:
4908                         regs[rd] = dtrace_load64(regs[r1]);
4909                         break;
4910                 case DIF_OP_ULDSB:
4911                         regs[rd] = (int8_t)
4912                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4913                         break;
4914                 case DIF_OP_ULDSH:
4915                         regs[rd] = (int16_t)
4916                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4917                         break;
4918                 case DIF_OP_ULDSW:
4919                         regs[rd] = (int32_t)
4920                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4921                         break;
4922                 case DIF_OP_ULDUB:
4923                         regs[rd] =
4924                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
4925                         break;
4926                 case DIF_OP_ULDUH:
4927                         regs[rd] =
4928                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
4929                         break;
4930                 case DIF_OP_ULDUW:
4931                         regs[rd] =
4932                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
4933                         break;
4934                 case DIF_OP_ULDX:
4935                         regs[rd] =
4936                             dtrace_fuword64((void *)(uintptr_t)regs[r1]);
4937                         break;
4938                 case DIF_OP_RET:
4939                         rval = regs[rd];
4940                         pc = textlen;
4941                         break;
4942                 case DIF_OP_NOP:
4943                         break;
4944                 case DIF_OP_SETX:
4945                         regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
4946                         break;
4947                 case DIF_OP_SETS:
4948                         regs[rd] = (uint64_t)(uintptr_t)
4949                             (strtab + DIF_INSTR_STRING(instr));
4950                         break;
4951                 case DIF_OP_SCMP: {
4952                         size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
4953                         uintptr_t s1 = regs[r1];
4954                         uintptr_t s2 = regs[r2];
4955
4956                         if (s1 != NULL &&
4957                             !dtrace_strcanload(s1, sz, mstate, vstate))
4958                                 break;
4959                         if (s2 != NULL &&
4960                             !dtrace_strcanload(s2, sz, mstate, vstate))
4961                                 break;
4962
4963                         cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
4964
4965                         cc_n = cc_r < 0;
4966                         cc_z = cc_r == 0;
4967                         cc_v = cc_c = 0;
4968                         break;
4969                 }
4970                 case DIF_OP_LDGA:
4971                         regs[rd] = dtrace_dif_variable(mstate, state,
4972                             r1, regs[r2]);
4973                         break;
4974                 case DIF_OP_LDGS:
4975                         id = DIF_INSTR_VAR(instr);
4976
4977                         if (id >= DIF_VAR_OTHER_UBASE) {
4978                                 uintptr_t a;
4979
4980                                 id -= DIF_VAR_OTHER_UBASE;
4981                                 svar = vstate->dtvs_globals[id];
4982                                 ASSERT(svar != NULL);
4983                                 v = &svar->dtsv_var;
4984
4985                                 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
4986                                         regs[rd] = svar->dtsv_data;
4987                                         break;
4988                                 }
4989
4990                                 a = (uintptr_t)svar->dtsv_data;
4991
4992                                 if (*(uint8_t *)a == UINT8_MAX) {
4993                                         /*
4994                                          * If the 0th byte is set to UINT8_MAX
4995                                          * then this is to be treated as a
4996                                          * reference to a NULL variable.
4997                                          */
4998                                         regs[rd] = NULL;
4999                                 } else {
5000                                         regs[rd] = a + sizeof (uint64_t);
5001                                 }
5002
5003                                 break;
5004                         }
5005
5006                         regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5007                         break;
5008
5009                 case DIF_OP_STGS:
5010                         id = DIF_INSTR_VAR(instr);
5011
5012                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5013                         id -= DIF_VAR_OTHER_UBASE;
5014
5015                         svar = vstate->dtvs_globals[id];
5016                         ASSERT(svar != NULL);
5017                         v = &svar->dtsv_var;
5018
5019                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5020                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5021
5022                                 ASSERT(a != NULL);
5023                                 ASSERT(svar->dtsv_size != 0);
5024
5025                                 if (regs[rd] == NULL) {
5026                                         *(uint8_t *)a = UINT8_MAX;
5027                                         break;
5028                                 } else {
5029                                         *(uint8_t *)a = 0;
5030                                         a += sizeof (uint64_t);
5031                                 }
5032                                 if (!dtrace_vcanload(
5033                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5034                                     mstate, vstate))
5035                                         break;
5036
5037                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5038                                     (void *)a, &v->dtdv_type);
5039                                 break;
5040                         }
5041
5042                         svar->dtsv_data = regs[rd];
5043                         break;
5044
5045                 case DIF_OP_LDTA:
5046                         /*
5047                          * There are no DTrace built-in thread-local arrays at
5048                          * present.  This opcode is saved for future work.
5049                          */
5050                         *flags |= CPU_DTRACE_ILLOP;
5051                         regs[rd] = 0;
5052                         break;
5053
5054                 case DIF_OP_LDLS:
5055                         id = DIF_INSTR_VAR(instr);
5056
5057                         if (id < DIF_VAR_OTHER_UBASE) {
5058                                 /*
5059                                  * For now, this has no meaning.
5060                                  */
5061                                 regs[rd] = 0;
5062                                 break;
5063                         }
5064
5065                         id -= DIF_VAR_OTHER_UBASE;
5066
5067                         ASSERT(id < vstate->dtvs_nlocals);
5068                         ASSERT(vstate->dtvs_locals != NULL);
5069
5070                         svar = vstate->dtvs_locals[id];
5071                         ASSERT(svar != NULL);
5072                         v = &svar->dtsv_var;
5073
5074                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5075                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5076                                 size_t sz = v->dtdv_type.dtdt_size;
5077
5078                                 sz += sizeof (uint64_t);
5079                                 ASSERT(svar->dtsv_size == NCPU * sz);
5080                                 a += CPU->cpu_id * sz;
5081
5082                                 if (*(uint8_t *)a == UINT8_MAX) {
5083                                         /*
5084                                          * If the 0th byte is set to UINT8_MAX
5085                                          * then this is to be treated as a
5086                                          * reference to a NULL variable.
5087                                          */
5088                                         regs[rd] = NULL;
5089                                 } else {
5090                                         regs[rd] = a + sizeof (uint64_t);
5091                                 }
5092
5093                                 break;
5094                         }
5095
5096                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5097                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5098                         regs[rd] = tmp[CPU->cpu_id];
5099                         break;
5100
5101                 case DIF_OP_STLS:
5102                         id = DIF_INSTR_VAR(instr);
5103
5104                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5105                         id -= DIF_VAR_OTHER_UBASE;
5106                         ASSERT(id < vstate->dtvs_nlocals);
5107
5108                         ASSERT(vstate->dtvs_locals != NULL);
5109                         svar = vstate->dtvs_locals[id];
5110                         ASSERT(svar != NULL);
5111                         v = &svar->dtsv_var;
5112
5113                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5114                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5115                                 size_t sz = v->dtdv_type.dtdt_size;
5116
5117                                 sz += sizeof (uint64_t);
5118                                 ASSERT(svar->dtsv_size == NCPU * sz);
5119                                 a += CPU->cpu_id * sz;
5120
5121                                 if (regs[rd] == NULL) {
5122                                         *(uint8_t *)a = UINT8_MAX;
5123                                         break;
5124                                 } else {
5125                                         *(uint8_t *)a = 0;
5126                                         a += sizeof (uint64_t);
5127                                 }
5128
5129                                 if (!dtrace_vcanload(
5130                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5131                                     mstate, vstate))
5132                                         break;
5133
5134                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5135                                     (void *)a, &v->dtdv_type);
5136                                 break;
5137                         }
5138
5139                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
5140                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5141                         tmp[CPU->cpu_id] = regs[rd];
5142                         break;
5143
5144                 case DIF_OP_LDTS: {
5145                         dtrace_dynvar_t *dvar;
5146                         dtrace_key_t *key;
5147
5148                         id = DIF_INSTR_VAR(instr);
5149                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5150                         id -= DIF_VAR_OTHER_UBASE;
5151                         v = &vstate->dtvs_tlocals[id];
5152
5153                         key = &tupregs[DIF_DTR_NREGS];
5154                         key[0].dttk_value = (uint64_t)id;
5155                         key[0].dttk_size = 0;
5156                         DTRACE_TLS_THRKEY(key[1].dttk_value);
5157                         key[1].dttk_size = 0;
5158
5159                         dvar = dtrace_dynvar(dstate, 2, key,
5160                             sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5161                             mstate, vstate);
5162
5163                         if (dvar == NULL) {
5164                                 regs[rd] = 0;
5165                                 break;
5166                         }
5167
5168                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5169                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5170                         } else {
5171                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5172                         }
5173
5174                         break;
5175                 }
5176
5177                 case DIF_OP_STTS: {
5178                         dtrace_dynvar_t *dvar;
5179                         dtrace_key_t *key;
5180
5181                         id = DIF_INSTR_VAR(instr);
5182                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5183                         id -= DIF_VAR_OTHER_UBASE;
5184
5185                         key = &tupregs[DIF_DTR_NREGS];
5186                         key[0].dttk_value = (uint64_t)id;
5187                         key[0].dttk_size = 0;
5188                         DTRACE_TLS_THRKEY(key[1].dttk_value);
5189                         key[1].dttk_size = 0;
5190                         v = &vstate->dtvs_tlocals[id];
5191
5192                         dvar = dtrace_dynvar(dstate, 2, key,
5193                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5194                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5195                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
5196                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5197
5198                         /*
5199                          * Given that we're storing to thread-local data,
5200                          * we need to flush our predicate cache.
5201                          */
5202                         curthread->t_predcache = NULL;
5203
5204                         if (dvar == NULL)
5205                                 break;
5206
5207                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5208                                 if (!dtrace_vcanload(
5209                                     (void *)(uintptr_t)regs[rd],
5210                                     &v->dtdv_type, mstate, vstate))
5211                                         break;
5212
5213                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5214                                     dvar->dtdv_data, &v->dtdv_type);
5215                         } else {
5216                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5217                         }
5218
5219                         break;
5220                 }
5221
5222                 case DIF_OP_SRA:
5223                         regs[rd] = (int64_t)regs[r1] >> regs[r2];
5224                         break;
5225
5226                 case DIF_OP_CALL:
5227                         dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5228                             regs, tupregs, ttop, mstate, state);
5229                         break;
5230
5231                 case DIF_OP_PUSHTR:
5232                         if (ttop == DIF_DTR_NREGS) {
5233                                 *flags |= CPU_DTRACE_TUPOFLOW;
5234                                 break;
5235                         }
5236
5237                         if (r1 == DIF_TYPE_STRING) {
5238                                 /*
5239                                  * If this is a string type and the size is 0,
5240                                  * we'll use the system-wide default string
5241                                  * size.  Note that we are _not_ looking at
5242                                  * the value of the DTRACEOPT_STRSIZE option;
5243                                  * had this been set, we would expect to have
5244                                  * a non-zero size value in the "pushtr".
5245                                  */
5246                                 tupregs[ttop].dttk_size =
5247                                     dtrace_strlen((char *)(uintptr_t)regs[rd],
5248                                     regs[r2] ? regs[r2] :
5249                                     dtrace_strsize_default) + 1;
5250                         } else {
5251                                 tupregs[ttop].dttk_size = regs[r2];
5252                         }
5253
5254                         tupregs[ttop++].dttk_value = regs[rd];
5255                         break;
5256
5257                 case DIF_OP_PUSHTV:
5258                         if (ttop == DIF_DTR_NREGS) {
5259                                 *flags |= CPU_DTRACE_TUPOFLOW;
5260                                 break;
5261                         }
5262
5263                         tupregs[ttop].dttk_value = regs[rd];
5264                         tupregs[ttop++].dttk_size = 0;
5265                         break;
5266
5267                 case DIF_OP_POPTS:
5268                         if (ttop != 0)
5269                                 ttop--;
5270                         break;
5271
5272                 case DIF_OP_FLUSHTS:
5273                         ttop = 0;
5274                         break;
5275
5276                 case DIF_OP_LDGAA:
5277                 case DIF_OP_LDTAA: {
5278                         dtrace_dynvar_t *dvar;
5279                         dtrace_key_t *key = tupregs;
5280                         uint_t nkeys = ttop;
5281
5282                         id = DIF_INSTR_VAR(instr);
5283                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5284                         id -= DIF_VAR_OTHER_UBASE;
5285
5286                         key[nkeys].dttk_value = (uint64_t)id;
5287                         key[nkeys++].dttk_size = 0;
5288
5289                         if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5290                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5291                                 key[nkeys++].dttk_size = 0;
5292                                 v = &vstate->dtvs_tlocals[id];
5293                         } else {
5294                                 v = &vstate->dtvs_globals[id]->dtsv_var;
5295                         }
5296
5297                         dvar = dtrace_dynvar(dstate, nkeys, key,
5298                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5299                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5300                             DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5301
5302                         if (dvar == NULL) {
5303                                 regs[rd] = 0;
5304                                 break;
5305                         }
5306
5307                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5308                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5309                         } else {
5310                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5311                         }
5312
5313                         break;
5314                 }
5315
5316                 case DIF_OP_STGAA:
5317                 case DIF_OP_STTAA: {
5318                         dtrace_dynvar_t *dvar;
5319                         dtrace_key_t *key = tupregs;
5320                         uint_t nkeys = ttop;
5321
5322                         id = DIF_INSTR_VAR(instr);
5323                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5324                         id -= DIF_VAR_OTHER_UBASE;
5325
5326                         key[nkeys].dttk_value = (uint64_t)id;
5327                         key[nkeys++].dttk_size = 0;
5328
5329                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5330                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5331                                 key[nkeys++].dttk_size = 0;
5332                                 v = &vstate->dtvs_tlocals[id];
5333                         } else {
5334                                 v = &vstate->dtvs_globals[id]->dtsv_var;
5335                         }
5336
5337                         dvar = dtrace_dynvar(dstate, nkeys, key,
5338                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5339                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5340                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
5341                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5342
5343                         if (dvar == NULL)
5344                                 break;
5345
5346                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5347                                 if (!dtrace_vcanload(
5348                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5349                                     mstate, vstate))
5350                                         break;
5351
5352                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5353                                     dvar->dtdv_data, &v->dtdv_type);
5354                         } else {
5355                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5356                         }
5357
5358                         break;
5359                 }
5360
5361                 case DIF_OP_ALLOCS: {
5362                         uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5363                         size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5364
5365                         /*
5366                          * Rounding up the user allocation size could have
5367                          * overflowed large, bogus allocations (like -1ULL) to
5368                          * 0.
5369                          */
5370                         if (size < regs[r1] ||
5371                             !DTRACE_INSCRATCH(mstate, size)) {
5372                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5373                                 regs[rd] = NULL;
5374                                 break;
5375                         }
5376
5377                         dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5378                         mstate->dtms_scratch_ptr += size;
5379                         regs[rd] = ptr;
5380                         break;
5381                 }
5382
5383                 case DIF_OP_COPYS:
5384                         if (!dtrace_canstore(regs[rd], regs[r2],
5385                             mstate, vstate)) {
5386                                 *flags |= CPU_DTRACE_BADADDR;
5387                                 *illval = regs[rd];
5388                                 break;
5389                         }
5390
5391                         if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5392                                 break;
5393
5394                         dtrace_bcopy((void *)(uintptr_t)regs[r1],
5395                             (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5396                         break;
5397
5398                 case DIF_OP_STB:
5399                         if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5400                                 *flags |= CPU_DTRACE_BADADDR;
5401                                 *illval = regs[rd];
5402                                 break;
5403                         }
5404                         *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5405                         break;
5406
5407                 case DIF_OP_STH:
5408                         if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5409                                 *flags |= CPU_DTRACE_BADADDR;
5410                                 *illval = regs[rd];
5411                                 break;
5412                         }
5413                         if (regs[rd] & 1) {
5414                                 *flags |= CPU_DTRACE_BADALIGN;
5415                                 *illval = regs[rd];
5416                                 break;
5417                         }
5418                         *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5419                         break;
5420
5421                 case DIF_OP_STW:
5422                         if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5423                                 *flags |= CPU_DTRACE_BADADDR;
5424                                 *illval = regs[rd];
5425                                 break;
5426                         }
5427                         if (regs[rd] & 3) {
5428                                 *flags |= CPU_DTRACE_BADALIGN;
5429                                 *illval = regs[rd];
5430                                 break;
5431                         }
5432                         *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5433                         break;
5434
5435                 case DIF_OP_STX:
5436                         if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5437                                 *flags |= CPU_DTRACE_BADADDR;
5438                                 *illval = regs[rd];
5439                                 break;
5440                         }
5441                         if (regs[rd] & 7) {
5442                                 *flags |= CPU_DTRACE_BADALIGN;
5443                                 *illval = regs[rd];
5444                                 break;
5445                         }
5446                         *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5447                         break;
5448                 }
5449         }
5450
5451         if (!(*flags & CPU_DTRACE_FAULT))
5452                 return (rval);
5453
5454         mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5455         mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5456
5457         return (0);
5458 }
5459
5460 static void
5461 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5462 {
5463         dtrace_probe_t *probe = ecb->dte_probe;
5464         dtrace_provider_t *prov = probe->dtpr_provider;
5465         char c[DTRACE_FULLNAMELEN + 80], *str;
5466         char *msg = "dtrace: breakpoint action at probe ";
5467         char *ecbmsg = " (ecb ";
5468         uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5469         uintptr_t val = (uintptr_t)ecb;
5470         int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5471
5472         if (dtrace_destructive_disallow)
5473                 return;
5474
5475         /*
5476          * It's impossible to be taking action on the NULL probe.
5477          */
5478         ASSERT(probe != NULL);
5479
5480         /*
5481          * This is a poor man's (destitute man's?) sprintf():  we want to
5482          * print the provider name, module name, function name and name of
5483          * the probe, along with the hex address of the ECB with the breakpoint
5484          * action -- all of which we must place in the character buffer by
5485          * hand.
5486          */
5487         while (*msg != '\0')
5488                 c[i++] = *msg++;
5489
5490         for (str = prov->dtpv_name; *str != '\0'; str++)
5491                 c[i++] = *str;
5492         c[i++] = ':';
5493
5494         for (str = probe->dtpr_mod; *str != '\0'; str++)
5495                 c[i++] = *str;
5496         c[i++] = ':';
5497
5498         for (str = probe->dtpr_func; *str != '\0'; str++)
5499                 c[i++] = *str;
5500         c[i++] = ':';
5501
5502         for (str = probe->dtpr_name; *str != '\0'; str++)
5503                 c[i++] = *str;
5504
5505         while (*ecbmsg != '\0')
5506                 c[i++] = *ecbmsg++;
5507
5508         while (shift >= 0) {
5509                 mask = (uintptr_t)0xf << shift;
5510
5511                 if (val >= ((uintptr_t)1 << shift))
5512                         c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5513                 shift -= 4;
5514         }
5515
5516         c[i++] = ')';
5517         c[i] = '\0';
5518
5519         debug_enter(c);
5520 }
5521
5522 static void
5523 dtrace_action_panic(dtrace_ecb_t *ecb)
5524 {
5525         dtrace_probe_t *probe = ecb->dte_probe;
5526
5527         /*
5528          * It's impossible to be taking action on the NULL probe.
5529          */
5530         ASSERT(probe != NULL);
5531
5532         if (dtrace_destructive_disallow)
5533                 return;
5534
5535         if (dtrace_panicked != NULL)
5536                 return;
5537
5538         if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
5539                 return;
5540
5541         /*
5542          * We won the right to panic.  (We want to be sure that only one
5543          * thread calls panic() from dtrace_probe(), and that panic() is
5544          * called exactly once.)
5545          */
5546         dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5547             probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5548             probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5549 }
5550
5551 static void
5552 dtrace_action_raise(uint64_t sig)
5553 {
5554         if (dtrace_destructive_disallow)
5555                 return;
5556
5557         if (sig >= NSIG) {
5558                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5559                 return;
5560         }
5561
5562         /*
5563          * raise() has a queue depth of 1 -- we ignore all subsequent
5564          * invocations of the raise() action.
5565          */
5566         if (curthread->t_dtrace_sig == 0)
5567                 curthread->t_dtrace_sig = (uint8_t)sig;
5568
5569         curthread->t_sig_check = 1;
5570         aston(curthread);
5571 }
5572
5573 static void
5574 dtrace_action_stop(void)
5575 {
5576         if (dtrace_destructive_disallow)
5577                 return;
5578
5579         if (!curthread->t_dtrace_stop) {
5580                 curthread->t_dtrace_stop = 1;
5581                 curthread->t_sig_check = 1;
5582                 aston(curthread);
5583         }
5584 }
5585
5586 static void
5587 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5588 {
5589         hrtime_t now;
5590         volatile uint16_t *flags;
5591         cpu_t *cpu = CPU;
5592
5593         if (dtrace_destructive_disallow)
5594                 return;
5595
5596         flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5597
5598         now = dtrace_gethrtime();
5599
5600         if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5601                 /*
5602                  * We need to advance the mark to the current time.
5603                  */
5604                 cpu->cpu_dtrace_chillmark = now;
5605                 cpu->cpu_dtrace_chilled = 0;
5606         }
5607
5608         /*
5609          * Now check to see if the requested chill time would take us over
5610          * the maximum amount of time allowed in the chill interval.  (Or
5611          * worse, if the calculation itself induces overflow.)
5612          */
5613         if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5614             cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5615                 *flags |= CPU_DTRACE_ILLOP;
5616                 return;
5617         }
5618
5619         while (dtrace_gethrtime() - now < val)
5620                 continue;
5621
5622         /*
5623          * Normally, we assure that the value of the variable "timestamp" does
5624          * not change within an ECB.  The presence of chill() represents an
5625          * exception to this rule, however.
5626          */
5627         mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5628         cpu->cpu_dtrace_chilled += val;
5629 }
5630
5631 static void
5632 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5633     uint64_t *buf, uint64_t arg)
5634 {
5635         int nframes = DTRACE_USTACK_NFRAMES(arg);
5636         int strsize = DTRACE_USTACK_STRSIZE(arg);
5637         uint64_t *pcs = &buf[1], *fps;
5638         char *str = (char *)&pcs[nframes];
5639         int size, offs = 0, i, j;
5640         uintptr_t old = mstate->dtms_scratch_ptr, saved;
5641         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5642         char *sym;
5643
5644         /*
5645          * Should be taking a faster path if string space has not been
5646          * allocated.
5647          */
5648         ASSERT(strsize != 0);
5649
5650         /*
5651          * We will first allocate some temporary space for the frame pointers.
5652          */
5653         fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5654         size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5655             (nframes * sizeof (uint64_t));
5656
5657         if (!DTRACE_INSCRATCH(mstate, size)) {
5658                 /*
5659                  * Not enough room for our frame pointers -- need to indicate
5660                  * that we ran out of scratch space.
5661                  */
5662                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5663                 return;
5664         }
5665
5666         mstate->dtms_scratch_ptr += size;
5667         saved = mstate->dtms_scratch_ptr;
5668
5669         /*
5670          * Now get a stack with both program counters and frame pointers.
5671          */
5672         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5673         dtrace_getufpstack(buf, fps, nframes + 1);
5674         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5675
5676         /*
5677          * If that faulted, we're cooked.
5678          */
5679         if (*flags & CPU_DTRACE_FAULT)
5680                 goto out;
5681
5682         /*
5683          * Now we want to walk up the stack, calling the USTACK helper.  For
5684          * each iteration, we restore the scratch pointer.
5685          */
5686         for (i = 0; i < nframes; i++) {
5687                 mstate->dtms_scratch_ptr = saved;
5688
5689                 if (offs >= strsize)
5690                         break;
5691
5692                 sym = (char *)(uintptr_t)dtrace_helper(
5693                     DTRACE_HELPER_ACTION_USTACK,
5694                     mstate, state, pcs[i], fps[i]);
5695
5696                 /*
5697                  * If we faulted while running the helper, we're going to
5698                  * clear the fault and null out the corresponding string.
5699                  */
5700                 if (*flags & CPU_DTRACE_FAULT) {
5701                         *flags &= ~CPU_DTRACE_FAULT;
5702                         str[offs++] = '\0';
5703                         continue;
5704                 }
5705
5706                 if (sym == NULL) {
5707                         str[offs++] = '\0';
5708                         continue;
5709                 }
5710
5711                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5712
5713                 /*
5714                  * Now copy in the string that the helper returned to us.
5715                  */
5716                 for (j = 0; offs + j < strsize; j++) {
5717                         if ((str[offs + j] = sym[j]) == '\0')
5718                                 break;
5719                 }
5720
5721                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5722
5723                 offs += j + 1;
5724         }
5725
5726         if (offs >= strsize) {
5727                 /*
5728                  * If we didn't have room for all of the strings, we don't
5729                  * abort processing -- this needn't be a fatal error -- but we
5730                  * still want to increment a counter (dts_stkstroverflows) to
5731                  * allow this condition to be warned about.  (If this is from
5732                  * a jstack() action, it is easily tuned via jstackstrsize.)
5733                  */
5734                 dtrace_error(&state->dts_stkstroverflows);
5735         }
5736
5737         while (offs < strsize)
5738                 str[offs++] = '\0';
5739
5740 out:
5741         mstate->dtms_scratch_ptr = old;
5742 }
5743
5744 /*
5745  * If you're looking for the epicenter of DTrace, you just found it.  This
5746  * is the function called by the provider to fire a probe -- from which all
5747  * subsequent probe-context DTrace activity emanates.
5748  */
5749 void
5750 dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
5751     uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
5752 {
5753         processorid_t cpuid;
5754         dtrace_icookie_t cookie;
5755         dtrace_probe_t *probe;
5756         dtrace_mstate_t mstate;
5757         dtrace_ecb_t *ecb;
5758         dtrace_action_t *act;
5759         intptr_t offs;
5760         size_t size;
5761         int vtime, onintr;
5762         volatile uint16_t *flags;
5763         hrtime_t now;
5764
5765         /*
5766          * Kick out immediately if this CPU is still being born (in which case
5767          * curthread will be set to -1) or the current thread can't allow
5768          * probes in its current context.
5769          */
5770         if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
5771                 return;
5772
5773         cookie = dtrace_interrupt_disable();
5774         probe = dtrace_probes[id - 1];
5775         cpuid = CPU->cpu_id;
5776         onintr = CPU_ON_INTR(CPU);
5777
5778         if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5779             probe->dtpr_predcache == curthread->t_predcache) {
5780                 /*
5781                  * We have hit in the predicate cache; we know that
5782                  * this predicate would evaluate to be false.
5783                  */
5784                 dtrace_interrupt_enable(cookie);
5785                 return;
5786         }
5787
5788         if (panic_quiesce) {
5789                 /*
5790                  * We don't trace anything if we're panicking.
5791                  */
5792                 dtrace_interrupt_enable(cookie);
5793                 return;
5794         }
5795
5796         now = dtrace_gethrtime();
5797         vtime = dtrace_vtime_references != 0;
5798
5799         if (vtime && curthread->t_dtrace_start)
5800                 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5801
5802         mstate.dtms_difo = NULL;
5803         mstate.dtms_probe = probe;
5804         mstate.dtms_strtok = NULL;
5805         mstate.dtms_arg[0] = arg0;
5806         mstate.dtms_arg[1] = arg1;
5807         mstate.dtms_arg[2] = arg2;
5808         mstate.dtms_arg[3] = arg3;
5809         mstate.dtms_arg[4] = arg4;
5810
5811         flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5812
5813         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5814                 dtrace_predicate_t *pred = ecb->dte_predicate;
5815                 dtrace_state_t *state = ecb->dte_state;
5816                 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5817                 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5818                 dtrace_vstate_t *vstate = &state->dts_vstate;
5819                 dtrace_provider_t *prov = probe->dtpr_provider;
5820                 int committed = 0;
5821                 caddr_t tomax;
5822
5823                 /*
5824                  * A little subtlety with the following (seemingly innocuous)
5825                  * declaration of the automatic 'val':  by looking at the
5826                  * code, you might think that it could be declared in the
5827                  * action processing loop, below.  (That is, it's only used in
5828                  * the action processing loop.)  However, it must be declared
5829                  * out of that scope because in the case of DIF expression
5830                  * arguments to aggregating actions, one iteration of the
5831                  * action loop will use the last iteration's value.
5832                  */
5833 #ifdef lint
5834                 uint64_t val = 0;
5835 #else
5836                 uint64_t val;
5837 #endif
5838
5839                 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5840                 mstate.dtms_access = DTRACE_ACCESS_ARGS | DTRACE_ACCESS_PROC;
5841                 *flags &= ~CPU_DTRACE_ERROR;
5842
5843                 if (prov == dtrace_provider) {
5844                         /*
5845                          * If dtrace itself is the provider of this probe,
5846                          * we're only going to continue processing the ECB if
5847                          * arg0 (the dtrace_state_t) is equal to the ECB's
5848                          * creating state.  (This prevents disjoint consumers
5849                          * from seeing one another's metaprobes.)
5850                          */
5851                         if (arg0 != (uint64_t)(uintptr_t)state)
5852                                 continue;
5853                 }
5854
5855                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
5856                         /*
5857                          * We're not currently active.  If our provider isn't
5858                          * the dtrace pseudo provider, we're not interested.
5859                          */
5860                         if (prov != dtrace_provider)
5861                                 continue;
5862
5863                         /*
5864                          * Now we must further check if we are in the BEGIN
5865                          * probe.  If we are, we will only continue processing
5866                          * if we're still in WARMUP -- if one BEGIN enabling
5867                          * has invoked the exit() action, we don't want to
5868                          * evaluate subsequent BEGIN enablings.
5869                          */
5870                         if (probe->dtpr_id == dtrace_probeid_begin &&
5871                             state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
5872                                 ASSERT(state->dts_activity ==
5873                                     DTRACE_ACTIVITY_DRAINING);
5874                                 continue;
5875                         }
5876                 }
5877
5878                 if (ecb->dte_cond && !dtrace_priv_probe(state, &mstate, ecb))
5879                         continue;
5880
5881                 if (now - state->dts_alive > dtrace_deadman_timeout) {
5882                         /*
5883                          * We seem to be dead.  Unless we (a) have kernel
5884                          * destructive permissions (b) have explicitly enabled
5885                          * destructive actions and (c) destructive actions have
5886                          * not been disabled, we're going to transition into
5887                          * the KILLED state, from which no further processing
5888                          * on this state will be performed.
5889                          */
5890                         if (!dtrace_priv_kernel_destructive(state) ||
5891                             !state->dts_cred.dcr_destructive ||
5892                             dtrace_destructive_disallow) {
5893                                 void *activity = &state->dts_activity;
5894                                 dtrace_activity_t current;
5895
5896                                 do {
5897                                         current = state->dts_activity;
5898                                 } while (dtrace_cas32(activity, current,
5899                                     DTRACE_ACTIVITY_KILLED) != current);
5900
5901                                 continue;
5902                         }
5903                 }
5904
5905                 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
5906                     ecb->dte_alignment, state, &mstate)) < 0)
5907                         continue;
5908
5909                 tomax = buf->dtb_tomax;
5910                 ASSERT(tomax != NULL);
5911
5912                 if (ecb->dte_size != 0)
5913                         DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
5914
5915                 mstate.dtms_epid = ecb->dte_epid;
5916                 mstate.dtms_present |= DTRACE_MSTATE_EPID;
5917
5918                 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
5919                         mstate.dtms_access |= DTRACE_ACCESS_KERNEL;
5920
5921                 if (pred != NULL) {
5922                         dtrace_difo_t *dp = pred->dtp_difo;
5923                         int rval;
5924
5925                         rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
5926
5927                         if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
5928                                 dtrace_cacheid_t cid = probe->dtpr_predcache;
5929
5930                                 if (cid != DTRACE_CACHEIDNONE && !onintr) {
5931                                         /*
5932                                          * Update the predicate cache...
5933                                          */
5934                                         ASSERT(cid == pred->dtp_cacheid);
5935                                         curthread->t_predcache = cid;
5936                                 }
5937
5938                                 continue;
5939                         }
5940                 }
5941
5942                 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
5943                     act != NULL; act = act->dta_next) {
5944                         size_t valoffs;
5945                         dtrace_difo_t *dp;
5946                         dtrace_recdesc_t *rec = &act->dta_rec;
5947
5948                         size = rec->dtrd_size;
5949                         valoffs = offs + rec->dtrd_offset;
5950
5951                         if (DTRACEACT_ISAGG(act->dta_kind)) {
5952                                 uint64_t v = 0xbad;
5953                                 dtrace_aggregation_t *agg;
5954
5955                                 agg = (dtrace_aggregation_t *)act;
5956
5957                                 if ((dp = act->dta_difo) != NULL)
5958                                         v = dtrace_dif_emulate(dp,
5959                                             &mstate, vstate, state);
5960
5961                                 if (*flags & CPU_DTRACE_ERROR)
5962                                         continue;
5963
5964                                 /*
5965                                  * Note that we always pass the expression
5966                                  * value from the previous iteration of the
5967                                  * action loop.  This value will only be used
5968                                  * if there is an expression argument to the
5969                                  * aggregating action, denoted by the
5970                                  * dtag_hasarg field.
5971                                  */
5972                                 dtrace_aggregate(agg, buf,
5973                                     offs, aggbuf, v, val);
5974                                 continue;
5975                         }
5976
5977                         switch (act->dta_kind) {
5978                         case DTRACEACT_STOP:
5979                                 if (dtrace_priv_proc_destructive(state,
5980                                     &mstate))
5981                                         dtrace_action_stop();
5982                                 continue;
5983
5984                         case DTRACEACT_BREAKPOINT:
5985                                 if (dtrace_priv_kernel_destructive(state))
5986                                         dtrace_action_breakpoint(ecb);
5987                                 continue;
5988
5989                         case DTRACEACT_PANIC:
5990                                 if (dtrace_priv_kernel_destructive(state))
5991                                         dtrace_action_panic(ecb);
5992                                 continue;
5993
5994                         case DTRACEACT_STACK:
5995                                 if (!dtrace_priv_kernel(state))
5996                                         continue;
5997
5998                                 dtrace_getpcstack((pc_t *)(tomax + valoffs),
5999                                     size / sizeof (pc_t), probe->dtpr_aframes,
6000                                     DTRACE_ANCHORED(probe) ? NULL :
6001                                     (uint32_t *)arg0);
6002
6003                                 continue;
6004
6005                         case DTRACEACT_JSTACK:
6006                         case DTRACEACT_USTACK:
6007                                 if (!dtrace_priv_proc(state, &mstate))
6008                                         continue;
6009
6010                                 /*
6011                                  * See comment in DIF_VAR_PID.
6012                                  */
6013                                 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6014                                     CPU_ON_INTR(CPU)) {
6015                                         int depth = DTRACE_USTACK_NFRAMES(
6016                                             rec->dtrd_arg) + 1;
6017
6018                                         dtrace_bzero((void *)(tomax + valoffs),
6019                                             DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6020                                             + depth * sizeof (uint64_t));
6021
6022                                         continue;
6023                                 }
6024
6025                                 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6026                                     curproc->p_dtrace_helpers != NULL) {
6027                                         /*
6028                                          * This is the slow path -- we have
6029                                          * allocated string space, and we're
6030                                          * getting the stack of a process that
6031                                          * has helpers.  Call into a separate
6032                                          * routine to perform this processing.
6033                                          */
6034                                         dtrace_action_ustack(&mstate, state,
6035                                             (uint64_t *)(tomax + valoffs),
6036                                             rec->dtrd_arg);
6037                                         continue;
6038                                 }
6039
6040                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6041                                 dtrace_getupcstack((uint64_t *)
6042                                     (tomax + valoffs),
6043                                     DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6044                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6045                                 continue;
6046
6047                         default:
6048                                 break;
6049                         }
6050
6051                         dp = act->dta_difo;
6052                         ASSERT(dp != NULL);
6053
6054                         val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6055
6056                         if (*flags & CPU_DTRACE_ERROR)
6057                                 continue;
6058
6059                         switch (act->dta_kind) {
6060                         case DTRACEACT_SPECULATE:
6061                                 ASSERT(buf == &state->dts_buffer[cpuid]);
6062                                 buf = dtrace_speculation_buffer(state,
6063                                     cpuid, val);
6064
6065                                 if (buf == NULL) {
6066                                         *flags |= CPU_DTRACE_DROP;
6067                                         continue;
6068                                 }
6069
6070                                 offs = dtrace_buffer_reserve(buf,
6071                                     ecb->dte_needed, ecb->dte_alignment,
6072                                     state, NULL);
6073
6074                                 if (offs < 0) {
6075                                         *flags |= CPU_DTRACE_DROP;
6076                                         continue;
6077                                 }
6078
6079                                 tomax = buf->dtb_tomax;
6080                                 ASSERT(tomax != NULL);
6081
6082                                 if (ecb->dte_size != 0)
6083                                         DTRACE_STORE(uint32_t, tomax, offs,
6084                                             ecb->dte_epid);
6085                                 continue;
6086
6087                         case DTRACEACT_CHILL:
6088                                 if (dtrace_priv_kernel_destructive(state))
6089                                         dtrace_action_chill(&mstate, val);
6090                                 continue;
6091
6092                         case DTRACEACT_RAISE:
6093                                 if (dtrace_priv_proc_destructive(state,
6094                                     &mstate))
6095                                         dtrace_action_raise(val);
6096                                 continue;
6097
6098                         case DTRACEACT_COMMIT:
6099                                 ASSERT(!committed);
6100
6101                                 /*
6102                                  * We need to commit our buffer state.
6103                                  */
6104                                 if (ecb->dte_size)
6105                                         buf->dtb_offset = offs + ecb->dte_size;
6106                                 buf = &state->dts_buffer[cpuid];
6107                                 dtrace_speculation_commit(state, cpuid, val);
6108                                 committed = 1;
6109                                 continue;
6110
6111                         case DTRACEACT_DISCARD:
6112                                 dtrace_speculation_discard(state, cpuid, val);
6113                                 continue;
6114
6115                         case DTRACEACT_DIFEXPR:
6116                         case DTRACEACT_LIBACT:
6117                         case DTRACEACT_PRINTF:
6118                         case DTRACEACT_PRINTA:
6119                         case DTRACEACT_SYSTEM:
6120                         case DTRACEACT_FREOPEN:
6121                                 break;
6122
6123                         case DTRACEACT_SYM:
6124                         case DTRACEACT_MOD:
6125                                 if (!dtrace_priv_kernel(state))
6126                                         continue;
6127                                 break;
6128
6129                         case DTRACEACT_USYM:
6130                         case DTRACEACT_UMOD:
6131                         case DTRACEACT_UADDR: {
6132                                 struct pid *pid = curthread->t_procp->p_pidp;
6133
6134                                 if (!dtrace_priv_proc(state, &mstate))
6135                                         continue;
6136
6137                                 DTRACE_STORE(uint64_t, tomax,
6138                                     valoffs, (uint64_t)pid->pid_id);
6139                                 DTRACE_STORE(uint64_t, tomax,
6140                                     valoffs + sizeof (uint64_t), val);
6141
6142                                 continue;
6143                         }
6144
6145                         case DTRACEACT_EXIT: {
6146                                 /*
6147                                  * For the exit action, we are going to attempt
6148                                  * to atomically set our activity to be
6149                                  * draining.  If this fails (either because
6150                                  * another CPU has beat us to the exit action,
6151                                  * or because our current activity is something
6152                                  * other than ACTIVE or WARMUP), we will
6153                                  * continue.  This assures that the exit action
6154                                  * can be successfully recorded at most once
6155                                  * when we're in the ACTIVE state.  If we're
6156                                  * encountering the exit() action while in
6157                                  * COOLDOWN, however, we want to honor the new
6158                                  * status code.  (We know that we're the only
6159                                  * thread in COOLDOWN, so there is no race.)
6160                                  */
6161                                 void *activity = &state->dts_activity;
6162                                 dtrace_activity_t current = state->dts_activity;
6163
6164                                 if (current == DTRACE_ACTIVITY_COOLDOWN)
6165                                         break;
6166
6167                                 if (current != DTRACE_ACTIVITY_WARMUP)
6168                                         current = DTRACE_ACTIVITY_ACTIVE;
6169
6170                                 if (dtrace_cas32(activity, current,
6171                                     DTRACE_ACTIVITY_DRAINING) != current) {
6172                                         *flags |= CPU_DTRACE_DROP;
6173                                         continue;
6174                                 }
6175
6176                                 break;
6177                         }
6178
6179                         default:
6180                                 ASSERT(0);
6181                         }
6182
6183                         if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
6184                                 uintptr_t end = valoffs + size;
6185
6186                                 if (!dtrace_vcanload((void *)(uintptr_t)val,
6187                                     &dp->dtdo_rtype, &mstate, vstate))
6188                                         continue;
6189
6190                                 /*
6191                                  * If this is a string, we're going to only
6192                                  * load until we find the zero byte -- after
6193                                  * which we'll store zero bytes.
6194                                  */
6195                                 if (dp->dtdo_rtype.dtdt_kind ==
6196                                     DIF_TYPE_STRING) {
6197                                         char c = '\0' + 1;
6198                                         int intuple = act->dta_intuple;
6199                                         size_t s;
6200
6201                                         for (s = 0; s < size; s++) {
6202                                                 if (c != '\0')
6203                                                         c = dtrace_load8(val++);
6204
6205                                                 DTRACE_STORE(uint8_t, tomax,
6206                                                     valoffs++, c);
6207
6208                                                 if (c == '\0' && intuple)
6209                                                         break;
6210                                         }
6211
6212                                         continue;
6213                                 }
6214
6215                                 while (valoffs < end) {
6216                                         DTRACE_STORE(uint8_t, tomax, valoffs++,
6217                                             dtrace_load8(val++));
6218                                 }
6219
6220                                 continue;
6221                         }
6222
6223                         switch (size) {
6224                         case 0:
6225                                 break;
6226
6227                         case sizeof (uint8_t):
6228                                 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6229                                 break;
6230                         case sizeof (uint16_t):
6231                                 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6232                                 break;
6233                         case sizeof (uint32_t):
6234                                 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6235                                 break;
6236                         case sizeof (uint64_t):
6237                                 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6238                                 break;
6239                         default:
6240                                 /*
6241                                  * Any other size should have been returned by
6242                                  * reference, not by value.
6243                                  */
6244                                 ASSERT(0);
6245                                 break;
6246                         }
6247                 }
6248
6249                 if (*flags & CPU_DTRACE_DROP)
6250                         continue;
6251
6252                 if (*flags & CPU_DTRACE_FAULT) {
6253                         int ndx;
6254                         dtrace_action_t *err;
6255
6256                         buf->dtb_errors++;
6257
6258                         if (probe->dtpr_id == dtrace_probeid_error) {
6259                                 /*
6260                                  * There's nothing we can do -- we had an
6261                                  * error on the error probe.  We bump an
6262                                  * error counter to at least indicate that
6263                                  * this condition happened.
6264                                  */
6265                                 dtrace_error(&state->dts_dblerrors);
6266                                 continue;
6267                         }
6268
6269                         if (vtime) {
6270                                 /*
6271                                  * Before recursing on dtrace_probe(), we
6272                                  * need to explicitly clear out our start
6273                                  * time to prevent it from being accumulated
6274                                  * into t_dtrace_vtime.
6275                                  */
6276                                 curthread->t_dtrace_start = 0;
6277                         }
6278
6279                         /*
6280                          * Iterate over the actions to figure out which action
6281                          * we were processing when we experienced the error.
6282                          * Note that act points _past_ the faulting action; if
6283                          * act is ecb->dte_action, the fault was in the
6284                          * predicate, if it's ecb->dte_action->dta_next it's
6285                          * in action #1, and so on.
6286                          */
6287                         for (err = ecb->dte_action, ndx = 0;
6288                             err != act; err = err->dta_next, ndx++)
6289                                 continue;
6290
6291                         dtrace_probe_error(state, ecb->dte_epid, ndx,
6292                             (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6293                             mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6294                             cpu_core[cpuid].cpuc_dtrace_illval);
6295
6296                         continue;
6297                 }
6298
6299                 if (!committed)
6300                         buf->dtb_offset = offs + ecb->dte_size;
6301         }
6302
6303         if (vtime)
6304                 curthread->t_dtrace_start = dtrace_gethrtime();
6305
6306         dtrace_interrupt_enable(cookie);
6307 }
6308
6309 /*
6310  * DTrace Probe Hashing Functions
6311  *
6312  * The functions in this section (and indeed, the functions in remaining
6313  * sections) are not _called_ from probe context.  (Any exceptions to this are
6314  * marked with a "Note:".)  Rather, they are called from elsewhere in the
6315  * DTrace framework to look-up probes in, add probes to and remove probes from
6316  * the DTrace probe hashes.  (Each probe is hashed by each element of the
6317  * probe tuple -- allowing for fast lookups, regardless of what was
6318  * specified.)
6319  */
6320 static uint_t
6321 dtrace_hash_str(char *p)
6322 {
6323         unsigned int g;
6324         uint_t hval = 0;
6325
6326         while (*p) {
6327                 hval = (hval << 4) + *p++;
6328                 if ((g = (hval & 0xf0000000)) != 0)
6329                         hval ^= g >> 24;
6330                 hval &= ~g;
6331         }
6332         return (hval);
6333 }
6334
6335 static dtrace_hash_t *
6336 dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6337 {
6338         dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6339
6340         hash->dth_stroffs = stroffs;
6341         hash->dth_nextoffs = nextoffs;
6342         hash->dth_prevoffs = prevoffs;
6343
6344         hash->dth_size = 1;
6345         hash->dth_mask = hash->dth_size - 1;
6346
6347         hash->dth_tab = kmem_zalloc(hash->dth_size *
6348             sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6349
6350         return (hash);
6351 }
6352
6353 static void
6354 dtrace_hash_destroy(dtrace_hash_t *hash)
6355 {
6356 #ifdef DEBUG
6357         int i;
6358
6359         for (i = 0; i < hash->dth_size; i++)
6360                 ASSERT(hash->dth_tab[i] == NULL);
6361 #endif
6362
6363         kmem_free(hash->dth_tab,
6364             hash->dth_size * sizeof (dtrace_hashbucket_t *));
6365         kmem_free(hash, sizeof (dtrace_hash_t));
6366 }
6367
6368 static void
6369 dtrace_hash_resize(dtrace_hash_t *hash)
6370 {
6371         int size = hash->dth_size, i, ndx;
6372         int new_size = hash->dth_size << 1;
6373         int new_mask = new_size - 1;
6374         dtrace_hashbucket_t **new_tab, *bucket, *next;
6375
6376         ASSERT((new_size & new_mask) == 0);
6377
6378         new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6379
6380         for (i = 0; i < size; i++) {
6381                 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6382                         dtrace_probe_t *probe = bucket->dthb_chain;
6383
6384                         ASSERT(probe != NULL);
6385                         ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6386
6387                         next = bucket->dthb_next;
6388                         bucket->dthb_next = new_tab[ndx];
6389                         new_tab[ndx] = bucket;
6390                 }
6391         }
6392
6393         kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6394         hash->dth_tab = new_tab;
6395         hash->dth_size = new_size;
6396         hash->dth_mask = new_mask;
6397 }
6398
6399 static void
6400 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6401 {
6402         int hashval = DTRACE_HASHSTR(hash, new);
6403         int ndx = hashval & hash->dth_mask;
6404         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6405         dtrace_probe_t **nextp, **prevp;
6406
6407         for (; bucket != NULL; bucket = bucket->dthb_next) {
6408                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6409                         goto add;
6410         }
6411
6412         if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6413                 dtrace_hash_resize(hash);
6414                 dtrace_hash_add(hash, new);
6415                 return;
6416         }
6417
6418         bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6419         bucket->dthb_next = hash->dth_tab[ndx];
6420         hash->dth_tab[ndx] = bucket;
6421         hash->dth_nbuckets++;
6422
6423 add:
6424         nextp = DTRACE_HASHNEXT(hash, new);
6425         ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6426         *nextp = bucket->dthb_chain;
6427
6428         if (bucket->dthb_chain != NULL) {
6429                 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6430                 ASSERT(*prevp == NULL);
6431                 *prevp = new;
6432         }
6433
6434         bucket->dthb_chain = new;
6435         bucket->dthb_len++;
6436 }
6437
6438 static dtrace_probe_t *
6439 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6440 {
6441         int hashval = DTRACE_HASHSTR(hash, template);
6442         int ndx = hashval & hash->dth_mask;
6443         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6444
6445         for (; bucket != NULL; bucket = bucket->dthb_next) {
6446                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6447                         return (bucket->dthb_chain);
6448         }
6449
6450         return (NULL);
6451 }
6452
6453 static int
6454 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6455 {
6456         int hashval = DTRACE_HASHSTR(hash, template);
6457         int ndx = hashval & hash->dth_mask;
6458         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6459
6460         for (; bucket != NULL; bucket = bucket->dthb_next) {
6461                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6462                         return (bucket->dthb_len);
6463         }
6464
6465         return (NULL);
6466 }
6467
6468 static void
6469 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6470 {
6471         int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6472         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6473
6474         dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6475         dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6476
6477         /*
6478          * Find the bucket that we're removing this probe from.
6479          */
6480         for (; bucket != NULL; bucket = bucket->dthb_next) {
6481                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6482                         break;
6483         }
6484
6485         ASSERT(bucket != NULL);
6486
6487         if (*prevp == NULL) {
6488                 if (*nextp == NULL) {
6489                         /*
6490                          * The removed probe was the only probe on this
6491                          * bucket; we need to remove the bucket.
6492                          */
6493                         dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6494
6495                         ASSERT(bucket->dthb_chain == probe);
6496                         ASSERT(b != NULL);
6497
6498                         if (b == bucket) {
6499                                 hash->dth_tab[ndx] = bucket->dthb_next;
6500                         } else {
6501                                 while (b->dthb_next != bucket)
6502                                         b = b->dthb_next;
6503                                 b->dthb_next = bucket->dthb_next;
6504                         }
6505
6506                         ASSERT(hash->dth_nbuckets > 0);
6507                         hash->dth_nbuckets--;
6508                         kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6509                         return;
6510                 }
6511
6512                 bucket->dthb_chain = *nextp;
6513         } else {
6514                 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6515         }
6516
6517         if (*nextp != NULL)
6518                 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6519 }
6520
6521 /*
6522  * DTrace Utility Functions
6523  *
6524  * These are random utility functions that are _not_ called from probe context.
6525  */
6526 static int
6527 dtrace_badattr(const dtrace_attribute_t *a)
6528 {
6529         return (a->dtat_name > DTRACE_STABILITY_MAX ||
6530             a->dtat_data > DTRACE_STABILITY_MAX ||
6531             a->dtat_class > DTRACE_CLASS_MAX);
6532 }
6533
6534 /*
6535  * Return a duplicate copy of a string.  If the specified string is NULL,
6536  * this function returns a zero-length string.
6537  */
6538 static char *
6539 dtrace_strdup(const char *str)
6540 {
6541         char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
6542
6543         if (str != NULL)
6544                 (void) strcpy(new, str);
6545
6546         return (new);
6547 }
6548
6549 #define DTRACE_ISALPHA(c)       \
6550         (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
6551
6552 static int
6553 dtrace_badname(const char *s)
6554 {
6555         char c;
6556
6557         if (s == NULL || (c = *s++) == '\0')
6558                 return (0);
6559
6560         if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
6561                 return (1);
6562
6563         while ((c = *s++) != '\0') {
6564                 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
6565                     c != '-' && c != '_' && c != '.' && c != '`')
6566                         return (1);
6567         }
6568
6569         return (0);
6570 }
6571
6572 static void
6573 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6574 {
6575         uint32_t priv;
6576
6577         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6578                 /*
6579                  * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6580                  */
6581                 priv = DTRACE_PRIV_ALL;
6582         } else {
6583                 *uidp = crgetuid(cr);
6584                 *zoneidp = crgetzoneid(cr);
6585
6586                 priv = 0;
6587                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6588                         priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6589                 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6590                         priv |= DTRACE_PRIV_USER;
6591                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6592                         priv |= DTRACE_PRIV_PROC;
6593                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6594                         priv |= DTRACE_PRIV_OWNER;
6595                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6596                         priv |= DTRACE_PRIV_ZONEOWNER;
6597         }
6598
6599         *privp = priv;
6600 }
6601
#ifdef DTRACE_ERRDEBUG
/*
 * Record an occurrence of the error message 'str' in dtrace_errhash, and
 * note it (along with the current thread) as the most recent error.  The
 * table is keyed on the message pointer itself and probed linearly from the
 * slot selected by hashing the string; if every slot is taken by some other
 * message, we panic.
 */
static void
dtrace_errdebug(const char *str)
{
        int slot = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
        int tries;

        mutex_enter(&dtrace_errlock);
        dtrace_errlast = str;
        dtrace_errthread = curthread;

        for (tries = 0; tries < DTRACE_ERRHASHSZ; tries++) {
                if (dtrace_errhash[slot].dter_msg == str) {
                        /* Seen before: just bump the count. */
                        dtrace_errhash[slot].dter_count++;
                        break;
                }

                if (dtrace_errhash[slot].dter_msg == NULL) {
                        /* Free slot: claim it for this message. */
                        dtrace_errhash[slot].dter_msg = str;
                        dtrace_errhash[slot].dter_count = 1;
                        break;
                }

                /* Occupied by a different message; try the next slot. */
                slot = (slot + 1) % DTRACE_ERRHASHSZ;
        }

        /* Falling off the end of the loop means the table is full. */
        if (tries == DTRACE_ERRHASHSZ)
                panic("dtrace: undersized error hash");

        mutex_exit(&dtrace_errlock);
}
#endif
6634
6635 /*
6636  * DTrace Matching Functions
6637  *
6638  * These functions are used to match groups of probes, given some elements of
6639  * a probe tuple, or some globbed expressions for elements of a probe tuple.
6640  */
6641 static int
6642 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6643     zoneid_t zoneid)
6644 {
6645         if (priv != DTRACE_PRIV_ALL) {
6646                 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6647                 uint32_t match = priv & ppriv;
6648
6649                 /*
6650                  * No PRIV_DTRACE_* privileges...
6651                  */
6652                 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6653                     DTRACE_PRIV_KERNEL)) == 0)
6654                         return (0);
6655
6656                 /*
6657                  * No matching bits, but there were bits to match...
6658                  */
6659                 if (match == 0 && ppriv != 0)
6660                         return (0);
6661
6662                 /*
6663                  * Need to have permissions to the process, but don't...
6664                  */
6665                 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6666                     uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6667                         return (0);
6668                 }
6669
6670                 /*
6671                  * Need to be in the same zone unless we possess the
6672                  * privilege to examine all zones.
6673                  */
6674                 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6675                     zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6676                         return (0);
6677                 }
6678         }
6679
6680         return (1);
6681 }
6682
6683 /*
6684  * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6685  * consists of input pattern strings and an ops-vector to evaluate them.
6686  * This function returns >0 for match, 0 for no match, and <0 for error.
6687  */
6688 static int
6689 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6690     uint32_t priv, uid_t uid, zoneid_t zoneid)
6691 {
6692         dtrace_provider_t *pvp = prp->dtpr_provider;
6693         int rv;
6694
6695         if (pvp->dtpv_defunct)
6696                 return (0);
6697
6698         if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6699                 return (rv);
6700
6701         if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6702                 return (rv);
6703
6704         if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6705                 return (rv);
6706
6707         if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6708                 return (rv);
6709
6710         if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6711                 return (0);
6712
6713         return (rv);
6714 }
6715
6716 /*
6717  * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
6718  * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
6719  * libc's version, the kernel version only applies to 8-bit ASCII strings.
6720  * In addition, all of the recursion cases except for '*' matching have been
6721  * unwound.  For '*', we still implement recursive evaluation, but a depth
6722  * counter is maintained and matching is aborted if we recurse too deep.
6723  * The function returns 0 if no match, >0 if match, and <0 if recursion error.
6724  */
6725 static int
6726 dtrace_match_glob(const char *s, const char *p, int depth)
6727 {
6728         const char *olds;
6729         char s1, c;
6730         int gs;
6731
6732         if (depth > DTRACE_PROBEKEY_MAXDEPTH)
6733                 return (-1);
6734
6735         if (s == NULL)
6736                 s = ""; /* treat NULL as empty string */
6737
6738 top:
6739         olds = s;
6740         s1 = *s++;
6741
6742         if (p == NULL)
6743                 return (0);
6744
6745         if ((c = *p++) == '\0')
6746                 return (s1 == '\0');
6747
6748         switch (c) {
6749         case '[': {
6750                 int ok = 0, notflag = 0;
6751                 char lc = '\0';
6752
6753                 if (s1 == '\0')
6754                         return (0);
6755
6756                 if (*p == '!') {
6757                         notflag = 1;
6758                         p++;
6759                 }
6760
6761                 if ((c = *p++) == '\0')
6762                         return (0);
6763
6764                 do {
6765                         if (c == '-' && lc != '\0' && *p != ']') {
6766                                 if ((c = *p++) == '\0')
6767                                         return (0);
6768                                 if (c == '\\' && (c = *p++) == '\0')
6769                                         return (0);
6770
6771                                 if (notflag) {
6772                                         if (s1 < lc || s1 > c)
6773                                                 ok++;
6774                                         else
6775                                                 return (0);
6776                                 } else if (lc <= s1 && s1 <= c)
6777                                         ok++;
6778
6779                         } else if (c == '\\' && (c = *p++) == '\0')
6780                                 return (0);
6781
6782                         lc = c; /* save left-hand 'c' for next iteration */
6783
6784                         if (notflag) {
6785                                 if (s1 != c)
6786                                         ok++;
6787                                 else
6788                                         return (0);
6789                         } else if (s1 == c)
6790                                 ok++;
6791
6792                         if ((c = *p++) == '\0')
6793                                 return (0);
6794
6795                 } while (c != ']');
6796
6797                 if (ok)
6798                         goto top;
6799
6800                 return (0);
6801         }
6802
6803         case '\\':
6804                 if ((c = *p++) == '\0')
6805                         return (0);
6806                 /*FALLTHRU*/
6807
6808         default:
6809                 if (c != s1)
6810                         return (0);
6811                 /*FALLTHRU*/
6812
6813         case '?':
6814                 if (s1 != '\0')
6815                         goto top;
6816                 return (0);
6817
6818         case '*':
6819                 while (*p == '*')
6820                         p++; /* consecutive *'s are identical to a single one */
6821
6822                 if (*p == '\0')
6823                         return (1);
6824
6825                 for (s = olds; *s != '\0'; s++) {
6826                         if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
6827                                 return (gs);
6828                 }
6829
6830                 return (0);
6831         }
6832 }
6833
/*
 * Exact-match comparison: returns 1 if 's' is non-NULL and identical to the
 * pattern 'p', and 0 otherwise.  'depth' is unused.
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
        if (s == NULL)
                return (0);

        return (strcmp(s, p) == 0);
}
6840
/*
 * Match function for the empty pattern: every input matches, so all three
 * arguments are ignored and the result is always 1.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
        /* the empty pattern places no constraint on the input */
        return (1);
}
6847
/*
 * Match any input that is a non-empty string: returns 1 if 's' is non-NULL
 * and has at least one character, and 0 otherwise.  The pattern 'p' and
 * 'depth' are not consulted.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
        if (s == NULL)
                return (0);

        return (*s != '\0');
}
6854
6855 static int
6856 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
6857     zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
6858 {
6859         dtrace_probe_t template, *probe;
6860         dtrace_hash_t *hash = NULL;
6861         int len, rc, best = INT_MAX, nmatched = 0;
6862         dtrace_id_t i;
6863
6864         ASSERT(MUTEX_HELD(&dtrace_lock));
6865
6866         /*
6867          * If the probe ID is specified in the key, just lookup by ID and
6868          * invoke the match callback once if a matching probe is found.
6869          */
6870         if (pkp->dtpk_id != DTRACE_IDNONE) {
6871                 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
6872                     dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
6873                         if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
6874                                 return (DTRACE_MATCH_FAIL);
6875                         nmatched++;
6876                 }
6877                 return (nmatched);
6878         }
6879
6880         template.dtpr_mod = (char *)pkp->dtpk_mod;
6881         template.dtpr_func = (char *)pkp->dtpk_func;
6882         template.dtpr_name = (char *)pkp->dtpk_name;
6883
6884         /*
6885          * We want to find the most distinct of the module name, function
6886          * name, and name.  So for each one that is not a glob pattern or
6887          * empty string, we perform a lookup in the corresponding hash and
6888          * use the hash table with the fewest collisions to do our search.
6889          */
6890         if (pkp->dtpk_mmatch == &dtrace_match_string &&
6891             (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
6892                 best = len;
6893                 hash = dtrace_bymod;
6894         }
6895
6896         if (pkp->dtpk_fmatch == &dtrace_match_string &&
6897             (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
6898                 best = len;
6899                 hash = dtrace_byfunc;
6900         }
6901
6902         if (pkp->dtpk_nmatch == &dtrace_match_string &&
6903             (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
6904                 best = len;
6905                 hash = dtrace_byname;
6906         }
6907
6908         /*
6909          * If we did not select a hash table, iterate over every probe and
6910          * invoke our callback for each one that matches our input probe key.
6911          */
6912         if (hash == NULL) {
6913                 for (i = 0; i < dtrace_nprobes; i++) {
6914                         if ((probe = dtrace_probes[i]) == NULL ||
6915                             dtrace_match_probe(probe, pkp, priv, uid,
6916                             zoneid) <= 0)
6917                                 continue;
6918
6919                         nmatched++;
6920
6921                         if ((rc = (*matched)(probe, arg)) !=
6922                             DTRACE_MATCH_NEXT) {
6923                                 if (rc == DTRACE_MATCH_FAIL)
6924                                         return (DTRACE_MATCH_FAIL);
6925                                 break;
6926                         }
6927                 }
6928
6929                 return (nmatched);
6930         }
6931
6932         /*
6933          * If we selected a hash table, iterate over each probe of the same key
6934          * name and invoke the callback for every probe that matches the other
6935          * attributes of our input probe key.
6936          */
6937         for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
6938             probe = *(DTRACE_HASHNEXT(hash, probe))) {
6939
6940                 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
6941                         continue;
6942
6943                 nmatched++;
6944
6945                 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
6946                         if (rc == DTRACE_MATCH_FAIL)
6947                                 return (DTRACE_MATCH_FAIL);
6948                         break;
6949                 }
6950         }
6951
6952         return (nmatched);
6953 }
6954
6955 /*
6956  * Return the function pointer dtrace_probecmp() should use to compare the
6957  * specified pattern with a string.  For NULL or empty patterns, we select
6958  * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
6959  * For non-empty non-glob strings, we use dtrace_match_string().
6960  */
6961 static dtrace_probekey_f *
6962 dtrace_probekey_func(const char *p)
6963 {
6964         char c;
6965
6966         if (p == NULL || *p == '\0')
6967                 return (&dtrace_match_nul);
6968
6969         while ((c = *p++) != '\0') {
6970                 if (c == '[' || c == '?' || c == '*' || c == '\\')
6971                         return (&dtrace_match_glob);
6972         }
6973
6974         return (&dtrace_match_string);
6975 }
6976
6977 /*
6978  * Build a probe comparison key for use with dtrace_match_probe() from the
6979  * given probe description.  By convention, a null key only matches anchored
6980  * probes: if each field is the empty string, reset dtpk_fmatch to
6981  * dtrace_match_nonzero().
6982  */
6983 static void
6984 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
6985 {
6986         pkp->dtpk_prov = pdp->dtpd_provider;
6987         pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
6988
6989         pkp->dtpk_mod = pdp->dtpd_mod;
6990         pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
6991
6992         pkp->dtpk_func = pdp->dtpd_func;
6993         pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
6994
6995         pkp->dtpk_name = pdp->dtpd_name;
6996         pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
6997
6998         pkp->dtpk_id = pdp->dtpd_id;
6999
7000         if (pkp->dtpk_id == DTRACE_IDNONE &&
7001             pkp->dtpk_pmatch == &dtrace_match_nul &&
7002             pkp->dtpk_mmatch == &dtrace_match_nul &&
7003             pkp->dtpk_fmatch == &dtrace_match_nul &&
7004             pkp->dtpk_nmatch == &dtrace_match_nul)
7005                 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7006 }
7007
7008 /*
7009  * DTrace Provider-to-Framework API Functions
7010  *
7011  * These functions implement much of the Provider-to-Framework API, as
7012  * described in <sys/dtrace.h>.  The parts of the API not in this section are
7013  * the functions in the API for probe management (found below), and
7014  * dtrace_probe() itself (found above).
7015  */
7016
7017 /*
7018  * Register the calling provider with the DTrace framework.  This should
7019  * generally be called by DTrace providers in their attach(9E) entry point.
7020  */
7021 int
7022 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7023     cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7024 {
7025         dtrace_provider_t *provider;
7026
7027         if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7028                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7029                     "arguments", name ? name : "<NULL>");
7030                 return (EINVAL);
7031         }
7032
7033         if (name[0] == '\0' || dtrace_badname(name)) {
7034                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7035                     "provider name", name);
7036                 return (EINVAL);
7037         }
7038
7039         if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7040             pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7041             pops->dtps_destroy == NULL ||
7042             ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7043                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7044                     "provider ops", name);
7045                 return (EINVAL);
7046         }
7047
7048         if (dtrace_badattr(&pap->dtpa_provider) ||
7049             dtrace_badattr(&pap->dtpa_mod) ||
7050             dtrace_badattr(&pap->dtpa_func) ||
7051             dtrace_badattr(&pap->dtpa_name) ||
7052             dtrace_badattr(&pap->dtpa_args)) {
7053                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7054                     "provider attributes", name);
7055                 return (EINVAL);
7056         }
7057
7058         if (priv & ~DTRACE_PRIV_ALL) {
7059                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7060                     "privilege attributes", name);
7061                 return (EINVAL);
7062         }
7063
7064         if ((priv & DTRACE_PRIV_KERNEL) &&
7065             (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7066             pops->dtps_mode == NULL) {
7067                 cmn_err(CE_WARN, "failed to register provider '%s': need "
7068                     "dtps_mode() op for given privilege attributes", name);
7069                 return (EINVAL);
7070         }
7071
7072         provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7073         provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7074         (void) strcpy(provider->dtpv_name, name);
7075
7076         provider->dtpv_attr = *pap;
7077         provider->dtpv_priv.dtpp_flags = priv;
7078         if (cr != NULL) {
7079                 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7080                 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7081         }
7082         provider->dtpv_pops = *pops;
7083
7084         if (pops->dtps_provide == NULL) {
7085                 ASSERT(pops->dtps_provide_module != NULL);
7086                 provider->dtpv_pops.dtps_provide =
7087                     (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7088         }
7089
7090         if (pops->dtps_provide_module == NULL) {
7091                 ASSERT(pops->dtps_provide != NULL);
7092                 provider->dtpv_pops.dtps_provide_module =
7093                     (void (*)(void *, struct modctl *))dtrace_nullop;
7094         }
7095
7096         if (pops->dtps_suspend == NULL) {
7097                 ASSERT(pops->dtps_resume == NULL);
7098                 provider->dtpv_pops.dtps_suspend =
7099                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7100                 provider->dtpv_pops.dtps_resume =
7101                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7102         }
7103
7104         provider->dtpv_arg = arg;
7105         *idp = (dtrace_provider_id_t)provider;
7106
7107         if (pops == &dtrace_provider_ops) {
7108                 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7109                 ASSERT(MUTEX_HELD(&dtrace_lock));
7110                 ASSERT(dtrace_anon.dta_enabling == NULL);
7111
7112                 /*
7113                  * We make sure that the DTrace provider is at the head of
7114                  * the provider chain.
7115                  */
7116                 provider->dtpv_next = dtrace_provider;
7117                 dtrace_provider = provider;
7118                 return (0);
7119         }
7120
7121         mutex_enter(&dtrace_provider_lock);
7122         mutex_enter(&dtrace_lock);
7123
7124         /*
7125          * If there is at least one provider registered, we'll add this
7126          * provider after the first provider.
7127          */
7128         if (dtrace_provider != NULL) {
7129                 provider->dtpv_next = dtrace_provider->dtpv_next;
7130                 dtrace_provider->dtpv_next = provider;
7131         } else {
7132                 dtrace_provider = provider;
7133         }
7134
7135         if (dtrace_retained != NULL) {
7136                 dtrace_enabling_provide(provider);
7137
7138                 /*
7139                  * Now we need to call dtrace_enabling_matchall() -- which
7140                  * will acquire cpu_lock and dtrace_lock.  We therefore need
7141                  * to drop all of our locks before calling into it...
7142                  */
7143                 mutex_exit(&dtrace_lock);
7144                 mutex_exit(&dtrace_provider_lock);
7145                 dtrace_enabling_matchall();
7146
7147                 return (0);
7148         }
7149
7150         mutex_exit(&dtrace_lock);
7151         mutex_exit(&dtrace_provider_lock);
7152
7153         return (0);
7154 }
7155
7156 /*
7157  * Unregister the specified provider from the DTrace framework.  This should
7158  * generally be called by DTrace providers in their detach(9E) entry point.
7159  */
7160 int
7161 dtrace_unregister(dtrace_provider_id_t id)
7162 {
7163         dtrace_provider_t *old = (dtrace_provider_t *)id;
7164         dtrace_provider_t *prev = NULL;
7165         int i, self = 0, noreap = 0;
7166         dtrace_probe_t *probe, *first = NULL;
7167
7168         if (old->dtpv_pops.dtps_enable ==
7169             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
7170                 /*
7171                  * If DTrace itself is the provider, we're called with locks
7172                  * already held.
7173                  */
7174                 ASSERT(old == dtrace_provider);
7175                 ASSERT(dtrace_devi != NULL);
7176                 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7177                 ASSERT(MUTEX_HELD(&dtrace_lock));
7178                 self = 1;
7179
7180                 if (dtrace_provider->dtpv_next != NULL) {
7181                         /*
7182                          * There's another provider here; return failure.
7183                          */
7184                         return (EBUSY);
7185                 }
7186         } else {
7187                 mutex_enter(&dtrace_provider_lock);
7188                 mutex_enter(&mod_lock);
7189                 mutex_enter(&dtrace_lock);
7190         }
7191
7192         /*
7193          * If anyone has /dev/dtrace open, or if there are anonymous enabled
7194          * probes, we refuse to let providers slither away, unless this
7195          * provider has already been explicitly invalidated.
7196          */
7197         if (!old->dtpv_defunct &&
7198             (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7199             dtrace_anon.dta_state->dts_necbs > 0))) {
7200                 if (!self) {
7201                         mutex_exit(&dtrace_lock);
7202                         mutex_exit(&mod_lock);
7203                         mutex_exit(&dtrace_provider_lock);
7204                 }
7205                 return (EBUSY);
7206         }
7207
7208         /*
7209          * Attempt to destroy the probes associated with this provider.
7210          */
7211         for (i = 0; i < dtrace_nprobes; i++) {
7212                 if ((probe = dtrace_probes[i]) == NULL)
7213                         continue;
7214
7215                 if (probe->dtpr_provider != old)
7216                         continue;
7217
7218                 if (probe->dtpr_ecb == NULL)
7219                         continue;
7220
7221                 /*
7222                  * If we are trying to unregister a defunct provider, and the
7223                  * provider was made defunct within the interval dictated by
7224                  * dtrace_unregister_defunct_reap, we'll (asynchronously)
7225                  * attempt to reap our enablings.  To denote that the provider
7226                  * should reattempt to unregister itself at some point in the
7227                  * future, we will return a differentiable error code (EAGAIN
7228                  * instead of EBUSY) in this case.
7229                  */
7230                 if (dtrace_gethrtime() - old->dtpv_defunct >
7231                     dtrace_unregister_defunct_reap)
7232                         noreap = 1;
7233
7234                 if (!self) {
7235                         mutex_exit(&dtrace_lock);
7236                         mutex_exit(&mod_lock);
7237                         mutex_exit(&dtrace_provider_lock);
7238                 }
7239
7240                 if (noreap)
7241                         return (EBUSY);
7242
7243                 (void) taskq_dispatch(dtrace_taskq,
7244                     (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
7245
7246                 return (EAGAIN);
7247         }
7248
7249         /*
7250          * All of the probes for this provider are disabled; we can safely
7251          * remove all of them from their hash chains and from the probe array.
7252          */
7253         for (i = 0; i < dtrace_nprobes; i++) {
7254                 if ((probe = dtrace_probes[i]) == NULL)
7255                         continue;
7256
7257                 if (probe->dtpr_provider != old)
7258                         continue;
7259
7260                 dtrace_probes[i] = NULL;
7261
7262                 dtrace_hash_remove(dtrace_bymod, probe);
7263                 dtrace_hash_remove(dtrace_byfunc, probe);
7264                 dtrace_hash_remove(dtrace_byname, probe);
7265
7266                 if (first == NULL) {
7267                         first = probe;
7268                         probe->dtpr_nextmod = NULL;
7269                 } else {
7270                         probe->dtpr_nextmod = first;
7271                         first = probe;
7272                 }
7273         }
7274
7275         /*
7276          * The provider's probes have been removed from the hash chains and
7277          * from the probe array.  Now issue a dtrace_sync() to be sure that
7278          * everyone has cleared out from any probe array processing.
7279          */
7280         dtrace_sync();
7281
7282         for (probe = first; probe != NULL; probe = first) {
7283                 first = probe->dtpr_nextmod;
7284
7285                 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7286                     probe->dtpr_arg);
7287                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7288                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7289                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7290                 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7291                 kmem_free(probe, sizeof (dtrace_probe_t));
7292         }
7293
7294         if ((prev = dtrace_provider) == old) {
7295                 ASSERT(self || dtrace_devi == NULL);
7296                 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7297                 dtrace_provider = old->dtpv_next;
7298         } else {
7299                 while (prev != NULL && prev->dtpv_next != old)
7300                         prev = prev->dtpv_next;
7301
7302                 if (prev == NULL) {
7303                         panic("attempt to unregister non-existent "
7304                             "dtrace provider %p\n", (void *)id);
7305                 }
7306
7307                 prev->dtpv_next = old->dtpv_next;
7308         }
7309
7310         if (!self) {
7311                 mutex_exit(&dtrace_lock);
7312                 mutex_exit(&mod_lock);
7313                 mutex_exit(&dtrace_provider_lock);
7314         }
7315
7316         kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7317         kmem_free(old, sizeof (dtrace_provider_t));
7318
7319         return (0);
7320 }
7321
7322 /*
7323  * Invalidate the specified provider.  All subsequent probe lookups for the
7324  * specified provider will fail, but its probes will not be removed.
7325  */
7326 void
7327 dtrace_invalidate(dtrace_provider_id_t id)
7328 {
7329         dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7330
7331         ASSERT(pvp->dtpv_pops.dtps_enable !=
7332             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7333
7334         mutex_enter(&dtrace_provider_lock);
7335         mutex_enter(&dtrace_lock);
7336
7337         pvp->dtpv_defunct = dtrace_gethrtime();
7338
7339         mutex_exit(&dtrace_lock);
7340         mutex_exit(&dtrace_provider_lock);
7341 }
7342
7343 /*
7344  * Indicate whether or not DTrace has attached.
7345  */
7346 int
7347 dtrace_attached(void)
7348 {
7349         /*
7350          * dtrace_provider will be non-NULL iff the DTrace driver has
7351          * attached.  (It's non-NULL because DTrace is always itself a
7352          * provider.)
7353          */
7354         return (dtrace_provider != NULL);
7355 }
7356
7357 /*
7358  * Remove all the unenabled probes for the given provider.  This function is
7359  * not unlike dtrace_unregister(), except that it doesn't remove the provider
7360  * -- just as many of its associated probes as it can.
7361  */
7362 int
7363 dtrace_condense(dtrace_provider_id_t id)
7364 {
7365         dtrace_provider_t *prov = (dtrace_provider_t *)id;
7366         int i;
7367         dtrace_probe_t *probe;
7368
7369         /*
7370          * Make sure this isn't the dtrace provider itself.
7371          */
7372         ASSERT(prov->dtpv_pops.dtps_enable !=
7373             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7374
7375         mutex_enter(&dtrace_provider_lock);
7376         mutex_enter(&dtrace_lock);
7377
7378         /*
7379          * Attempt to destroy the probes associated with this provider.
7380          */
7381         for (i = 0; i < dtrace_nprobes; i++) {
7382                 if ((probe = dtrace_probes[i]) == NULL)
7383                         continue;
7384
7385                 if (probe->dtpr_provider != prov)
7386                         continue;
7387
7388                 if (probe->dtpr_ecb != NULL)
7389                         continue;
7390
7391                 dtrace_probes[i] = NULL;
7392
7393                 dtrace_hash_remove(dtrace_bymod, probe);
7394                 dtrace_hash_remove(dtrace_byfunc, probe);
7395                 dtrace_hash_remove(dtrace_byname, probe);
7396
7397                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7398                     probe->dtpr_arg);
7399                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7400                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7401                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7402                 kmem_free(probe, sizeof (dtrace_probe_t));
7403                 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7404         }
7405
7406         mutex_exit(&dtrace_lock);
7407         mutex_exit(&dtrace_provider_lock);
7408
7409         return (0);
7410 }
7411
7412 /*
7413  * DTrace Probe Management Functions
7414  *
7415  * The functions in this section perform the DTrace probe management,
7416  * including functions to create probes, look-up probes, and call into the
7417  * providers to request that probes be provided.  Some of these functions are
7418  * in the Provider-to-Framework API; these functions can be identified by the
7419  * fact that they are not declared "static".
7420  */
7421
7422 /*
7423  * Create a probe with the specified module name, function name, and name.
7424  */
7425 dtrace_id_t
7426 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7427     const char *func, const char *name, int aframes, void *arg)
7428 {
7429         dtrace_probe_t *probe, **probes;
7430         dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7431         dtrace_id_t id;
7432
7433         if (provider == dtrace_provider) {
7434                 ASSERT(MUTEX_HELD(&dtrace_lock));
7435         } else {
7436                 mutex_enter(&dtrace_lock);
7437         }
7438
7439         id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7440             VM_BESTFIT | VM_SLEEP);
7441         probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
7442
7443         probe->dtpr_id = id;
7444         probe->dtpr_gen = dtrace_probegen++;
7445         probe->dtpr_mod = dtrace_strdup(mod);
7446         probe->dtpr_func = dtrace_strdup(func);
7447         probe->dtpr_name = dtrace_strdup(name);
7448         probe->dtpr_arg = arg;
7449         probe->dtpr_aframes = aframes;
7450         probe->dtpr_provider = provider;
7451
7452         dtrace_hash_add(dtrace_bymod, probe);
7453         dtrace_hash_add(dtrace_byfunc, probe);
7454         dtrace_hash_add(dtrace_byname, probe);
7455
7456         if (id - 1 >= dtrace_nprobes) {
7457                 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7458                 size_t nsize = osize << 1;
7459
7460                 if (nsize == 0) {
7461                         ASSERT(osize == 0);
7462                         ASSERT(dtrace_probes == NULL);
7463                         nsize = sizeof (dtrace_probe_t *);
7464                 }
7465
7466                 probes = kmem_zalloc(nsize, KM_SLEEP);
7467
7468                 if (dtrace_probes == NULL) {
7469                         ASSERT(osize == 0);
7470                         dtrace_probes = probes;
7471                         dtrace_nprobes = 1;
7472                 } else {
7473                         dtrace_probe_t **oprobes = dtrace_probes;
7474
7475                         bcopy(oprobes, probes, osize);
7476                         dtrace_membar_producer();
7477                         dtrace_probes = probes;
7478
7479                         dtrace_sync();
7480
7481                         /*
7482                          * All CPUs are now seeing the new probes array; we can
7483                          * safely free the old array.
7484                          */
7485                         kmem_free(oprobes, osize);
7486                         dtrace_nprobes <<= 1;
7487                 }
7488
7489                 ASSERT(id - 1 < dtrace_nprobes);
7490         }
7491
7492         ASSERT(dtrace_probes[id - 1] == NULL);
7493         dtrace_probes[id - 1] = probe;
7494
7495         if (provider != dtrace_provider)
7496                 mutex_exit(&dtrace_lock);
7497
7498         return (id);
7499 }
7500
7501 static dtrace_probe_t *
7502 dtrace_probe_lookup_id(dtrace_id_t id)
7503 {
7504         ASSERT(MUTEX_HELD(&dtrace_lock));
7505
7506         if (id == 0 || id > dtrace_nprobes)
7507                 return (NULL);
7508
7509         return (dtrace_probes[id - 1]);
7510 }
7511
7512 static int
7513 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7514 {
7515         *((dtrace_id_t *)arg) = probe->dtpr_id;
7516
7517         return (DTRACE_MATCH_DONE);
7518 }
7519
7520 /*
7521  * Look up a probe based on provider and one or more of module name, function
7522  * name and probe name.
7523  */
7524 dtrace_id_t
7525 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7526     const char *func, const char *name)
7527 {
7528         dtrace_probekey_t pkey;
7529         dtrace_id_t id;
7530         int match;
7531
7532         pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7533         pkey.dtpk_pmatch = &dtrace_match_string;
7534         pkey.dtpk_mod = mod;
7535         pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7536         pkey.dtpk_func = func;
7537         pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7538         pkey.dtpk_name = name;
7539         pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7540         pkey.dtpk_id = DTRACE_IDNONE;
7541
7542         mutex_enter(&dtrace_lock);
7543         match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7544             dtrace_probe_lookup_match, &id);
7545         mutex_exit(&dtrace_lock);
7546
7547         ASSERT(match == 1 || match == 0);
7548         return (match ? id : 0);
7549 }
7550
7551 /*
7552  * Returns the probe argument associated with the specified probe.
7553  */
7554 void *
7555 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7556 {
7557         dtrace_probe_t *probe;
7558         void *rval = NULL;
7559
7560         mutex_enter(&dtrace_lock);
7561
7562         if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7563             probe->dtpr_provider == (dtrace_provider_t *)id)
7564                 rval = probe->dtpr_arg;
7565
7566         mutex_exit(&dtrace_lock);
7567
7568         return (rval);
7569 }
7570
7571 /*
7572  * Copy a probe into a probe description.
7573  */
7574 static void
7575 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7576 {
7577         bzero(pdp, sizeof (dtrace_probedesc_t));
7578         pdp->dtpd_id = prp->dtpr_id;
7579
7580         (void) strncpy(pdp->dtpd_provider,
7581             prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
7582
7583         (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
7584         (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
7585         (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
7586 }
7587
7588 /*
7589  * Called to indicate that a probe -- or probes -- should be provided by a
7590  * specfied provider.  If the specified description is NULL, the provider will
7591  * be told to provide all of its probes.  (This is done whenever a new
7592  * consumer comes along, or whenever a retained enabling is to be matched.) If
7593  * the specified description is non-NULL, the provider is given the
7594  * opportunity to dynamically provide the specified probe, allowing providers
7595  * to support the creation of probes on-the-fly.  (So-called _autocreated_
7596  * probes.)  If the provider is NULL, the operations will be applied to all
7597  * providers; if the provider is non-NULL the operations will only be applied
7598  * to the specified provider.  The dtrace_provider_lock must be held, and the
7599  * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7600  * will need to grab the dtrace_lock when it reenters the framework through
7601  * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7602  */
7603 static void
7604 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7605 {
7606         struct modctl *ctl;
7607         int all = 0;
7608
7609         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
7610
7611         if (prv == NULL) {
7612                 all = 1;
7613                 prv = dtrace_provider;
7614         }
7615
7616         do {
7617                 /*
7618                  * First, call the blanket provide operation.
7619                  */
7620                 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7621
7622                 /*
7623                  * Now call the per-module provide operation.  We will grab
7624                  * mod_lock to prevent the list from being modified.  Note
7625                  * that this also prevents the mod_busy bits from changing.
7626                  * (mod_busy can only be changed with mod_lock held.)
7627                  */
7628                 mutex_enter(&mod_lock);
7629
7630                 ctl = &modules;
7631                 do {
7632                         if (ctl->mod_busy || ctl->mod_mp == NULL)
7633                                 continue;
7634
7635                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7636
7637                 } while ((ctl = ctl->mod_next) != &modules);
7638
7639                 mutex_exit(&mod_lock);
7640         } while (all && (prv = prv->dtpv_next) != NULL);
7641 }
7642
7643 /*
7644  * Iterate over each probe, and call the Framework-to-Provider API function
7645  * denoted by offs.
7646  */
7647 static void
7648 dtrace_probe_foreach(uintptr_t offs)
7649 {
7650         dtrace_provider_t *prov;
7651         void (*func)(void *, dtrace_id_t, void *);
7652         dtrace_probe_t *probe;
7653         dtrace_icookie_t cookie;
7654         int i;
7655
7656         /*
7657          * We disable interrupts to walk through the probe array.  This is
7658          * safe -- the dtrace_sync() in dtrace_unregister() assures that we
7659          * won't see stale data.
7660          */
7661         cookie = dtrace_interrupt_disable();
7662
7663         for (i = 0; i < dtrace_nprobes; i++) {
7664                 if ((probe = dtrace_probes[i]) == NULL)
7665                         continue;
7666
7667                 if (probe->dtpr_ecb == NULL) {
7668                         /*
7669                          * This probe isn't enabled -- don't call the function.
7670                          */
7671                         continue;
7672                 }
7673
7674                 prov = probe->dtpr_provider;
7675                 func = *((void(**)(void *, dtrace_id_t, void *))
7676                     ((uintptr_t)&prov->dtpv_pops + offs));
7677
7678                 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
7679         }
7680
7681         dtrace_interrupt_enable(cookie);
7682 }
7683
7684 static int
7685 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
7686 {
7687         dtrace_probekey_t pkey;
7688         uint32_t priv;
7689         uid_t uid;
7690         zoneid_t zoneid;
7691
7692         ASSERT(MUTEX_HELD(&dtrace_lock));
7693         dtrace_ecb_create_cache = NULL;
7694
7695         if (desc == NULL) {
7696                 /*
7697                  * If we're passed a NULL description, we're being asked to
7698                  * create an ECB with a NULL probe.
7699                  */
7700                 (void) dtrace_ecb_create_enable(NULL, enab);
7701                 return (0);
7702         }
7703
7704         dtrace_probekey(desc, &pkey);
7705         dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
7706             &priv, &uid, &zoneid);
7707
7708         return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
7709             enab));
7710 }
7711
7712 /*
7713  * DTrace Helper Provider Functions
7714  */
7715 static void
7716 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
7717 {
7718         attr->dtat_name = DOF_ATTR_NAME(dofattr);
7719         attr->dtat_data = DOF_ATTR_DATA(dofattr);
7720         attr->dtat_class = DOF_ATTR_CLASS(dofattr);
7721 }
7722
7723 static void
7724 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
7725     const dof_provider_t *dofprov, char *strtab)
7726 {
7727         hprov->dthpv_provname = strtab + dofprov->dofpv_name;
7728         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
7729             dofprov->dofpv_provattr);
7730         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
7731             dofprov->dofpv_modattr);
7732         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
7733             dofprov->dofpv_funcattr);
7734         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
7735             dofprov->dofpv_nameattr);
7736         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
7737             dofprov->dofpv_argsattr);
7738 }
7739
7740 static void
7741 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7742 {
7743         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7744         dof_hdr_t *dof = (dof_hdr_t *)daddr;
7745         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
7746         dof_provider_t *provider;
7747         dof_probe_t *probe;
7748         uint32_t *off, *enoff;
7749         uint8_t *arg;
7750         char *strtab;
7751         uint_t i, nprobes;
7752         dtrace_helper_provdesc_t dhpv;
7753         dtrace_helper_probedesc_t dhpb;
7754         dtrace_meta_t *meta = dtrace_meta_pid;
7755         dtrace_mops_t *mops = &meta->dtm_mops;
7756         void *parg;
7757
7758         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7759         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7760             provider->dofpv_strtab * dof->dofh_secsize);
7761         prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7762             provider->dofpv_probes * dof->dofh_secsize);
7763         arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7764             provider->dofpv_prargs * dof->dofh_secsize);
7765         off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7766             provider->dofpv_proffs * dof->dofh_secsize);
7767
7768         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7769         off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
7770         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
7771         enoff = NULL;
7772
7773         /*
7774          * See dtrace_helper_provider_validate().
7775          */
7776         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
7777             provider->dofpv_prenoffs != DOF_SECT_NONE) {
7778                 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7779                     provider->dofpv_prenoffs * dof->dofh_secsize);
7780                 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
7781         }
7782
7783         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
7784
7785         /*
7786          * Create the provider.
7787          */
7788         dtrace_dofprov2hprov(&dhpv, provider, strtab);
7789
7790         if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
7791                 return;
7792
7793         meta->dtm_count++;
7794
7795         /*
7796          * Create the probes.
7797          */
7798         for (i = 0; i < nprobes; i++) {
7799                 probe = (dof_probe_t *)(uintptr_t)(daddr +
7800                     prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
7801
7802                 dhpb.dthpb_mod = dhp->dofhp_mod;
7803                 dhpb.dthpb_func = strtab + probe->dofpr_func;
7804                 dhpb.dthpb_name = strtab + probe->dofpr_name;
7805                 dhpb.dthpb_base = probe->dofpr_addr;
7806                 dhpb.dthpb_offs = off + probe->dofpr_offidx;
7807                 dhpb.dthpb_noffs = probe->dofpr_noffs;
7808                 if (enoff != NULL) {
7809                         dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
7810                         dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
7811                 } else {
7812                         dhpb.dthpb_enoffs = NULL;
7813                         dhpb.dthpb_nenoffs = 0;
7814                 }
7815                 dhpb.dthpb_args = arg + probe->dofpr_argidx;
7816                 dhpb.dthpb_nargc = probe->dofpr_nargc;
7817                 dhpb.dthpb_xargc = probe->dofpr_xargc;
7818                 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
7819                 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
7820
7821                 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
7822         }
7823 }
7824
7825 static void
7826 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
7827 {
7828         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7829         dof_hdr_t *dof = (dof_hdr_t *)daddr;
7830         int i;
7831
7832         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
7833
7834         for (i = 0; i < dof->dofh_secnum; i++) {
7835                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
7836                     dof->dofh_secoff + i * dof->dofh_secsize);
7837
7838                 if (sec->dofs_type != DOF_SECT_PROVIDER)
7839                         continue;
7840
7841                 dtrace_helper_provide_one(dhp, sec, pid);
7842         }
7843
7844         /*
7845          * We may have just created probes, so we must now rematch against
7846          * any retained enablings.  Note that this call will acquire both
7847          * cpu_lock and dtrace_lock; the fact that we are holding
7848          * dtrace_meta_lock now is what defines the ordering with respect to
7849          * these three locks.
7850          */
7851         dtrace_enabling_matchall();
7852 }
7853
7854 static void
7855 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
7856 {
7857         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7858         dof_hdr_t *dof = (dof_hdr_t *)daddr;
7859         dof_sec_t *str_sec;
7860         dof_provider_t *provider;
7861         char *strtab;
7862         dtrace_helper_provdesc_t dhpv;
7863         dtrace_meta_t *meta = dtrace_meta_pid;
7864         dtrace_mops_t *mops = &meta->dtm_mops;
7865
7866         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
7867         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
7868             provider->dofpv_strtab * dof->dofh_secsize);
7869
7870         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
7871
7872         /*
7873          * Create the provider.
7874          */
7875         dtrace_dofprov2hprov(&dhpv, provider, strtab);
7876
7877         mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
7878
7879         meta->dtm_count--;
7880 }
7881
7882 static void
7883 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
7884 {
7885         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
7886         dof_hdr_t *dof = (dof_hdr_t *)daddr;
7887         int i;
7888
7889         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
7890
7891         for (i = 0; i < dof->dofh_secnum; i++) {
7892                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
7893                     dof->dofh_secoff + i * dof->dofh_secsize);
7894
7895                 if (sec->dofs_type != DOF_SECT_PROVIDER)
7896                         continue;
7897
7898                 dtrace_helper_provider_remove_one(dhp, sec, pid);
7899         }
7900 }
7901
7902 /*
7903  * DTrace Meta Provider-to-Framework API Functions
7904  *
7905  * These functions implement the Meta Provider-to-Framework API, as described
7906  * in <sys/dtrace.h>.
7907  */
7908 int
7909 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
7910     dtrace_meta_provider_id_t *idp)
7911 {
7912         dtrace_meta_t *meta;
7913         dtrace_helpers_t *help, *next;
7914         int i;
7915
7916         *idp = DTRACE_METAPROVNONE;
7917
7918         /*
7919          * We strictly don't need the name, but we hold onto it for
7920          * debuggability. All hail error queues!
7921          */
7922         if (name == NULL) {
7923                 cmn_err(CE_WARN, "failed to register meta-provider: "
7924                     "invalid name");
7925                 return (EINVAL);
7926         }
7927
7928         if (mops == NULL ||
7929             mops->dtms_create_probe == NULL ||
7930             mops->dtms_provide_pid == NULL ||
7931             mops->dtms_remove_pid == NULL) {
7932                 cmn_err(CE_WARN, "failed to register meta-register %s: "
7933                     "invalid ops", name);
7934                 return (EINVAL);
7935         }
7936
7937         meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
7938         meta->dtm_mops = *mops;
7939         meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
7940         (void) strcpy(meta->dtm_name, name);
7941         meta->dtm_arg = arg;
7942
7943         mutex_enter(&dtrace_meta_lock);
7944         mutex_enter(&dtrace_lock);
7945
7946         if (dtrace_meta_pid != NULL) {
7947                 mutex_exit(&dtrace_lock);
7948                 mutex_exit(&dtrace_meta_lock);
7949                 cmn_err(CE_WARN, "failed to register meta-register %s: "
7950                     "user-land meta-provider exists", name);
7951                 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
7952                 kmem_free(meta, sizeof (dtrace_meta_t));
7953                 return (EINVAL);
7954         }
7955
7956         dtrace_meta_pid = meta;
7957         *idp = (dtrace_meta_provider_id_t)meta;
7958
7959         /*
7960          * If there are providers and probes ready to go, pass them
7961          * off to the new meta provider now.
7962          */
7963
7964         help = dtrace_deferred_pid;
7965         dtrace_deferred_pid = NULL;
7966
7967         mutex_exit(&dtrace_lock);
7968
7969         while (help != NULL) {
7970                 for (i = 0; i < help->dthps_nprovs; i++) {
7971                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
7972                             help->dthps_pid);
7973                 }
7974
7975                 next = help->dthps_next;
7976                 help->dthps_next = NULL;
7977                 help->dthps_prev = NULL;
7978                 help->dthps_deferred = 0;
7979                 help = next;
7980         }
7981
7982         mutex_exit(&dtrace_meta_lock);
7983
7984         return (0);
7985 }
7986
7987 int
7988 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
7989 {
7990         dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
7991
7992         mutex_enter(&dtrace_meta_lock);
7993         mutex_enter(&dtrace_lock);
7994
7995         if (old == dtrace_meta_pid) {
7996                 pp = &dtrace_meta_pid;
7997         } else {
7998                 panic("attempt to unregister non-existent "
7999                     "dtrace meta-provider %p\n", (void *)old);
8000         }
8001
8002         if (old->dtm_count != 0) {
8003                 mutex_exit(&dtrace_lock);
8004                 mutex_exit(&dtrace_meta_lock);
8005                 return (EBUSY);
8006         }
8007
8008         *pp = NULL;
8009
8010         mutex_exit(&dtrace_lock);
8011         mutex_exit(&dtrace_meta_lock);
8012
8013         kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8014         kmem_free(old, sizeof (dtrace_meta_t));
8015
8016         return (0);
8017 }
8018
8019
8020 /*
8021  * DTrace DIF Object Functions
8022  */
8023 static int
8024 dtrace_difo_err(uint_t pc, const char *format, ...)
8025 {
8026         if (dtrace_err_verbose) {
8027                 va_list alist;
8028
8029                 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8030                 va_start(alist, format);
8031                 (void) vuprintf(format, alist);
8032                 va_end(alist);
8033         }
8034
8035 #ifdef DTRACE_ERRDEBUG
8036         dtrace_errdebug(format);
8037 #endif
8038         return (1);
8039 }
8040
8041 /*
8042  * Validate a DTrace DIF object by checking the IR instructions.  The following
8043  * rules are currently enforced by dtrace_difo_validate():
8044  *
8045  * 1. Each instruction must have a valid opcode
8046  * 2. Each register, string, variable, or subroutine reference must be valid
8047  * 3. No instruction can modify register %r0 (must be zero)
8048  * 4. All instruction reserved bits must be set to zero
8049  * 5. The last instruction must be a "ret" instruction
8050  * 6. All branch targets must reference a valid instruction _after_ the branch
8051  */
8052 static int
8053 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8054     cred_t *cr)
8055 {
8056         int err = 0, i;
8057         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8058         int kcheckload;
8059         uint_t pc;
8060
8061         kcheckload = cr == NULL ||
8062             (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8063
8064         dp->dtdo_destructive = 0;
8065
8066         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8067                 dif_instr_t instr = dp->dtdo_buf[pc];
8068
8069                 uint_t r1 = DIF_INSTR_R1(instr);
8070                 uint_t r2 = DIF_INSTR_R2(instr);
8071                 uint_t rd = DIF_INSTR_RD(instr);
8072                 uint_t rs = DIF_INSTR_RS(instr);
8073                 uint_t label = DIF_INSTR_LABEL(instr);
8074                 uint_t v = DIF_INSTR_VAR(instr);
8075                 uint_t subr = DIF_INSTR_SUBR(instr);
8076                 uint_t type = DIF_INSTR_TYPE(instr);
8077                 uint_t op = DIF_INSTR_OP(instr);
8078
8079                 switch (op) {
8080                 case DIF_OP_OR:
8081                 case DIF_OP_XOR:
8082                 case DIF_OP_AND:
8083                 case DIF_OP_SLL:
8084                 case DIF_OP_SRL:
8085                 case DIF_OP_SRA:
8086                 case DIF_OP_SUB:
8087                 case DIF_OP_ADD:
8088                 case DIF_OP_MUL:
8089                 case DIF_OP_SDIV:
8090                 case DIF_OP_UDIV:
8091                 case DIF_OP_SREM:
8092                 case DIF_OP_UREM:
8093                 case DIF_OP_COPYS:
8094                         if (r1 >= nregs)
8095                                 err += efunc(pc, "invalid register %u\n", r1);
8096                         if (r2 >= nregs)
8097                                 err += efunc(pc, "invalid register %u\n", r2);
8098                         if (rd >= nregs)
8099                                 err += efunc(pc, "invalid register %u\n", rd);
8100                         if (rd == 0)
8101                                 err += efunc(pc, "cannot write to %r0\n");
8102                         break;
8103                 case DIF_OP_NOT:
8104                 case DIF_OP_MOV:
8105                 case DIF_OP_ALLOCS:
8106                         if (r1 >= nregs)
8107                                 err += efunc(pc, "invalid register %u\n", r1);
8108                         if (r2 != 0)
8109                                 err += efunc(pc, "non-zero reserved bits\n");
8110                         if (rd >= nregs)
8111                                 err += efunc(pc, "invalid register %u\n", rd);
8112                         if (rd == 0)
8113                                 err += efunc(pc, "cannot write to %r0\n");
8114                         break;
8115                 case DIF_OP_LDSB:
8116                 case DIF_OP_LDSH:
8117                 case DIF_OP_LDSW:
8118                 case DIF_OP_LDUB:
8119                 case DIF_OP_LDUH:
8120                 case DIF_OP_LDUW:
8121                 case DIF_OP_LDX:
8122                         if (r1 >= nregs)
8123                                 err += efunc(pc, "invalid register %u\n", r1);
8124                         if (r2 != 0)
8125                                 err += efunc(pc, "non-zero reserved bits\n");
8126                         if (rd >= nregs)
8127                                 err += efunc(pc, "invalid register %u\n", rd);
8128                         if (rd == 0)
8129                                 err += efunc(pc, "cannot write to %r0\n");
8130                         if (kcheckload)
8131                                 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8132                                     DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8133                         break;
8134                 case DIF_OP_RLDSB:
8135                 case DIF_OP_RLDSH:
8136                 case DIF_OP_RLDSW:
8137                 case DIF_OP_RLDUB:
8138                 case DIF_OP_RLDUH:
8139                 case DIF_OP_RLDUW:
8140                 case DIF_OP_RLDX:
8141                         if (r1 >= nregs)
8142                                 err += efunc(pc, "invalid register %u\n", r1);
8143                         if (r2 != 0)
8144                                 err += efunc(pc, "non-zero reserved bits\n");
8145                         if (rd >= nregs)
8146                                 err += efunc(pc, "invalid register %u\n", rd);
8147                         if (rd == 0)
8148                                 err += efunc(pc, "cannot write to %r0\n");
8149                         break;
8150                 case DIF_OP_ULDSB:
8151                 case DIF_OP_ULDSH:
8152                 case DIF_OP_ULDSW:
8153                 case DIF_OP_ULDUB:
8154                 case DIF_OP_ULDUH:
8155                 case DIF_OP_ULDUW:
8156                 case DIF_OP_ULDX:
8157                         if (r1 >= nregs)
8158                                 err += efunc(pc, "invalid register %u\n", r1);
8159                         if (r2 != 0)
8160                                 err += efunc(pc, "non-zero reserved bits\n");
8161                         if (rd >= nregs)
8162                                 err += efunc(pc, "invalid register %u\n", rd);
8163                         if (rd == 0)
8164                                 err += efunc(pc, "cannot write to %r0\n");
8165                         break;
8166                 case DIF_OP_STB:
8167                 case DIF_OP_STH:
8168                 case DIF_OP_STW:
8169                 case DIF_OP_STX:
8170                         if (r1 >= nregs)
8171                                 err += efunc(pc, "invalid register %u\n", r1);
8172                         if (r2 != 0)
8173                                 err += efunc(pc, "non-zero reserved bits\n");
8174                         if (rd >= nregs)
8175                                 err += efunc(pc, "invalid register %u\n", rd);
8176                         if (rd == 0)
8177                                 err += efunc(pc, "cannot write to 0 address\n");
8178                         break;
8179                 case DIF_OP_CMP:
8180                 case DIF_OP_SCMP:
8181                         if (r1 >= nregs)
8182                                 err += efunc(pc, "invalid register %u\n", r1);
8183                         if (r2 >= nregs)
8184                                 err += efunc(pc, "invalid register %u\n", r2);
8185                         if (rd != 0)
8186                                 err += efunc(pc, "non-zero reserved bits\n");
8187                         break;
8188                 case DIF_OP_TST:
8189                         if (r1 >= nregs)
8190                                 err += efunc(pc, "invalid register %u\n", r1);
8191                         if (r2 != 0 || rd != 0)
8192                                 err += efunc(pc, "non-zero reserved bits\n");
8193                         break;
8194                 case DIF_OP_BA:
8195                 case DIF_OP_BE:
8196                 case DIF_OP_BNE:
8197                 case DIF_OP_BG:
8198                 case DIF_OP_BGU:
8199                 case DIF_OP_BGE:
8200                 case DIF_OP_BGEU:
8201                 case DIF_OP_BL:
8202                 case DIF_OP_BLU:
8203                 case DIF_OP_BLE:
8204                 case DIF_OP_BLEU:
8205                         if (label >= dp->dtdo_len) {
8206                                 err += efunc(pc, "invalid branch target %u\n",
8207                                     label);
8208                         }
8209                         if (label <= pc) {
8210                                 err += efunc(pc, "backward branch to %u\n",
8211                                     label);
8212                         }
8213                         break;
8214                 case DIF_OP_RET:
8215                         if (r1 != 0 || r2 != 0)
8216                                 err += efunc(pc, "non-zero reserved bits\n");
8217                         if (rd >= nregs)
8218                                 err += efunc(pc, "invalid register %u\n", rd);
8219                         break;
8220                 case DIF_OP_NOP:
8221                 case DIF_OP_POPTS:
8222                 case DIF_OP_FLUSHTS:
8223                         if (r1 != 0 || r2 != 0 || rd != 0)
8224                                 err += efunc(pc, "non-zero reserved bits\n");
8225                         break;
8226                 case DIF_OP_SETX:
8227                         if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8228                                 err += efunc(pc, "invalid integer ref %u\n",
8229                                     DIF_INSTR_INTEGER(instr));
8230                         }
8231                         if (rd >= nregs)
8232                                 err += efunc(pc, "invalid register %u\n", rd);
8233                         if (rd == 0)
8234                                 err += efunc(pc, "cannot write to %r0\n");
8235                         break;
8236                 case DIF_OP_SETS:
8237                         if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8238                                 err += efunc(pc, "invalid string ref %u\n",
8239                                     DIF_INSTR_STRING(instr));
8240                         }
8241                         if (rd >= nregs)
8242                                 err += efunc(pc, "invalid register %u\n", rd);
8243                         if (rd == 0)
8244                                 err += efunc(pc, "cannot write to %r0\n");
8245                         break;
8246                 case DIF_OP_LDGA:
8247                 case DIF_OP_LDTA:
8248                         if (r1 > DIF_VAR_ARRAY_MAX)
8249                                 err += efunc(pc, "invalid array %u\n", r1);
8250                         if (r2 >= nregs)
8251                                 err += efunc(pc, "invalid register %u\n", r2);
8252                         if (rd >= nregs)
8253                                 err += efunc(pc, "invalid register %u\n", rd);
8254                         if (rd == 0)
8255                                 err += efunc(pc, "cannot write to %r0\n");
8256                         break;
8257                 case DIF_OP_LDGS:
8258                 case DIF_OP_LDTS:
8259                 case DIF_OP_LDLS:
8260                 case DIF_OP_LDGAA:
8261                 case DIF_OP_LDTAA:
8262                         if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8263                                 err += efunc(pc, "invalid variable %u\n", v);
8264                         if (rd >= nregs)
8265                                 err += efunc(pc, "invalid register %u\n", rd);
8266                         if (rd == 0)
8267                                 err += efunc(pc, "cannot write to %r0\n");
8268                         break;
8269                 case DIF_OP_STGS:
8270                 case DIF_OP_STTS:
8271                 case DIF_OP_STLS:
8272                 case DIF_OP_STGAA:
8273                 case DIF_OP_STTAA:
8274                         if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8275                                 err += efunc(pc, "invalid variable %u\n", v);
8276                         if (rs >= nregs)
8277                                 err += efunc(pc, "invalid register %u\n", rd);
8278                         break;
8279                 case DIF_OP_CALL:
8280                         if (subr > DIF_SUBR_MAX)
8281                                 err += efunc(pc, "invalid subr %u\n", subr);
8282                         if (rd >= nregs)
8283                                 err += efunc(pc, "invalid register %u\n", rd);
8284                         if (rd == 0)
8285                                 err += efunc(pc, "cannot write to %r0\n");
8286
8287                         if (subr == DIF_SUBR_COPYOUT ||
8288                             subr == DIF_SUBR_COPYOUTSTR) {
8289                                 dp->dtdo_destructive = 1;
8290                         }
8291                         break;
8292                 case DIF_OP_PUSHTR:
8293                         if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8294                                 err += efunc(pc, "invalid ref type %u\n", type);
8295                         if (r2 >= nregs)
8296                                 err += efunc(pc, "invalid register %u\n", r2);
8297                         if (rs >= nregs)
8298                                 err += efunc(pc, "invalid register %u\n", rs);
8299                         break;
8300                 case DIF_OP_PUSHTV:
8301                         if (type != DIF_TYPE_CTF)
8302                                 err += efunc(pc, "invalid val type %u\n", type);
8303                         if (r2 >= nregs)
8304                                 err += efunc(pc, "invalid register %u\n", r2);
8305                         if (rs >= nregs)
8306                                 err += efunc(pc, "invalid register %u\n", rs);
8307                         break;
8308                 default:
8309                         err += efunc(pc, "invalid opcode %u\n",
8310                             DIF_INSTR_OP(instr));
8311                 }
8312         }
8313
8314         if (dp->dtdo_len != 0 &&
8315             DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8316                 err += efunc(dp->dtdo_len - 1,
8317                     "expected 'ret' as last DIF instruction\n");
8318         }
8319
8320         if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8321                 /*
8322                  * If we're not returning by reference, the size must be either
8323                  * 0 or the size of one of the base types.
8324                  */
8325                 switch (dp->dtdo_rtype.dtdt_size) {
8326                 case 0:
8327                 case sizeof (uint8_t):
8328                 case sizeof (uint16_t):
8329                 case sizeof (uint32_t):
8330                 case sizeof (uint64_t):
8331                         break;
8332
8333                 default:
8334                         err += efunc(dp->dtdo_len - 1, "bad return size\n");
8335                 }
8336         }
8337
8338         for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8339                 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8340                 dtrace_diftype_t *vt, *et;
8341                 uint_t id, ndx;
8342
8343                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8344                     v->dtdv_scope != DIFV_SCOPE_THREAD &&
8345                     v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8346                         err += efunc(i, "unrecognized variable scope %d\n",
8347                             v->dtdv_scope);
8348                         break;
8349                 }
8350
8351                 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8352                     v->dtdv_kind != DIFV_KIND_SCALAR) {
8353                         err += efunc(i, "unrecognized variable type %d\n",
8354                             v->dtdv_kind);
8355                         break;
8356                 }
8357
8358                 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8359                         err += efunc(i, "%d exceeds variable id limit\n", id);
8360                         break;
8361                 }
8362
8363                 if (id < DIF_VAR_OTHER_UBASE)
8364                         continue;
8365
8366                 /*
8367                  * For user-defined variables, we need to check that this
8368                  * definition is identical to any previous definition that we
8369                  * encountered.
8370                  */
8371                 ndx = id - DIF_VAR_OTHER_UBASE;
8372
8373                 switch (v->dtdv_scope) {
8374                 case DIFV_SCOPE_GLOBAL:
8375                         if (ndx < vstate->dtvs_nglobals) {
8376                                 dtrace_statvar_t *svar;
8377
8378                                 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8379                                         existing = &svar->dtsv_var;
8380                         }
8381
8382                         break;
8383
8384                 case DIFV_SCOPE_THREAD:
8385                         if (ndx < vstate->dtvs_ntlocals)
8386                                 existing = &vstate->dtvs_tlocals[ndx];
8387                         break;
8388
8389                 case DIFV_SCOPE_LOCAL:
8390                         if (ndx < vstate->dtvs_nlocals) {
8391                                 dtrace_statvar_t *svar;
8392
8393                                 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8394                                         existing = &svar->dtsv_var;
8395                         }
8396
8397                         break;
8398                 }
8399
8400                 vt = &v->dtdv_type;
8401
8402                 if (vt->dtdt_flags & DIF_TF_BYREF) {
8403                         if (vt->dtdt_size == 0) {
8404                                 err += efunc(i, "zero-sized variable\n");
8405                                 break;
8406                         }
8407
8408                         if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8409                             vt->dtdt_size > dtrace_global_maxsize) {
8410                                 err += efunc(i, "oversized by-ref global\n");
8411                                 break;
8412                         }
8413                 }
8414
8415                 if (existing == NULL || existing->dtdv_id == 0)
8416                         continue;
8417
8418                 ASSERT(existing->dtdv_id == v->dtdv_id);
8419                 ASSERT(existing->dtdv_scope == v->dtdv_scope);
8420
8421                 if (existing->dtdv_kind != v->dtdv_kind)
8422                         err += efunc(i, "%d changed variable kind\n", id);
8423
8424                 et = &existing->dtdv_type;
8425
8426                 if (vt->dtdt_flags != et->dtdt_flags) {
8427                         err += efunc(i, "%d changed variable type flags\n", id);
8428                         break;
8429                 }
8430
8431                 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8432                         err += efunc(i, "%d changed variable type size\n", id);
8433                         break;
8434                 }
8435         }
8436
8437         return (err);
8438 }
8439
8440 /*
8441  * Validate a DTrace DIF object that it is to be used as a helper.  Helpers
8442  * are much more constrained than normal DIFOs.  Specifically, they may
8443  * not:
8444  *
8445  * 1. Make calls to subroutines other than copyin(), copyinstr() or
8446  *    miscellaneous string routines
8447  * 2. Access DTrace variables other than the args[] array, and the
8448  *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
8449  * 3. Have thread-local variables.
8450  * 4. Have dynamic variables.
8451  */
8452 static int
8453 dtrace_difo_validate_helper(dtrace_difo_t *dp)
8454 {
8455         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8456         int err = 0;
8457         uint_t pc;
8458
8459         for (pc = 0; pc < dp->dtdo_len; pc++) {
8460                 dif_instr_t instr = dp->dtdo_buf[pc];
8461
8462                 uint_t v = DIF_INSTR_VAR(instr);
8463                 uint_t subr = DIF_INSTR_SUBR(instr);
8464                 uint_t op = DIF_INSTR_OP(instr);
8465
8466                 switch (op) {
8467                 case DIF_OP_OR:
8468                 case DIF_OP_XOR:
8469                 case DIF_OP_AND:
8470                 case DIF_OP_SLL:
8471                 case DIF_OP_SRL:
8472                 case DIF_OP_SRA:
8473                 case DIF_OP_SUB:
8474                 case DIF_OP_ADD:
8475                 case DIF_OP_MUL:
8476                 case DIF_OP_SDIV:
8477                 case DIF_OP_UDIV:
8478                 case DIF_OP_SREM:
8479                 case DIF_OP_UREM:
8480                 case DIF_OP_COPYS:
8481                 case DIF_OP_NOT:
8482                 case DIF_OP_MOV:
8483                 case DIF_OP_RLDSB:
8484                 case DIF_OP_RLDSH:
8485                 case DIF_OP_RLDSW:
8486                 case DIF_OP_RLDUB:
8487                 case DIF_OP_RLDUH:
8488                 case DIF_OP_RLDUW:
8489                 case DIF_OP_RLDX:
8490                 case DIF_OP_ULDSB:
8491                 case DIF_OP_ULDSH:
8492                 case DIF_OP_ULDSW:
8493                 case DIF_OP_ULDUB:
8494                 case DIF_OP_ULDUH:
8495                 case DIF_OP_ULDUW:
8496                 case DIF_OP_ULDX:
8497                 case DIF_OP_STB:
8498                 case DIF_OP_STH:
8499                 case DIF_OP_STW:
8500                 case DIF_OP_STX:
8501                 case DIF_OP_ALLOCS:
8502                 case DIF_OP_CMP:
8503                 case DIF_OP_SCMP:
8504                 case DIF_OP_TST:
8505                 case DIF_OP_BA:
8506                 case DIF_OP_BE:
8507                 case DIF_OP_BNE:
8508                 case DIF_OP_BG:
8509                 case DIF_OP_BGU:
8510                 case DIF_OP_BGE:
8511                 case DIF_OP_BGEU:
8512                 case DIF_OP_BL:
8513                 case DIF_OP_BLU:
8514                 case DIF_OP_BLE:
8515                 case DIF_OP_BLEU:
8516                 case DIF_OP_RET:
8517                 case DIF_OP_NOP:
8518                 case DIF_OP_POPTS:
8519                 case DIF_OP_FLUSHTS:
8520                 case DIF_OP_SETX:
8521                 case DIF_OP_SETS:
8522                 case DIF_OP_LDGA:
8523                 case DIF_OP_LDLS:
8524                 case DIF_OP_STGS:
8525                 case DIF_OP_STLS:
8526                 case DIF_OP_PUSHTR:
8527                 case DIF_OP_PUSHTV:
8528                         break;
8529
8530                 case DIF_OP_LDGS:
8531                         if (v >= DIF_VAR_OTHER_UBASE)
8532                                 break;
8533
8534                         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
8535                                 break;
8536
8537                         if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
8538                             v == DIF_VAR_PPID || v == DIF_VAR_TID ||
8539                             v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
8540                             v == DIF_VAR_UID || v == DIF_VAR_GID)
8541                                 break;
8542
8543                         err += efunc(pc, "illegal variable %u\n", v);
8544                         break;
8545
8546                 case DIF_OP_LDTA:
8547                 case DIF_OP_LDTS:
8548                 case DIF_OP_LDGAA:
8549                 case DIF_OP_LDTAA:
8550                         err += efunc(pc, "illegal dynamic variable load\n");
8551                         break;
8552
8553                 case DIF_OP_STTS:
8554                 case DIF_OP_STGAA:
8555                 case DIF_OP_STTAA:
8556                         err += efunc(pc, "illegal dynamic variable store\n");
8557                         break;
8558
8559                 case DIF_OP_CALL:
8560                         if (subr == DIF_SUBR_ALLOCA ||
8561                             subr == DIF_SUBR_BCOPY ||
8562                             subr == DIF_SUBR_COPYIN ||
8563                             subr == DIF_SUBR_COPYINTO ||
8564                             subr == DIF_SUBR_COPYINSTR ||
8565                             subr == DIF_SUBR_INDEX ||
8566                             subr == DIF_SUBR_INET_NTOA ||
8567                             subr == DIF_SUBR_INET_NTOA6 ||
8568                             subr == DIF_SUBR_INET_NTOP ||
8569                             subr == DIF_SUBR_LLTOSTR ||
8570                             subr == DIF_SUBR_RINDEX ||
8571                             subr == DIF_SUBR_STRCHR ||
8572                             subr == DIF_SUBR_STRJOIN ||
8573                             subr == DIF_SUBR_STRRCHR ||
8574                             subr == DIF_SUBR_STRSTR ||
8575                             subr == DIF_SUBR_HTONS ||
8576                             subr == DIF_SUBR_HTONL ||
8577                             subr == DIF_SUBR_HTONLL ||
8578                             subr == DIF_SUBR_NTOHS ||
8579                             subr == DIF_SUBR_NTOHL ||
8580                             subr == DIF_SUBR_NTOHLL)
8581                                 break;
8582
8583                         err += efunc(pc, "invalid subr %u\n", subr);
8584                         break;
8585
8586                 default:
8587                         err += efunc(pc, "invalid opcode %u\n",
8588                             DIF_INSTR_OP(instr));
8589                 }
8590         }
8591
8592         return (err);
8593 }
8594
8595 /*
8596  * Returns 1 if the expression in the DIF object can be cached on a per-thread
8597  * basis; 0 if not.
8598  */
8599 static int
8600 dtrace_difo_cacheable(dtrace_difo_t *dp)
8601 {
8602         int i;
8603
8604         if (dp == NULL)
8605                 return (0);
8606
8607         for (i = 0; i < dp->dtdo_varlen; i++) {
8608                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8609
8610                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
8611                         continue;
8612
8613                 switch (v->dtdv_id) {
8614                 case DIF_VAR_CURTHREAD:
8615                 case DIF_VAR_PID:
8616                 case DIF_VAR_TID:
8617                 case DIF_VAR_EXECNAME:
8618                 case DIF_VAR_ZONENAME:
8619                         break;
8620
8621                 default:
8622                         return (0);
8623                 }
8624         }
8625
8626         /*
8627          * This DIF object may be cacheable.  Now we need to look for any
8628          * array loading instructions, any memory loading instructions, or
8629          * any stores to thread-local variables.
8630          */
8631         for (i = 0; i < dp->dtdo_len; i++) {
8632                 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
8633
8634                 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
8635                     (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
8636                     (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
8637                     op == DIF_OP_LDGA || op == DIF_OP_STTS)
8638                         return (0);
8639         }
8640
8641         return (1);
8642 }
8643
8644 static void
8645 dtrace_difo_hold(dtrace_difo_t *dp)
8646 {
8647         int i;
8648
8649         ASSERT(MUTEX_HELD(&dtrace_lock));
8650
8651         dp->dtdo_refcnt++;
8652         ASSERT(dp->dtdo_refcnt != 0);
8653
8654         /*
8655          * We need to check this DIF object for references to the variable
8656          * DIF_VAR_VTIMESTAMP.
8657          */
8658         for (i = 0; i < dp->dtdo_varlen; i++) {
8659                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8660
8661                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8662                         continue;
8663
8664                 if (dtrace_vtime_references++ == 0)
8665                         dtrace_vtime_enable();
8666         }
8667 }
8668
8669 /*
8670  * This routine calculates the dynamic variable chunksize for a given DIF
8671  * object.  The calculation is not fool-proof, and can probably be tricked by
8672  * malicious DIF -- but it works for all compiler-generated DIF.  Because this
8673  * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
8674  * if a dynamic variable size exceeds the chunksize.
8675  */
8676 static void
8677 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8678 {
8679         uint64_t sval;
8680         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
8681         const dif_instr_t *text = dp->dtdo_buf;
8682         uint_t pc, srd = 0;
8683         uint_t ttop = 0;
8684         size_t size, ksize;
8685         uint_t id, i;
8686
8687         for (pc = 0; pc < dp->dtdo_len; pc++) {
8688                 dif_instr_t instr = text[pc];
8689                 uint_t op = DIF_INSTR_OP(instr);
8690                 uint_t rd = DIF_INSTR_RD(instr);
8691                 uint_t r1 = DIF_INSTR_R1(instr);
8692                 uint_t nkeys = 0;
8693                 uchar_t scope;
8694
8695                 dtrace_key_t *key = tupregs;
8696
8697                 switch (op) {
8698                 case DIF_OP_SETX:
8699                         sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
8700                         srd = rd;
8701                         continue;
8702
8703                 case DIF_OP_STTS:
8704                         key = &tupregs[DIF_DTR_NREGS];
8705                         key[0].dttk_size = 0;
8706                         key[1].dttk_size = 0;
8707                         nkeys = 2;
8708                         scope = DIFV_SCOPE_THREAD;
8709                         break;
8710
8711                 case DIF_OP_STGAA:
8712                 case DIF_OP_STTAA:
8713                         nkeys = ttop;
8714
8715                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
8716                                 key[nkeys++].dttk_size = 0;
8717
8718                         key[nkeys++].dttk_size = 0;
8719
8720                         if (op == DIF_OP_STTAA) {
8721                                 scope = DIFV_SCOPE_THREAD;
8722                         } else {
8723                                 scope = DIFV_SCOPE_GLOBAL;
8724                         }
8725
8726                         break;
8727
8728                 case DIF_OP_PUSHTR:
8729                         if (ttop == DIF_DTR_NREGS)
8730                                 return;
8731
8732                         if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
8733                                 /*
8734                                  * If the register for the size of the "pushtr"
8735                                  * is %r0 (or the value is 0) and the type is
8736                                  * a string, we'll use the system-wide default
8737                                  * string size.
8738                                  */
8739                                 tupregs[ttop++].dttk_size =
8740                                     dtrace_strsize_default;
8741                         } else {
8742                                 if (srd == 0)
8743                                         return;
8744
8745                                 tupregs[ttop++].dttk_size = sval;
8746                         }
8747
8748                         break;
8749
8750                 case DIF_OP_PUSHTV:
8751                         if (ttop == DIF_DTR_NREGS)
8752                                 return;
8753
8754                         tupregs[ttop++].dttk_size = 0;
8755                         break;
8756
8757                 case DIF_OP_FLUSHTS:
8758                         ttop = 0;
8759                         break;
8760
8761                 case DIF_OP_POPTS:
8762                         if (ttop != 0)
8763                                 ttop--;
8764                         break;
8765                 }
8766
8767                 sval = 0;
8768                 srd = 0;
8769
8770                 if (nkeys == 0)
8771                         continue;
8772
8773                 /*
8774                  * We have a dynamic variable allocation; calculate its size.
8775                  */
8776                 for (ksize = 0, i = 0; i < nkeys; i++)
8777                         ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
8778
8779                 size = sizeof (dtrace_dynvar_t);
8780                 size += sizeof (dtrace_key_t) * (nkeys - 1);
8781                 size += ksize;
8782
8783                 /*
8784                  * Now we need to determine the size of the stored data.
8785                  */
8786                 id = DIF_INSTR_VAR(instr);
8787
8788                 for (i = 0; i < dp->dtdo_varlen; i++) {
8789                         dtrace_difv_t *v = &dp->dtdo_vartab[i];
8790
8791                         if (v->dtdv_id == id && v->dtdv_scope == scope) {
8792                                 size += v->dtdv_type.dtdt_size;
8793                                 break;
8794                         }
8795                 }
8796
8797                 if (i == dp->dtdo_varlen)
8798                         return;
8799
8800                 /*
8801                  * We have the size.  If this is larger than the chunk size
8802                  * for our dynamic variable state, reset the chunk size.
8803                  */
8804                 size = P2ROUNDUP(size, sizeof (uint64_t));
8805
8806                 if (size > vstate->dtvs_dynvars.dtds_chunksize)
8807                         vstate->dtvs_dynvars.dtds_chunksize = size;
8808         }
8809 }
8810
8811 static void
8812 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8813 {
8814         int i, oldsvars, osz, nsz, otlocals, ntlocals;
8815         uint_t id;
8816
8817         ASSERT(MUTEX_HELD(&dtrace_lock));
8818         ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
8819
8820         for (i = 0; i < dp->dtdo_varlen; i++) {
8821                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8822                 dtrace_statvar_t *svar, ***svarp;
8823                 size_t dsize = 0;
8824                 uint8_t scope = v->dtdv_scope;
8825                 int *np;
8826
8827                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
8828                         continue;
8829
8830                 id -= DIF_VAR_OTHER_UBASE;
8831
8832                 switch (scope) {
8833                 case DIFV_SCOPE_THREAD:
8834                         while (id >= (otlocals = vstate->dtvs_ntlocals)) {
8835                                 dtrace_difv_t *tlocals;
8836
8837                                 if ((ntlocals = (otlocals << 1)) == 0)
8838                                         ntlocals = 1;
8839
8840                                 osz = otlocals * sizeof (dtrace_difv_t);
8841                                 nsz = ntlocals * sizeof (dtrace_difv_t);
8842
8843                                 tlocals = kmem_zalloc(nsz, KM_SLEEP);
8844
8845                                 if (osz != 0) {
8846                                         bcopy(vstate->dtvs_tlocals,
8847                                             tlocals, osz);
8848                                         kmem_free(vstate->dtvs_tlocals, osz);
8849                                 }
8850
8851                                 vstate->dtvs_tlocals = tlocals;
8852                                 vstate->dtvs_ntlocals = ntlocals;
8853                         }
8854
8855                         vstate->dtvs_tlocals[id] = *v;
8856                         continue;
8857
8858                 case DIFV_SCOPE_LOCAL:
8859                         np = &vstate->dtvs_nlocals;
8860                         svarp = &vstate->dtvs_locals;
8861
8862                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8863                                 dsize = NCPU * (v->dtdv_type.dtdt_size +
8864                                     sizeof (uint64_t));
8865                         else
8866                                 dsize = NCPU * sizeof (uint64_t);
8867
8868                         break;
8869
8870                 case DIFV_SCOPE_GLOBAL:
8871                         np = &vstate->dtvs_nglobals;
8872                         svarp = &vstate->dtvs_globals;
8873
8874                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
8875                                 dsize = v->dtdv_type.dtdt_size +
8876                                     sizeof (uint64_t);
8877
8878                         break;
8879
8880                 default:
8881                         ASSERT(0);
8882                 }
8883
8884                 while (id >= (oldsvars = *np)) {
8885                         dtrace_statvar_t **statics;
8886                         int newsvars, oldsize, newsize;
8887
8888                         if ((newsvars = (oldsvars << 1)) == 0)
8889                                 newsvars = 1;
8890
8891                         oldsize = oldsvars * sizeof (dtrace_statvar_t *);
8892                         newsize = newsvars * sizeof (dtrace_statvar_t *);
8893
8894                         statics = kmem_zalloc(newsize, KM_SLEEP);
8895
8896                         if (oldsize != 0) {
8897                                 bcopy(*svarp, statics, oldsize);
8898                                 kmem_free(*svarp, oldsize);
8899                         }
8900
8901                         *svarp = statics;
8902                         *np = newsvars;
8903                 }
8904
8905                 if ((svar = (*svarp)[id]) == NULL) {
8906                         svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
8907                         svar->dtsv_var = *v;
8908
8909                         if ((svar->dtsv_size = dsize) != 0) {
8910                                 svar->dtsv_data = (uint64_t)(uintptr_t)
8911                                     kmem_zalloc(dsize, KM_SLEEP);
8912                         }
8913
8914                         (*svarp)[id] = svar;
8915                 }
8916
8917                 svar->dtsv_refcnt++;
8918         }
8919
8920         dtrace_difo_chunksize(dp, vstate);
8921         dtrace_difo_hold(dp);
8922 }
8923
8924 static dtrace_difo_t *
8925 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8926 {
8927         dtrace_difo_t *new;
8928         size_t sz;
8929
8930         ASSERT(dp->dtdo_buf != NULL);
8931         ASSERT(dp->dtdo_refcnt != 0);
8932
8933         new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
8934
8935         ASSERT(dp->dtdo_buf != NULL);
8936         sz = dp->dtdo_len * sizeof (dif_instr_t);
8937         new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
8938         bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
8939         new->dtdo_len = dp->dtdo_len;
8940
8941         if (dp->dtdo_strtab != NULL) {
8942                 ASSERT(dp->dtdo_strlen != 0);
8943                 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
8944                 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
8945                 new->dtdo_strlen = dp->dtdo_strlen;
8946         }
8947
8948         if (dp->dtdo_inttab != NULL) {
8949                 ASSERT(dp->dtdo_intlen != 0);
8950                 sz = dp->dtdo_intlen * sizeof (uint64_t);
8951                 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
8952                 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
8953                 new->dtdo_intlen = dp->dtdo_intlen;
8954         }
8955
8956         if (dp->dtdo_vartab != NULL) {
8957                 ASSERT(dp->dtdo_varlen != 0);
8958                 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
8959                 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
8960                 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
8961                 new->dtdo_varlen = dp->dtdo_varlen;
8962         }
8963
8964         dtrace_difo_init(new, vstate);
8965         return (new);
8966 }
8967
8968 static void
8969 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8970 {
8971         int i;
8972
8973         ASSERT(dp->dtdo_refcnt == 0);
8974
8975         for (i = 0; i < dp->dtdo_varlen; i++) {
8976                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8977                 dtrace_statvar_t *svar, **svarp;
8978                 uint_t id;
8979                 uint8_t scope = v->dtdv_scope;
8980                 int *np;
8981
8982                 switch (scope) {
8983                 case DIFV_SCOPE_THREAD:
8984                         continue;
8985
8986                 case DIFV_SCOPE_LOCAL:
8987                         np = &vstate->dtvs_nlocals;
8988                         svarp = vstate->dtvs_locals;
8989                         break;
8990
8991                 case DIFV_SCOPE_GLOBAL:
8992                         np = &vstate->dtvs_nglobals;
8993                         svarp = vstate->dtvs_globals;
8994                         break;
8995
8996                 default:
8997                         ASSERT(0);
8998                 }
8999
9000                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9001                         continue;
9002
9003                 id -= DIF_VAR_OTHER_UBASE;
9004                 ASSERT(id < *np);
9005
9006                 svar = svarp[id];
9007                 ASSERT(svar != NULL);
9008                 ASSERT(svar->dtsv_refcnt > 0);
9009
9010                 if (--svar->dtsv_refcnt > 0)
9011                         continue;
9012
9013                 if (svar->dtsv_size != 0) {
9014                         ASSERT(svar->dtsv_data != NULL);
9015                         kmem_free((void *)(uintptr_t)svar->dtsv_data,
9016                             svar->dtsv_size);
9017                 }
9018
9019                 kmem_free(svar, sizeof (dtrace_statvar_t));
9020                 svarp[id] = NULL;
9021         }
9022
9023         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9024         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9025         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9026         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9027
9028         kmem_free(dp, sizeof (dtrace_difo_t));
9029 }
9030
9031 static void
9032 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9033 {
9034         int i;
9035
9036         ASSERT(MUTEX_HELD(&dtrace_lock));
9037         ASSERT(dp->dtdo_refcnt != 0);
9038
9039         for (i = 0; i < dp->dtdo_varlen; i++) {
9040                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9041
9042                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9043                         continue;
9044
9045                 ASSERT(dtrace_vtime_references > 0);
9046                 if (--dtrace_vtime_references == 0)
9047                         dtrace_vtime_disable();
9048         }
9049
9050         if (--dp->dtdo_refcnt == 0)
9051                 dtrace_difo_destroy(dp, vstate);
9052 }
9053
9054 /*
9055  * DTrace Format Functions
9056  */
9057 static uint16_t
9058 dtrace_format_add(dtrace_state_t *state, char *str)
9059 {
9060         char *fmt, **new;
9061         uint16_t ndx, len = strlen(str) + 1;
9062
9063         fmt = kmem_zalloc(len, KM_SLEEP);
9064         bcopy(str, fmt, len);
9065
9066         for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9067                 if (state->dts_formats[ndx] == NULL) {
9068                         state->dts_formats[ndx] = fmt;
9069                         return (ndx + 1);
9070                 }
9071         }
9072
9073         if (state->dts_nformats == USHRT_MAX) {
9074                 /*
9075                  * This is only likely if a denial-of-service attack is being
9076                  * attempted.  As such, it's okay to fail silently here.
9077                  */
9078                 kmem_free(fmt, len);
9079                 return (0);
9080         }
9081
9082         /*
9083          * For simplicity, we always resize the formats array to be exactly the
9084          * number of formats.
9085          */
9086         ndx = state->dts_nformats++;
9087         new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9088
9089         if (state->dts_formats != NULL) {
9090                 ASSERT(ndx != 0);
9091                 bcopy(state->dts_formats, new, ndx * sizeof (char *));
9092                 kmem_free(state->dts_formats, ndx * sizeof (char *));
9093         }
9094
9095         state->dts_formats = new;
9096         state->dts_formats[ndx] = fmt;
9097
9098         return (ndx + 1);
9099 }
9100
9101 static void
9102 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9103 {
9104         char *fmt;
9105
9106         ASSERT(state->dts_formats != NULL);
9107         ASSERT(format <= state->dts_nformats);
9108         ASSERT(state->dts_formats[format - 1] != NULL);
9109
9110         fmt = state->dts_formats[format - 1];
9111         kmem_free(fmt, strlen(fmt) + 1);
9112         state->dts_formats[format - 1] = NULL;
9113 }
9114
9115 static void
9116 dtrace_format_destroy(dtrace_state_t *state)
9117 {
9118         int i;
9119
9120         if (state->dts_nformats == 0) {
9121                 ASSERT(state->dts_formats == NULL);
9122                 return;
9123         }
9124
9125         ASSERT(state->dts_formats != NULL);
9126
9127         for (i = 0; i < state->dts_nformats; i++) {
9128                 char *fmt = state->dts_formats[i];
9129
9130                 if (fmt == NULL)
9131                         continue;
9132
9133                 kmem_free(fmt, strlen(fmt) + 1);
9134         }
9135
9136         kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9137         state->dts_nformats = 0;
9138         state->dts_formats = NULL;
9139 }
9140
9141 /*
9142  * DTrace Predicate Functions
9143  */
9144 static dtrace_predicate_t *
9145 dtrace_predicate_create(dtrace_difo_t *dp)
9146 {
9147         dtrace_predicate_t *pred;
9148
9149         ASSERT(MUTEX_HELD(&dtrace_lock));
9150         ASSERT(dp->dtdo_refcnt != 0);
9151
9152         pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9153         pred->dtp_difo = dp;
9154         pred->dtp_refcnt = 1;
9155
9156         if (!dtrace_difo_cacheable(dp))
9157                 return (pred);
9158
9159         if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9160                 /*
9161                  * This is only theoretically possible -- we have had 2^32
9162                  * cacheable predicates on this machine.  We cannot allow any
9163                  * more predicates to become cacheable:  as unlikely as it is,
9164                  * there may be a thread caching a (now stale) predicate cache
9165                  * ID. (N.B.: the temptation is being successfully resisted to
9166                  * have this cmn_err() "Holy shit -- we executed this code!")
9167                  */
9168                 return (pred);
9169         }
9170
9171         pred->dtp_cacheid = dtrace_predcache_id++;
9172
9173         return (pred);
9174 }
9175
9176 static void
9177 dtrace_predicate_hold(dtrace_predicate_t *pred)
9178 {
9179         ASSERT(MUTEX_HELD(&dtrace_lock));
9180         ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9181         ASSERT(pred->dtp_refcnt > 0);
9182
9183         pred->dtp_refcnt++;
9184 }
9185
9186 static void
9187 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9188 {
9189         dtrace_difo_t *dp = pred->dtp_difo;
9190
9191         ASSERT(MUTEX_HELD(&dtrace_lock));
9192         ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9193         ASSERT(pred->dtp_refcnt > 0);
9194
9195         if (--pred->dtp_refcnt == 0) {
9196                 dtrace_difo_release(pred->dtp_difo, vstate);
9197                 kmem_free(pred, sizeof (dtrace_predicate_t));
9198         }
9199 }
9200
9201 /*
9202  * DTrace Action Description Functions
9203  */
9204 static dtrace_actdesc_t *
9205 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9206     uint64_t uarg, uint64_t arg)
9207 {
9208         dtrace_actdesc_t *act;
9209
9210         ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
9211             arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
9212
9213         act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9214         act->dtad_kind = kind;
9215         act->dtad_ntuple = ntuple;
9216         act->dtad_uarg = uarg;
9217         act->dtad_arg = arg;
9218         act->dtad_refcnt = 1;
9219
9220         return (act);
9221 }
9222
9223 static void
9224 dtrace_actdesc_hold(dtrace_actdesc_t *act)
9225 {
9226         ASSERT(act->dtad_refcnt >= 1);
9227         act->dtad_refcnt++;
9228 }
9229
9230 static void
9231 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9232 {
9233         dtrace_actkind_t kind = act->dtad_kind;
9234         dtrace_difo_t *dp;
9235
9236         ASSERT(act->dtad_refcnt >= 1);
9237
9238         if (--act->dtad_refcnt != 0)
9239                 return;
9240
9241         if ((dp = act->dtad_difo) != NULL)
9242                 dtrace_difo_release(dp, vstate);
9243
9244         if (DTRACEACT_ISPRINTFLIKE(kind)) {
9245                 char *str = (char *)(uintptr_t)act->dtad_arg;
9246
9247                 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9248                     (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9249
9250                 if (str != NULL)
9251                         kmem_free(str, strlen(str) + 1);
9252         }
9253
9254         kmem_free(act, sizeof (dtrace_actdesc_t));
9255 }
9256
9257 /*
9258  * DTrace ECB Functions
9259  */
9260 static dtrace_ecb_t *
9261 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9262 {
9263         dtrace_ecb_t *ecb;
9264         dtrace_epid_t epid;
9265
9266         ASSERT(MUTEX_HELD(&dtrace_lock));
9267
9268         ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9269         ecb->dte_predicate = NULL;
9270         ecb->dte_probe = probe;
9271
9272         /*
9273          * The default size is the size of the default action: recording
9274          * the epid.
9275          */
9276         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9277         ecb->dte_alignment = sizeof (dtrace_epid_t);
9278
9279         epid = state->dts_epid++;
9280
9281         if (epid - 1 >= state->dts_necbs) {
9282                 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9283                 int necbs = state->dts_necbs << 1;
9284
9285                 ASSERT(epid == state->dts_necbs + 1);
9286
9287                 if (necbs == 0) {
9288                         ASSERT(oecbs == NULL);
9289                         necbs = 1;
9290                 }
9291
9292                 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9293
9294                 if (oecbs != NULL)
9295                         bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9296
9297                 dtrace_membar_producer();
9298                 state->dts_ecbs = ecbs;
9299
9300                 if (oecbs != NULL) {
9301                         /*
9302                          * If this state is active, we must dtrace_sync()
9303                          * before we can free the old dts_ecbs array:  we're
9304                          * coming in hot, and there may be active ring
9305                          * buffer processing (which indexes into the dts_ecbs
9306                          * array) on another CPU.
9307                          */
9308                         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9309                                 dtrace_sync();
9310
9311                         kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9312                 }
9313
9314                 dtrace_membar_producer();
9315                 state->dts_necbs = necbs;
9316         }
9317
9318         ecb->dte_state = state;
9319
9320         ASSERT(state->dts_ecbs[epid - 1] == NULL);
9321         dtrace_membar_producer();
9322         state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9323
9324         return (ecb);
9325 }
9326
9327 static int
9328 dtrace_ecb_enable(dtrace_ecb_t *ecb)
9329 {
9330         dtrace_probe_t *probe = ecb->dte_probe;
9331
9332         ASSERT(MUTEX_HELD(&cpu_lock));
9333         ASSERT(MUTEX_HELD(&dtrace_lock));
9334         ASSERT(ecb->dte_next == NULL);
9335
9336         if (probe == NULL) {
9337                 /*
9338                  * This is the NULL probe -- there's nothing to do.
9339                  */
9340                 return (0);
9341         }
9342
9343         if (probe->dtpr_ecb == NULL) {
9344                 dtrace_provider_t *prov = probe->dtpr_provider;
9345
9346                 /*
9347                  * We're the first ECB on this probe.
9348                  */
9349                 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9350
9351                 if (ecb->dte_predicate != NULL)
9352                         probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9353
9354                 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9355                     probe->dtpr_id, probe->dtpr_arg));
9356         } else {
9357                 /*
9358                  * This probe is already active.  Swing the last pointer to
9359                  * point to the new ECB, and issue a dtrace_sync() to assure
9360                  * that all CPUs have seen the change.
9361                  */
9362                 ASSERT(probe->dtpr_ecb_last != NULL);
9363                 probe->dtpr_ecb_last->dte_next = ecb;
9364                 probe->dtpr_ecb_last = ecb;
9365                 probe->dtpr_predcache = 0;
9366
9367                 dtrace_sync();
9368                 return (0);
9369         }
9370 }
9371
9372 static void
9373 dtrace_ecb_resize(dtrace_ecb_t *ecb)
9374 {
9375         uint32_t maxalign = sizeof (dtrace_epid_t);
9376         uint32_t align = sizeof (uint8_t), offs, diff;
9377         dtrace_action_t *act;
9378         int wastuple = 0;
9379         uint32_t aggbase = UINT32_MAX;
9380         dtrace_state_t *state = ecb->dte_state;
9381
9382         /*
9383          * If we record anything, we always record the epid.  (And we always
9384          * record it first.)
9385          */
9386         offs = sizeof (dtrace_epid_t);
9387         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9388
9389         for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9390                 dtrace_recdesc_t *rec = &act->dta_rec;
9391
9392                 if ((align = rec->dtrd_alignment) > maxalign)
9393                         maxalign = align;
9394
9395                 if (!wastuple && act->dta_intuple) {
9396                         /*
9397                          * This is the first record in a tuple.  Align the
9398                          * offset to be at offset 4 in an 8-byte aligned
9399                          * block.
9400                          */
9401                         diff = offs + sizeof (dtrace_aggid_t);
9402
9403                         if (diff = (diff & (sizeof (uint64_t) - 1)))
9404                                 offs += sizeof (uint64_t) - diff;
9405
9406                         aggbase = offs - sizeof (dtrace_aggid_t);
9407                         ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
9408                 }
9409
9410                 /*LINTED*/
9411                 if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
9412                         /*
9413                          * The current offset is not properly aligned; align it.
9414                          */
9415                         offs += align - diff;
9416                 }
9417
9418                 rec->dtrd_offset = offs;
9419
9420                 if (offs + rec->dtrd_size > ecb->dte_needed) {
9421                         ecb->dte_needed = offs + rec->dtrd_size;
9422
9423                         if (ecb->dte_needed > state->dts_needed)
9424                                 state->dts_needed = ecb->dte_needed;
9425                 }
9426
9427                 if (DTRACEACT_ISAGG(act->dta_kind)) {
9428                         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9429                         dtrace_action_t *first = agg->dtag_first, *prev;
9430
9431                         ASSERT(rec->dtrd_size != 0 && first != NULL);
9432                         ASSERT(wastuple);
9433                         ASSERT(aggbase != UINT32_MAX);
9434
9435                         agg->dtag_base = aggbase;
9436
9437                         while ((prev = first->dta_prev) != NULL &&
9438                             DTRACEACT_ISAGG(prev->dta_kind)) {
9439                                 agg = (dtrace_aggregation_t *)prev;
9440                                 first = agg->dtag_first;
9441                         }
9442
9443                         if (prev != NULL) {
9444                                 offs = prev->dta_rec.dtrd_offset +
9445                                     prev->dta_rec.dtrd_size;
9446                         } else {
9447                                 offs = sizeof (dtrace_epid_t);
9448                         }
9449                         wastuple = 0;
9450                 } else {
9451                         if (!act->dta_intuple)
9452                                 ecb->dte_size = offs + rec->dtrd_size;
9453
9454                         offs += rec->dtrd_size;
9455                 }
9456
9457                 wastuple = act->dta_intuple;
9458         }
9459
9460         if ((act = ecb->dte_action) != NULL &&
9461             !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9462             ecb->dte_size == sizeof (dtrace_epid_t)) {
9463                 /*
9464                  * If the size is still sizeof (dtrace_epid_t), then all
9465                  * actions store no data; set the size to 0.
9466                  */
9467                 ecb->dte_alignment = maxalign;
9468                 ecb->dte_size = 0;
9469
9470                 /*
9471                  * If the needed space is still sizeof (dtrace_epid_t), then
9472                  * all actions need no additional space; set the needed
9473                  * size to 0.
9474                  */
9475                 if (ecb->dte_needed == sizeof (dtrace_epid_t))
9476                         ecb->dte_needed = 0;
9477
9478                 return;
9479         }
9480
9481         /*
9482          * Set our alignment, and make sure that the dte_size and dte_needed
9483          * are aligned to the size of an EPID.
9484          */
9485         ecb->dte_alignment = maxalign;
9486         ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
9487             ~(sizeof (dtrace_epid_t) - 1);
9488         ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
9489             ~(sizeof (dtrace_epid_t) - 1);
9490         ASSERT(ecb->dte_size <= ecb->dte_needed);
9491 }
9492
9493 static dtrace_action_t *
9494 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9495 {
9496         dtrace_aggregation_t *agg;
9497         size_t size = sizeof (uint64_t);
9498         int ntuple = desc->dtad_ntuple;
9499         dtrace_action_t *act;
9500         dtrace_recdesc_t *frec;
9501         dtrace_aggid_t aggid;
9502         dtrace_state_t *state = ecb->dte_state;
9503
9504         agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9505         agg->dtag_ecb = ecb;
9506
9507         ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
9508
9509         switch (desc->dtad_kind) {
9510         case DTRACEAGG_MIN:
9511                 agg->dtag_initial = INT64_MAX;
9512                 agg->dtag_aggregate = dtrace_aggregate_min;
9513                 break;
9514
9515         case DTRACEAGG_MAX:
9516                 agg->dtag_initial = INT64_MIN;
9517                 agg->dtag_aggregate = dtrace_aggregate_max;
9518                 break;
9519
9520         case DTRACEAGG_COUNT:
9521                 agg->dtag_aggregate = dtrace_aggregate_count;
9522                 break;
9523
9524         case DTRACEAGG_QUANTIZE:
9525                 agg->dtag_aggregate = dtrace_aggregate_quantize;
9526                 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
9527                     sizeof (uint64_t);
9528                 break;
9529
9530         case DTRACEAGG_LQUANTIZE: {
9531                 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
9532                 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
9533
9534                 agg->dtag_initial = desc->dtad_arg;
9535                 agg->dtag_aggregate = dtrace_aggregate_lquantize;
9536
9537                 if (step == 0 || levels == 0)
9538                         goto err;
9539
9540                 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
9541                 break;
9542         }
9543
9544         case DTRACEAGG_LLQUANTIZE: {
9545                 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
9546                 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
9547                 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
9548                 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
9549                 int64_t v;
9550
9551                 agg->dtag_initial = desc->dtad_arg;
9552                 agg->dtag_aggregate = dtrace_aggregate_llquantize;
9553
9554                 if (factor < 2 || low >= high || nsteps < factor)
9555                         goto err;
9556
9557                 /*
9558                  * Now check that the number of steps evenly divides a power
9559                  * of the factor.  (This assures both integer bucket size and
9560                  * linearity within each magnitude.)
9561                  */
9562                 for (v = factor; v < nsteps; v *= factor)
9563                         continue;
9564
9565                 if ((v % nsteps) || (nsteps % factor))
9566                         goto err;
9567
9568                 size = (dtrace_aggregate_llquantize_bucket(factor,
9569                     low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
9570                 break;
9571         }
9572
9573         case DTRACEAGG_AVG:
9574                 agg->dtag_aggregate = dtrace_aggregate_avg;
9575                 size = sizeof (uint64_t) * 2;
9576                 break;
9577
9578         case DTRACEAGG_STDDEV:
9579                 agg->dtag_aggregate = dtrace_aggregate_stddev;
9580                 size = sizeof (uint64_t) * 4;
9581                 break;
9582
9583         case DTRACEAGG_SUM:
9584                 agg->dtag_aggregate = dtrace_aggregate_sum;
9585                 break;
9586
9587         default:
9588                 goto err;
9589         }
9590
9591         agg->dtag_action.dta_rec.dtrd_size = size;
9592
9593         if (ntuple == 0)
9594                 goto err;
9595
9596         /*
9597          * We must make sure that we have enough actions for the n-tuple.
9598          */
9599         for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
9600                 if (DTRACEACT_ISAGG(act->dta_kind))
9601                         break;
9602
9603                 if (--ntuple == 0) {
9604                         /*
9605                          * This is the action with which our n-tuple begins.
9606                          */
9607                         agg->dtag_first = act;
9608                         goto success;
9609                 }
9610         }
9611
9612         /*
9613          * This n-tuple is short by ntuple elements.  Return failure.
9614          */
9615         ASSERT(ntuple != 0);
9616 err:
9617         kmem_free(agg, sizeof (dtrace_aggregation_t));
9618         return (NULL);
9619
9620 success:
9621         /*
9622          * If the last action in the tuple has a size of zero, it's actually
9623          * an expression argument for the aggregating action.
9624          */
9625         ASSERT(ecb->dte_action_last != NULL);
9626         act = ecb->dte_action_last;
9627
9628         if (act->dta_kind == DTRACEACT_DIFEXPR) {
9629                 ASSERT(act->dta_difo != NULL);
9630
9631                 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
9632                         agg->dtag_hasarg = 1;
9633         }
9634
9635         /*
9636          * We need to allocate an id for this aggregation.
9637          */
9638         aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
9639             VM_BESTFIT | VM_SLEEP);
9640
9641         if (aggid - 1 >= state->dts_naggregations) {
9642                 dtrace_aggregation_t **oaggs = state->dts_aggregations;
9643                 dtrace_aggregation_t **aggs;
9644                 int naggs = state->dts_naggregations << 1;
9645                 int onaggs = state->dts_naggregations;
9646
9647                 ASSERT(aggid == state->dts_naggregations + 1);
9648
9649                 if (naggs == 0) {
9650                         ASSERT(oaggs == NULL);
9651                         naggs = 1;
9652                 }
9653
9654                 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
9655
9656                 if (oaggs != NULL) {
9657                         bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
9658                         kmem_free(oaggs, onaggs * sizeof (*aggs));
9659                 }
9660
9661                 state->dts_aggregations = aggs;
9662                 state->dts_naggregations = naggs;
9663         }
9664
9665         ASSERT(state->dts_aggregations[aggid - 1] == NULL);
9666         state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
9667
9668         frec = &agg->dtag_first->dta_rec;
9669         if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
9670                 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
9671
9672         for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
9673                 ASSERT(!act->dta_intuple);
9674                 act->dta_intuple = 1;
9675         }
9676
9677         return (&agg->dtag_action);
9678 }
9679
9680 static void
9681 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
9682 {
9683         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9684         dtrace_state_t *state = ecb->dte_state;
9685         dtrace_aggid_t aggid = agg->dtag_id;
9686
9687         ASSERT(DTRACEACT_ISAGG(act->dta_kind));
9688         vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
9689
9690         ASSERT(state->dts_aggregations[aggid - 1] == agg);
9691         state->dts_aggregations[aggid - 1] = NULL;
9692
9693         kmem_free(agg, sizeof (dtrace_aggregation_t));
9694 }
9695
9696 static int
9697 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9698 {
9699         dtrace_action_t *action, *last;
9700         dtrace_difo_t *dp = desc->dtad_difo;
9701         uint32_t size = 0, align = sizeof (uint8_t), mask;
9702         uint16_t format = 0;
9703         dtrace_recdesc_t *rec;
9704         dtrace_state_t *state = ecb->dte_state;
9705         dtrace_optval_t *opt = state->dts_options, nframes, strsize;
9706         uint64_t arg = desc->dtad_arg;
9707
9708         ASSERT(MUTEX_HELD(&dtrace_lock));
9709         ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
9710
9711         if (DTRACEACT_ISAGG(desc->dtad_kind)) {
9712                 /*
9713                  * If this is an aggregating action, there must be neither
9714                  * a speculate nor a commit on the action chain.
9715                  */
9716                 dtrace_action_t *act;
9717
9718                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9719                         if (act->dta_kind == DTRACEACT_COMMIT)
9720                                 return (EINVAL);
9721
9722                         if (act->dta_kind == DTRACEACT_SPECULATE)
9723                                 return (EINVAL);
9724                 }
9725
9726                 action = dtrace_ecb_aggregation_create(ecb, desc);
9727
9728                 if (action == NULL)
9729                         return (EINVAL);
9730         } else {
9731                 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
9732                     (desc->dtad_kind == DTRACEACT_DIFEXPR &&
9733                     dp != NULL && dp->dtdo_destructive)) {
9734                         state->dts_destructive = 1;
9735                 }
9736
9737                 switch (desc->dtad_kind) {
9738                 case DTRACEACT_PRINTF:
9739                 case DTRACEACT_PRINTA:
9740                 case DTRACEACT_SYSTEM:
9741                 case DTRACEACT_FREOPEN:
9742                         /*
9743                          * We know that our arg is a string -- turn it into a
9744                          * format.
9745                          */
9746                         if (arg == NULL) {
9747                                 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
9748                                 format = 0;
9749                         } else {
9750                                 ASSERT(arg != NULL);
9751                                 ASSERT(arg > KERNELBASE);
9752                                 format = dtrace_format_add(state,
9753                                     (char *)(uintptr_t)arg);
9754                         }
9755
9756                         /*FALLTHROUGH*/
9757                 case DTRACEACT_LIBACT:
9758                 case DTRACEACT_DIFEXPR:
9759                         if (dp == NULL)
9760                                 return (EINVAL);
9761
9762                         if ((size = dp->dtdo_rtype.dtdt_size) != 0)
9763                                 break;
9764
9765                         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
9766                                 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9767                                         return (EINVAL);
9768
9769                                 size = opt[DTRACEOPT_STRSIZE];
9770                         }
9771
9772                         break;
9773
9774                 case DTRACEACT_STACK:
9775                         if ((nframes = arg) == 0) {
9776                                 nframes = opt[DTRACEOPT_STACKFRAMES];
9777                                 ASSERT(nframes > 0);
9778                                 arg = nframes;
9779                         }
9780
9781                         size = nframes * sizeof (pc_t);
9782                         break;
9783
9784                 case DTRACEACT_JSTACK:
9785                         if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
9786                                 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
9787
9788                         if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
9789                                 nframes = opt[DTRACEOPT_JSTACKFRAMES];
9790
9791                         arg = DTRACE_USTACK_ARG(nframes, strsize);
9792
9793                         /*FALLTHROUGH*/
9794                 case DTRACEACT_USTACK:
9795                         if (desc->dtad_kind != DTRACEACT_JSTACK &&
9796                             (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
9797                                 strsize = DTRACE_USTACK_STRSIZE(arg);
9798                                 nframes = opt[DTRACEOPT_USTACKFRAMES];
9799                                 ASSERT(nframes > 0);
9800                                 arg = DTRACE_USTACK_ARG(nframes, strsize);
9801                         }
9802
9803                         /*
9804                          * Save a slot for the pid.
9805                          */
9806                         size = (nframes + 1) * sizeof (uint64_t);
9807                         size += DTRACE_USTACK_STRSIZE(arg);
9808                         size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
9809
9810                         break;
9811
9812                 case DTRACEACT_SYM:
9813                 case DTRACEACT_MOD:
9814                         if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
9815                             sizeof (uint64_t)) ||
9816                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9817                                 return (EINVAL);
9818                         break;
9819
9820                 case DTRACEACT_USYM:
9821                 case DTRACEACT_UMOD:
9822                 case DTRACEACT_UADDR:
9823                         if (dp == NULL ||
9824                             (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
9825                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9826                                 return (EINVAL);
9827
9828                         /*
9829                          * We have a slot for the pid, plus a slot for the
9830                          * argument.  To keep things simple (aligned with
9831                          * bitness-neutral sizing), we store each as a 64-bit
9832                          * quantity.
9833                          */
9834                         size = 2 * sizeof (uint64_t);
9835                         break;
9836
9837                 case DTRACEACT_STOP:
9838                 case DTRACEACT_BREAKPOINT:
9839                 case DTRACEACT_PANIC:
9840                         break;
9841
9842                 case DTRACEACT_CHILL:
9843                 case DTRACEACT_DISCARD:
9844                 case DTRACEACT_RAISE:
9845                         if (dp == NULL)
9846                                 return (EINVAL);
9847                         break;
9848
9849                 case DTRACEACT_EXIT:
9850                         if (dp == NULL ||
9851                             (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
9852                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
9853                                 return (EINVAL);
9854                         break;
9855
9856                 case DTRACEACT_SPECULATE:
9857                         if (ecb->dte_size > sizeof (dtrace_epid_t))
9858                                 return (EINVAL);
9859
9860                         if (dp == NULL)
9861                                 return (EINVAL);
9862
9863                         state->dts_speculates = 1;
9864                         break;
9865
9866                 case DTRACEACT_COMMIT: {
9867                         dtrace_action_t *act = ecb->dte_action;
9868
9869                         for (; act != NULL; act = act->dta_next) {
9870                                 if (act->dta_kind == DTRACEACT_COMMIT)
9871                                         return (EINVAL);
9872                         }
9873
9874                         if (dp == NULL)
9875                                 return (EINVAL);
9876                         break;
9877                 }
9878
9879                 default:
9880                         return (EINVAL);
9881                 }
9882
9883                 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
9884                         /*
9885                          * If this is a data-storing action or a speculate,
9886                          * we must be sure that there isn't a commit on the
9887                          * action chain.
9888                          */
9889                         dtrace_action_t *act = ecb->dte_action;
9890
9891                         for (; act != NULL; act = act->dta_next) {
9892                                 if (act->dta_kind == DTRACEACT_COMMIT)
9893                                         return (EINVAL);
9894                         }
9895                 }
9896
9897                 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
9898                 action->dta_rec.dtrd_size = size;
9899         }
9900
9901         action->dta_refcnt = 1;
9902         rec = &action->dta_rec;
9903         size = rec->dtrd_size;
9904
9905         for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
9906                 if (!(size & mask)) {
9907                         align = mask + 1;
9908                         break;
9909                 }
9910         }
9911
9912         action->dta_kind = desc->dtad_kind;
9913
9914         if ((action->dta_difo = dp) != NULL)
9915                 dtrace_difo_hold(dp);
9916
9917         rec->dtrd_action = action->dta_kind;
9918         rec->dtrd_arg = arg;
9919         rec->dtrd_uarg = desc->dtad_uarg;
9920         rec->dtrd_alignment = (uint16_t)align;
9921         rec->dtrd_format = format;
9922
9923         if ((last = ecb->dte_action_last) != NULL) {
9924                 ASSERT(ecb->dte_action != NULL);
9925                 action->dta_prev = last;
9926                 last->dta_next = action;
9927         } else {
9928                 ASSERT(ecb->dte_action == NULL);
9929                 ecb->dte_action = action;
9930         }
9931
9932         ecb->dte_action_last = action;
9933
9934         return (0);
9935 }
9936
9937 static void
9938 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
9939 {
9940         dtrace_action_t *act = ecb->dte_action, *next;
9941         dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
9942         dtrace_difo_t *dp;
9943         uint16_t format;
9944
9945         if (act != NULL && act->dta_refcnt > 1) {
9946                 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
9947                 act->dta_refcnt--;
9948         } else {
9949                 for (; act != NULL; act = next) {
9950                         next = act->dta_next;
9951                         ASSERT(next != NULL || act == ecb->dte_action_last);
9952                         ASSERT(act->dta_refcnt == 1);
9953
9954                         if ((format = act->dta_rec.dtrd_format) != 0)
9955                                 dtrace_format_remove(ecb->dte_state, format);
9956
9957                         if ((dp = act->dta_difo) != NULL)
9958                                 dtrace_difo_release(dp, vstate);
9959
9960                         if (DTRACEACT_ISAGG(act->dta_kind)) {
9961                                 dtrace_ecb_aggregation_destroy(ecb, act);
9962                         } else {
9963                                 kmem_free(act, sizeof (dtrace_action_t));
9964                         }
9965                 }
9966         }
9967
9968         ecb->dte_action = NULL;
9969         ecb->dte_action_last = NULL;
9970         ecb->dte_size = sizeof (dtrace_epid_t);
9971 }
9972
9973 static void
9974 dtrace_ecb_disable(dtrace_ecb_t *ecb)
9975 {
9976         /*
9977          * We disable the ECB by removing it from its probe.
9978          */
9979         dtrace_ecb_t *pecb, *prev = NULL;
9980         dtrace_probe_t *probe = ecb->dte_probe;
9981
9982         ASSERT(MUTEX_HELD(&dtrace_lock));
9983
9984         if (probe == NULL) {
9985                 /*
9986                  * This is the NULL probe; there is nothing to disable.
9987                  */
9988                 return;
9989         }
9990
9991         for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
9992                 if (pecb == ecb)
9993                         break;
9994                 prev = pecb;
9995         }
9996
9997         ASSERT(pecb != NULL);
9998
9999         if (prev == NULL) {
10000                 probe->dtpr_ecb = ecb->dte_next;
10001         } else {
10002                 prev->dte_next = ecb->dte_next;
10003         }
10004
10005         if (ecb == probe->dtpr_ecb_last) {
10006                 ASSERT(ecb->dte_next == NULL);
10007                 probe->dtpr_ecb_last = prev;
10008         }
10009
10010         /*
10011          * The ECB has been disconnected from the probe; now sync to assure
10012          * that all CPUs have seen the change before returning.
10013          */
10014         dtrace_sync();
10015
10016         if (probe->dtpr_ecb == NULL) {
10017                 /*
10018                  * That was the last ECB on the probe; clear the predicate
10019                  * cache ID for the probe, disable it and sync one more time
10020                  * to assure that we'll never hit it again.
10021                  */
10022                 dtrace_provider_t *prov = probe->dtpr_provider;
10023
10024                 ASSERT(ecb->dte_next == NULL);
10025                 ASSERT(probe->dtpr_ecb_last == NULL);
10026                 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10027                 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10028                     probe->dtpr_id, probe->dtpr_arg);
10029                 dtrace_sync();
10030         } else {
10031                 /*
10032                  * There is at least one ECB remaining on the probe.  If there
10033                  * is _exactly_ one, set the probe's predicate cache ID to be
10034                  * the predicate cache ID of the remaining ECB.
10035                  */
10036                 ASSERT(probe->dtpr_ecb_last != NULL);
10037                 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10038
10039                 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10040                         dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10041
10042                         ASSERT(probe->dtpr_ecb->dte_next == NULL);
10043
10044                         if (p != NULL)
10045                                 probe->dtpr_predcache = p->dtp_cacheid;
10046                 }
10047
10048                 ecb->dte_next = NULL;
10049         }
10050 }
10051
10052 static void
10053 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10054 {
10055         dtrace_state_t *state = ecb->dte_state;
10056         dtrace_vstate_t *vstate = &state->dts_vstate;
10057         dtrace_predicate_t *pred;
10058         dtrace_epid_t epid = ecb->dte_epid;
10059
10060         ASSERT(MUTEX_HELD(&dtrace_lock));
10061         ASSERT(ecb->dte_next == NULL);
10062         ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10063
10064         if ((pred = ecb->dte_predicate) != NULL)
10065                 dtrace_predicate_release(pred, vstate);
10066
10067         dtrace_ecb_action_remove(ecb);
10068
10069         ASSERT(state->dts_ecbs[epid - 1] == ecb);
10070         state->dts_ecbs[epid - 1] = NULL;
10071
10072         kmem_free(ecb, sizeof (dtrace_ecb_t));
10073 }
10074
10075 static dtrace_ecb_t *
10076 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10077     dtrace_enabling_t *enab)
10078 {
10079         dtrace_ecb_t *ecb;
10080         dtrace_predicate_t *pred;
10081         dtrace_actdesc_t *act;
10082         dtrace_provider_t *prov;
10083         dtrace_ecbdesc_t *desc = enab->dten_current;
10084
10085         ASSERT(MUTEX_HELD(&dtrace_lock));
10086         ASSERT(state != NULL);
10087
10088         ecb = dtrace_ecb_add(state, probe);
10089         ecb->dte_uarg = desc->dted_uarg;
10090
10091         if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10092                 dtrace_predicate_hold(pred);
10093                 ecb->dte_predicate = pred;
10094         }
10095
10096         if (probe != NULL) {
10097                 /*
10098                  * If the provider shows more leg than the consumer is old
10099                  * enough to see, we need to enable the appropriate implicit
10100                  * predicate bits to prevent the ecb from activating at
10101                  * revealing times.
10102                  *
10103                  * Providers specifying DTRACE_PRIV_USER at register time
10104                  * are stating that they need the /proc-style privilege
10105                  * model to be enforced, and this is what DTRACE_COND_OWNER
10106                  * and DTRACE_COND_ZONEOWNER will then do at probe time.
10107                  */
10108                 prov = probe->dtpr_provider;
10109                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10110                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10111                         ecb->dte_cond |= DTRACE_COND_OWNER;
10112
10113                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10114                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10115                         ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10116
10117                 /*
10118                  * If the provider shows us kernel innards and the user
10119                  * is lacking sufficient privilege, enable the
10120                  * DTRACE_COND_USERMODE implicit predicate.
10121                  */
10122                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10123                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10124                         ecb->dte_cond |= DTRACE_COND_USERMODE;
10125         }
10126
10127         if (dtrace_ecb_create_cache != NULL) {
10128                 /*
10129                  * If we have a cached ecb, we'll use its action list instead
10130                  * of creating our own (saving both time and space).
10131                  */
10132                 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10133                 dtrace_action_t *act = cached->dte_action;
10134
10135                 if (act != NULL) {
10136                         ASSERT(act->dta_refcnt > 0);
10137                         act->dta_refcnt++;
10138                         ecb->dte_action = act;
10139                         ecb->dte_action_last = cached->dte_action_last;
10140                         ecb->dte_needed = cached->dte_needed;
10141                         ecb->dte_size = cached->dte_size;
10142                         ecb->dte_alignment = cached->dte_alignment;
10143                 }
10144
10145                 return (ecb);
10146         }
10147
10148         for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10149                 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10150                         dtrace_ecb_destroy(ecb);
10151                         return (NULL);
10152                 }
10153         }
10154
10155         dtrace_ecb_resize(ecb);
10156
10157         return (dtrace_ecb_create_cache = ecb);
10158 }
10159
10160 static int
10161 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10162 {
10163         dtrace_ecb_t *ecb;
10164         dtrace_enabling_t *enab = arg;
10165         dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10166
10167         ASSERT(state != NULL);
10168
10169         if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10170                 /*
10171                  * This probe was created in a generation for which this
10172                  * enabling has previously created ECBs; we don't want to
10173                  * enable it again, so just kick out.
10174                  */
10175                 return (DTRACE_MATCH_NEXT);
10176         }
10177
10178         if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10179                 return (DTRACE_MATCH_DONE);
10180
10181         if (dtrace_ecb_enable(ecb) < 0)
10182                 return (DTRACE_MATCH_FAIL);
10183
10184         return (DTRACE_MATCH_NEXT);
10185 }
10186
10187 static dtrace_ecb_t *
10188 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10189 {
10190         dtrace_ecb_t *ecb;
10191
10192         ASSERT(MUTEX_HELD(&dtrace_lock));
10193
10194         if (id == 0 || id > state->dts_necbs)
10195                 return (NULL);
10196
10197         ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10198         ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10199
10200         return (state->dts_ecbs[id - 1]);
10201 }
10202
10203 static dtrace_aggregation_t *
10204 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10205 {
10206         dtrace_aggregation_t *agg;
10207
10208         ASSERT(MUTEX_HELD(&dtrace_lock));
10209
10210         if (id == 0 || id > state->dts_naggregations)
10211                 return (NULL);
10212
10213         ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10214         ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10215             agg->dtag_id == id);
10216
10217         return (state->dts_aggregations[id - 1]);
10218 }
10219
10220 /*
10221  * DTrace Buffer Functions
10222  *
10223  * The following functions manipulate DTrace buffers.  Most of these functions
10224  * are called in the context of establishing or processing consumer state;
10225  * exceptions are explicitly noted.
10226  */
10227
10228 /*
10229  * Note:  called from cross call context.  This function switches the two
10230  * buffers on a given CPU.  The atomicity of this operation is assured by
10231  * disabling interrupts while the actual switch takes place; the disabling of
10232  * interrupts serializes the execution with any execution of dtrace_probe() on
10233  * the same CPU.
10234  */
10235 static void
10236 dtrace_buffer_switch(dtrace_buffer_t *buf)
10237 {
10238         caddr_t tomax = buf->dtb_tomax;
10239         caddr_t xamot = buf->dtb_xamot;
10240         dtrace_icookie_t cookie;
10241         hrtime_t now = dtrace_gethrtime();
10242
10243         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10244         ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10245
10246         cookie = dtrace_interrupt_disable();
10247         buf->dtb_tomax = xamot;
10248         buf->dtb_xamot = tomax;
10249         buf->dtb_xamot_drops = buf->dtb_drops;
10250         buf->dtb_xamot_offset = buf->dtb_offset;
10251         buf->dtb_xamot_errors = buf->dtb_errors;
10252         buf->dtb_xamot_flags = buf->dtb_flags;
10253         buf->dtb_offset = 0;
10254         buf->dtb_drops = 0;
10255         buf->dtb_errors = 0;
10256         buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10257         buf->dtb_interval = now - buf->dtb_switched;
10258         buf->dtb_switched = now;
10259         dtrace_interrupt_enable(cookie);
10260 }
10261
10262 /*
10263  * Note:  called from cross call context.  This function activates a buffer
10264  * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
10265  * is guaranteed by the disabling of interrupts.
10266  */
10267 static void
10268 dtrace_buffer_activate(dtrace_state_t *state)
10269 {
10270         dtrace_buffer_t *buf;
10271         dtrace_icookie_t cookie = dtrace_interrupt_disable();
10272
10273         buf = &state->dts_buffer[CPU->cpu_id];
10274
10275         if (buf->dtb_tomax != NULL) {
10276                 /*
10277                  * We might like to assert that the buffer is marked inactive,
10278                  * but this isn't necessarily true:  the buffer for the CPU
10279                  * that processes the BEGIN probe has its buffer activated
10280                  * manually.  In this case, we take the (harmless) action
10281                  * re-clearing the bit INACTIVE bit.
10282                  */
10283                 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10284         }
10285
10286         dtrace_interrupt_enable(cookie);
10287 }
10288
10289 static int
10290 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10291     processorid_t cpu, int *factor)
10292 {
10293         cpu_t *cp;
10294         dtrace_buffer_t *buf;
10295         int allocated = 0, desired = 0;
10296
10297         ASSERT(MUTEX_HELD(&cpu_lock));
10298         ASSERT(MUTEX_HELD(&dtrace_lock));
10299
10300         *factor = 1;
10301
10302         if (size > dtrace_nonroot_maxsize &&
10303             !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
10304                 return (EFBIG);
10305
10306         cp = cpu_list;
10307
10308         do {
10309                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10310                         continue;
10311
10312                 buf = &bufs[cp->cpu_id];
10313
10314                 /*
10315                  * If there is already a buffer allocated for this CPU, it
10316                  * is only possible that this is a DR event.  In this case,
10317                  * the buffer size must match our specified size.
10318                  */
10319                 if (buf->dtb_tomax != NULL) {
10320                         ASSERT(buf->dtb_size == size);
10321                         continue;
10322                 }
10323
10324                 ASSERT(buf->dtb_xamot == NULL);
10325
10326                 if ((buf->dtb_tomax = kmem_zalloc(size,
10327                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10328                         goto err;
10329
10330                 buf->dtb_size = size;
10331                 buf->dtb_flags = flags;
10332                 buf->dtb_offset = 0;
10333                 buf->dtb_drops = 0;
10334
10335                 if (flags & DTRACEBUF_NOSWITCH)
10336                         continue;
10337
10338                 if ((buf->dtb_xamot = kmem_zalloc(size,
10339                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
10340                         goto err;
10341         } while ((cp = cp->cpu_next) != cpu_list);
10342
10343         return (0);
10344
10345 err:
10346         cp = cpu_list;
10347
10348         do {
10349                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10350                         continue;
10351
10352                 buf = &bufs[cp->cpu_id];
10353                 desired += 2;
10354
10355                 if (buf->dtb_xamot != NULL) {
10356                         ASSERT(buf->dtb_tomax != NULL);
10357                         ASSERT(buf->dtb_size == size);
10358                         kmem_free(buf->dtb_xamot, size);
10359                         allocated++;
10360                 }
10361
10362                 if (buf->dtb_tomax != NULL) {
10363                         ASSERT(buf->dtb_size == size);
10364                         kmem_free(buf->dtb_tomax, size);
10365                         allocated++;
10366                 }
10367
10368                 buf->dtb_tomax = NULL;
10369                 buf->dtb_xamot = NULL;
10370                 buf->dtb_size = 0;
10371         } while ((cp = cp->cpu_next) != cpu_list);
10372
10373         *factor = desired / (allocated > 0 ? allocated : 1);
10374
10375         return (ENOMEM);
10376 }
10377
10378 /*
10379  * Note:  called from probe context.  This function just increments the drop
10380  * count on a buffer.  It has been made a function to allow for the
10381  * possibility of understanding the source of mysterious drop counts.  (A
10382  * problem for which one may be particularly disappointed that DTrace cannot
10383  * be used to understand DTrace.)
10384  */
10385 static void
10386 dtrace_buffer_drop(dtrace_buffer_t *buf)
10387 {
10388         buf->dtb_drops++;
10389 }
10390
10391 /*
10392  * Note:  called from probe context.  This function is called to reserve space
10393  * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
10394  * mstate.  Returns the new offset in the buffer, or a negative value if an
10395  * error has occurred.
       *
       * buf is the buffer to reserve in, needed is the number of bytes wanted
       * and align is the required alignment of the returned offset (it is used
       * as a mask via align - 1, so it is presumably a power of two).  state is
       * consulted for dts_activity and dts_reserve (fill buffers) and for the
       * ECB table (ring buffers).
       *
       * This function pads the buffer and, for ring buffers, moves the wrapped
       * offset; it does not itself advance dtb_offset (the only store to it is
       * the reset to 0 in the ring-buffer edge case below).
10396  */
10397 static intptr_t
10398 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10399     dtrace_state_t *state, dtrace_mstate_t *mstate)
10400 {
10401         intptr_t offs = buf->dtb_offset, soffs;
10402         intptr_t woffs;
10403         caddr_t tomax;
10404         size_t total;
10405
              /*
               * An inactive buffer takes no reservations; note that this is
               * not counted as a drop.
               */
10406         if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10407                 return (-1);
10408
              /*
               * With no active buffer memory there is nowhere to store:  count
               * a drop and fail.
               */
10409         if ((tomax = buf->dtb_tomax) == NULL) {
10410                 dtrace_buffer_drop(buf);
10411                 return (-1);
10412         }
10413
              /*
               * Neither a ring nor a fill buffer:  pad up to the requested
               * alignment with DTRACE_EPIDNONE words, then check that the
               * reservation fits before the end of the buffer.
               */
10414         if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10415                 while (offs & (align - 1)) {
10416                         /*
10417                          * Assert that our alignment is off by a number which
10418                          * is itself sizeof (uint32_t) aligned.
10419                          */
10420                         ASSERT(!((align - (offs & (align - 1))) &
10421                             (sizeof (uint32_t) - 1)));
10422                         DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10423                         offs += sizeof (uint32_t);
10424                 }
10425
10426                 if ((soffs = offs + needed) > buf->dtb_size) {
10427                         dtrace_buffer_drop(buf);
10428                         return (-1);
10429                 }
10430
10431                 if (mstate == NULL)
10432                         return (offs);
10433
                      /*
                       * Here the scratch space is whatever remains of this same
                       * buffer beyond the end of the reservation.
                       */
10434                 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10435                 mstate->dtms_scratch_size = buf->dtb_size - soffs;
10436                 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10437
10438                 return (offs);
10439         }
10440
              /*
               * A fill buffer that has been marked full rejects reservations
               * unless the state is in cooldown; otherwise it skips the ring
               * buffer logic entirely and goes straight to padding.
               */
10441         if (buf->dtb_flags & DTRACEBUF_FILL) {
10442                 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10443                     (buf->dtb_flags & DTRACEBUF_FULL))
10444                         return (-1);
10445                 goto out;
10446         }
10447
              /*
               * Ring buffer from here on.  The total is the space needed plus
               * the amount by which the current offset is misaligned.
               */
10448         total = needed + (offs & (align - 1));
10449
10450         /*
10451          * For a ring buffer, life is quite a bit more complicated.  Before
10452          * we can store any padding, we need to adjust our wrapping offset.
10453          * (If we've never before wrapped or we're not about to, no adjustment
10454          * is required.)
10455          */
10456         if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10457             offs + total > buf->dtb_size) {
10458                 woffs = buf->dtb_xamot_offset;
10459
10460                 if (offs + total > buf->dtb_size) {
10461                         /*
10462                          * We can't fit in the end of the buffer.  First, a
10463                          * sanity check that we can fit in the buffer at all.
10464                          */
10465                         if (total > buf->dtb_size) {
10466                                 dtrace_buffer_drop(buf);
10467                                 return (-1);
10468                         }
10469
10470                         /*
10471                          * We're going to be storing at the top of the buffer,
10472                          * so now we need to deal with the wrapped offset.  We
10473                          * only reset our wrapped offset to 0 if it is
10474                          * currently greater than the current offset.  If it
10475                          * is less than the current offset, it is because a
10476                          * previous allocation induced a wrap -- but the
10477                          * allocation didn't subsequently take the space due
10478                          * to an error or false predicate evaluation.  In this
10479                          * case, we'll just leave the wrapped offset alone: if
10480                          * the wrapped offset hasn't been advanced far enough
10481                          * for this allocation, it will be adjusted in the
10482                          * lower loop.
10483                          */
10484                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10485                                 if (woffs >= offs)
10486                                         woffs = 0;
10487                         } else {
10488                                 woffs = 0;
10489                         }
10490
10491                         /*
10492                          * Now we know that we're going to be storing to the
10493                          * top of the buffer and that there is room for us
10494                          * there.  We need to clear the buffer from the current
10495                          * offset to the end (there may be old gunk there).
10496                          */
10497                         while (offs < buf->dtb_size)
10498                                 tomax[offs++] = 0;
10499
10500                         /*
10501                          * We need to set our offset to zero.  And because we
10502                          * are wrapping, we need to set the bit indicating as
10503                          * much.  We can also adjust our needed space back
10504                          * down to the space required by the ECB -- we know
10505                          * that the top of the buffer is aligned.
10506                          */
10507                         offs = 0;
10508                         total = needed;
10509                         buf->dtb_flags |= DTRACEBUF_WRAPPED;
10510                 } else {
10511                         /*
10512                          * There is room for us in the buffer, so we simply
10513                          * need to check the wrapped offset.
10514                          */
10515                         if (woffs < offs) {
10516                                 /*
10517                                  * The wrapped offset is less than the offset.
10518                                  * This can happen if we allocated buffer space
10519                                  * that induced a wrap, but then we didn't
10520                                  * subsequently take the space due to an error
10521                                  * or false predicate evaluation.  This is
10522                                  * okay; we know that _this_ allocation isn't
10523                                  * going to induce a wrap.  We still can't
10524                                  * reset the wrapped offset to be zero,
10525                                  * however: the space may have been trashed in
10526                                  * the previous failed probe attempt.  But at
10527                                  * least the wrapped offset doesn't need to
10528                                  * be adjusted at all...
10529                                  */
10530                                 goto out;
10531                         }
10532                 }
10533
                      /*
                       * Advance the wrapped offset one record at a time -- either
                       * a single DTRACE_EPIDNONE pad word or a whole record of
                       * the owning ECB's dte_size -- until it is clear of the
                       * space being reserved.
                       */
10534                 while (offs + total > woffs) {
10535                         dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10536                         size_t size;
10537
10538                         if (epid == DTRACE_EPIDNONE) {
10539                                 size = sizeof (uint32_t);
10540                         } else {
10541                                 ASSERT(epid <= state->dts_necbs);
10542                                 ASSERT(state->dts_ecbs[epid - 1] != NULL);
10543
10544                                 size = state->dts_ecbs[epid - 1]->dte_size;
10545                         }
10546
10547                         ASSERT(woffs + size <= buf->dtb_size);
10548                         ASSERT(size != 0);
10549
10550                         if (woffs + size == buf->dtb_size) {
10551                                 /*
10552                                  * We've reached the end of the buffer; we want
10553                                  * to set the wrapped offset to 0 and break
10554                                  * out.  However, if the offs is 0, then we're
10555                                  * in a strange edge-condition:  the amount of
10556                                  * space that we want to reserve plus the size
10557                                  * of the record that we're overwriting is
10558                                  * greater than the size of the buffer.  This
10559                                  * is problematic because if we reserve the
10560                                  * space but subsequently don't consume it (due
10561                                  * to a failed predicate or error) the wrapped
10562                                  * offset will be 0 -- yet the EPID at offset 0
10563                                  * will not be committed.  This situation is
10564                                  * relatively easy to deal with:  if we're in
10565                                  * this case, the buffer is indistinguishable
10566                                  * from one that hasn't wrapped; we need only
10567                                  * finish the job by clearing the wrapped bit,
10568                                  * explicitly setting the offset to be 0, and
10569                                  * zero'ing out the old data in the buffer.
10570                                  */
10571                                 if (offs == 0) {
10572                                         buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10573                                         buf->dtb_offset = 0;
10574                                         woffs = total;
10575
10576                                         while (woffs < buf->dtb_size)
10577                                                 tomax[woffs++] = 0;
10578                                 }
10579
10580                                 woffs = 0;
10581                                 break;
10582                         }
10583
10584                         woffs += size;
10585                 }
10586
10587                 /*
10588                  * We have a wrapped offset.  It may be that the wrapped offset
10589                  * has become zero -- that's okay.
10590                  */
10591                 buf->dtb_xamot_offset = woffs;
10592         }
10593
10594 out:
10595         /*
10596          * Now we can plow the buffer with any necessary padding.
10597          */
10598         while (offs & (align - 1)) {
10599                 /*
10600                  * Assert that our alignment is off by a number which
10601                  * is itself sizeof (uint32_t) aligned.
10602                  */
10603                 ASSERT(!((align - (offs & (align - 1))) &
10604                     (sizeof (uint32_t) - 1)));
10605                 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10606                 offs += sizeof (uint32_t);
10607         }
10608
              /*
               * A fill buffer keeps dts_reserve bytes in hand:  once a
               * reservation would intrude on them, the buffer is marked full
               * and the reservation fails.
               */
10609         if (buf->dtb_flags & DTRACEBUF_FILL) {
10610                 if (offs + needed > buf->dtb_size - state->dts_reserve) {
10611                         buf->dtb_flags |= DTRACEBUF_FULL;
10612                         return (-1);
10613                 }
10614         }
10615
10616         if (mstate == NULL)
10617                 return (offs);
10618
10619         /*
10620          * For ring buffers and fill buffers, the scratch space is always
10621          * the inactive buffer.
10622          */
10623         mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10624         mstate->dtms_scratch_size = buf->dtb_size;
10625         mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10626
10627         return (offs);
10628 }
10629
10630 static void
10631 dtrace_buffer_polish(dtrace_buffer_t *buf)
10632 {
10633         ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10634         ASSERT(MUTEX_HELD(&dtrace_lock));
10635
10636         if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10637                 return;
10638
10639         /*
10640          * We need to polish the ring buffer.  There are three cases:
10641          *
10642          * - The first (and presumably most common) is that there is no gap
10643          *   between the buffer offset and the wrapped offset.  In this case,
10644          *   there is nothing in the buffer that isn't valid data; we can
10645          *   mark the buffer as polished and return.
10646          *
10647          * - The second (less common than the first but still more common
10648          *   than the third) is that there is a gap between the buffer offset
10649          *   and the wrapped offset, and the wrapped offset is larger than the
10650          *   buffer offset.  This can happen because of an alignment issue, or
10651          *   can happen because of a call to dtrace_buffer_reserve() that
10652          *   didn't subsequently consume the buffer space.  In this case,
10653          *   we need to zero the data from the buffer offset to the wrapped
10654          *   offset.
10655          *
10656          * - The third (and least common) is that there is a gap between the
10657          *   buffer offset and the wrapped offset, but the wrapped offset is
10658          *   _less_ than the buffer offset.  This can only happen because a
10659          *   call to dtrace_buffer_reserve() induced a wrap, but the space
10660          *   was not subsequently consumed.  In this case, we need to zero the
10661          *   space from the offset to the end of the buffer _and_ from the
10662          *   top of the buffer to the wrapped offset.
10663          */
10664         if (buf->dtb_offset < buf->dtb_xamot_offset) {
10665                 bzero(buf->dtb_tomax + buf->dtb_offset,
10666                     buf->dtb_xamot_offset - buf->dtb_offset);
10667         }
10668
10669         if (buf->dtb_offset > buf->dtb_xamot_offset) {
10670                 bzero(buf->dtb_tomax + buf->dtb_offset,
10671                     buf->dtb_size - buf->dtb_offset);
10672                 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10673         }
10674 }
10675
10676 /*
10677  * This routine determines if data generated at the specified time has likely
10678  * been entirely consumed at user-level.  This routine is called to determine
10679  * if an ECB on a defunct probe (but for an active enabling) can be safely
10680  * disabled and destroyed.
10681  */
10682 static int
10683 dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
10684 {
10685         int i;
10686
10687         for (i = 0; i < NCPU; i++) {
10688                 dtrace_buffer_t *buf = &bufs[i];
10689
10690                 if (buf->dtb_size == 0)
10691                         continue;
10692
10693                 if (buf->dtb_flags & DTRACEBUF_RING)
10694                         return (0);
10695
10696                 if (!buf->dtb_switched && buf->dtb_offset != 0)
10697                         return (0);
10698
10699                 if (buf->dtb_switched - buf->dtb_interval < when)
10700                         return (0);
10701         }
10702
10703         return (1);
10704 }
10705
10706 static void
10707 dtrace_buffer_free(dtrace_buffer_t *bufs)
10708 {
10709         int i;
10710
10711         for (i = 0; i < NCPU; i++) {
10712                 dtrace_buffer_t *buf = &bufs[i];
10713
10714                 if (buf->dtb_tomax == NULL) {
10715                         ASSERT(buf->dtb_xamot == NULL);
10716                         ASSERT(buf->dtb_size == 0);
10717                         continue;
10718                 }
10719
10720                 if (buf->dtb_xamot != NULL) {
10721                         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10722                         kmem_free(buf->dtb_xamot, buf->dtb_size);
10723                 }
10724
10725                 kmem_free(buf->dtb_tomax, buf->dtb_size);
10726                 buf->dtb_size = 0;
10727                 buf->dtb_tomax = NULL;
10728                 buf->dtb_xamot = NULL;
10729         }
10730 }
10731
10732 /*
10733  * DTrace Enabling Functions
10734  */
10735 static dtrace_enabling_t *
10736 dtrace_enabling_create(dtrace_vstate_t *vstate)
10737 {
10738         dtrace_enabling_t *enab;
10739
10740         enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
10741         enab->dten_vstate = vstate;
10742
10743         return (enab);
10744 }
10745
10746 static void
10747 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
10748 {
10749         dtrace_ecbdesc_t **ndesc;
10750         size_t osize, nsize;
10751
10752         /*
10753          * We can't add to enablings after we've enabled them, or after we've
10754          * retained them.
10755          */
10756         ASSERT(enab->dten_probegen == 0);
10757         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10758
10759         if (enab->dten_ndesc < enab->dten_maxdesc) {
10760                 enab->dten_desc[enab->dten_ndesc++] = ecb;
10761                 return;
10762         }
10763
10764         osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
10765
10766         if (enab->dten_maxdesc == 0) {
10767                 enab->dten_maxdesc = 1;
10768         } else {
10769                 enab->dten_maxdesc <<= 1;
10770         }
10771
10772         ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
10773
10774         nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
10775         ndesc = kmem_zalloc(nsize, KM_SLEEP);
10776         bcopy(enab->dten_desc, ndesc, osize);
10777         kmem_free(enab->dten_desc, osize);
10778
10779         enab->dten_desc = ndesc;
10780         enab->dten_desc[enab->dten_ndesc++] = ecb;
10781 }
10782
10783 static void
10784 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
10785     dtrace_probedesc_t *pd)
10786 {
10787         dtrace_ecbdesc_t *new;
10788         dtrace_predicate_t *pred;
10789         dtrace_actdesc_t *act;
10790
10791         /*
10792          * We're going to create a new ECB description that matches the
10793          * specified ECB in every way, but has the specified probe description.
10794          */
10795         new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
10796
10797         if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
10798                 dtrace_predicate_hold(pred);
10799
10800         for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
10801                 dtrace_actdesc_hold(act);
10802
10803         new->dted_action = ecb->dted_action;
10804         new->dted_pred = ecb->dted_pred;
10805         new->dted_probe = *pd;
10806         new->dted_uarg = ecb->dted_uarg;
10807
10808         dtrace_enabling_add(enab, new);
10809 }
10810
10811 static void
10812 dtrace_enabling_dump(dtrace_enabling_t *enab)
10813 {
10814         int i;
10815
10816         for (i = 0; i < enab->dten_ndesc; i++) {
10817                 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
10818
10819                 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
10820                     desc->dtpd_provider, desc->dtpd_mod,
10821                     desc->dtpd_func, desc->dtpd_name);
10822         }
10823 }
10824
10825 static void
10826 dtrace_enabling_destroy(dtrace_enabling_t *enab)
10827 {
10828         int i;
10829         dtrace_ecbdesc_t *ep;
10830         dtrace_vstate_t *vstate = enab->dten_vstate;
10831
10832         ASSERT(MUTEX_HELD(&dtrace_lock));
10833
10834         for (i = 0; i < enab->dten_ndesc; i++) {
10835                 dtrace_actdesc_t *act, *next;
10836                 dtrace_predicate_t *pred;
10837
10838                 ep = enab->dten_desc[i];
10839
10840                 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
10841                         dtrace_predicate_release(pred, vstate);
10842
10843                 for (act = ep->dted_action; act != NULL; act = next) {
10844                         next = act->dtad_next;
10845                         dtrace_actdesc_release(act, vstate);
10846                 }
10847
10848                 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
10849         }
10850
10851         kmem_free(enab->dten_desc,
10852             enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
10853
10854         /*
10855          * If this was a retained enabling, decrement the dts_nretained count
10856          * and take it off of the dtrace_retained list.
10857          */
10858         if (enab->dten_prev != NULL || enab->dten_next != NULL ||
10859             dtrace_retained == enab) {
10860                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
10861                 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
10862                 enab->dten_vstate->dtvs_state->dts_nretained--;
10863                 dtrace_retained_gen++;
10864         }
10865
10866         if (enab->dten_prev == NULL) {
10867                 if (dtrace_retained == enab) {
10868                         dtrace_retained = enab->dten_next;
10869
10870                         if (dtrace_retained != NULL)
10871                                 dtrace_retained->dten_prev = NULL;
10872                 }
10873         } else {
10874                 ASSERT(enab != dtrace_retained);
10875                 ASSERT(dtrace_retained != NULL);
10876                 enab->dten_prev->dten_next = enab->dten_next;
10877         }
10878
10879         if (enab->dten_next != NULL) {
10880                 ASSERT(dtrace_retained != NULL);
10881                 enab->dten_next->dten_prev = enab->dten_prev;
10882         }
10883
10884         kmem_free(enab, sizeof (dtrace_enabling_t));
10885 }
10886
10887 static int
10888 dtrace_enabling_retain(dtrace_enabling_t *enab)
10889 {
10890         dtrace_state_t *state;
10891
10892         ASSERT(MUTEX_HELD(&dtrace_lock));
10893         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
10894         ASSERT(enab->dten_vstate != NULL);
10895
10896         state = enab->dten_vstate->dtvs_state;
10897         ASSERT(state != NULL);
10898
10899         /*
10900          * We only allow each state to retain dtrace_retain_max enablings.
10901          */
10902         if (state->dts_nretained >= dtrace_retain_max)
10903                 return (ENOSPC);
10904
10905         state->dts_nretained++;
10906         dtrace_retained_gen++;
10907
10908         if (dtrace_retained == NULL) {
10909                 dtrace_retained = enab;
10910                 return (0);
10911         }
10912
10913         enab->dten_next = dtrace_retained;
10914         dtrace_retained->dten_prev = enab;
10915         dtrace_retained = enab;
10916
10917         return (0);
10918 }
10919
10920 static int
10921 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
10922     dtrace_probedesc_t *create)
10923 {
10924         dtrace_enabling_t *new, *enab;
10925         int found = 0, err = ENOENT;
10926
10927         ASSERT(MUTEX_HELD(&dtrace_lock));
10928         ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
10929         ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
10930         ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
10931         ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
10932
10933         new = dtrace_enabling_create(&state->dts_vstate);
10934
10935         /*
10936          * Iterate over all retained enablings, looking for enablings that
10937          * match the specified state.
10938          */
10939         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
10940                 int i;
10941
10942                 /*
10943                  * dtvs_state can only be NULL for helper enablings -- and
10944                  * helper enablings can't be retained.
10945                  */
10946                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
10947
10948                 if (enab->dten_vstate->dtvs_state != state)
10949                         continue;
10950
10951                 /*
10952                  * Now iterate over each probe description; we're looking for
10953                  * an exact match to the specified probe description.
10954                  */
10955                 for (i = 0; i < enab->dten_ndesc; i++) {
10956                         dtrace_ecbdesc_t *ep = enab->dten_desc[i];
10957                         dtrace_probedesc_t *pd = &ep->dted_probe;
10958
10959                         if (strcmp(pd->dtpd_provider, match->dtpd_provider))
10960                                 continue;
10961
10962                         if (strcmp(pd->dtpd_mod, match->dtpd_mod))
10963                                 continue;
10964
10965                         if (strcmp(pd->dtpd_func, match->dtpd_func))
10966                                 continue;
10967
10968                         if (strcmp(pd->dtpd_name, match->dtpd_name))
10969                                 continue;
10970
10971                         /*
10972                          * We have a winning probe!  Add it to our growing
10973                          * enabling.
10974                          */
10975                         found = 1;
10976                         dtrace_enabling_addlike(new, ep, create);
10977                 }
10978         }
10979
10980         if (!found || (err = dtrace_enabling_retain(new)) != 0) {
10981                 dtrace_enabling_destroy(new);
10982                 return (err);
10983         }
10984
10985         return (0);
10986 }
10987
10988 static void
10989 dtrace_enabling_retract(dtrace_state_t *state)
10990 {
10991         dtrace_enabling_t *enab, *next;
10992
10993         ASSERT(MUTEX_HELD(&dtrace_lock));
10994
10995         /*
10996          * Iterate over all retained enablings, destroy the enablings retained
10997          * for the specified state.
10998          */
10999         for (enab = dtrace_retained; enab != NULL; enab = next) {
11000                 next = enab->dten_next;
11001
11002                 /*
11003                  * dtvs_state can only be NULL for helper enablings -- and
11004                  * helper enablings can't be retained.
11005                  */
11006                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11007
11008                 if (enab->dten_vstate->dtvs_state == state) {
11009                         ASSERT(state->dts_nretained > 0);
11010                         dtrace_enabling_destroy(enab);
11011                 }
11012         }
11013
11014         ASSERT(state->dts_nretained == 0);
11015 }
11016
11017 static int
11018 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11019 {
11020         int i = 0;
11021         int total_matched = 0, matched = 0;
11022
11023         ASSERT(MUTEX_HELD(&cpu_lock));
11024         ASSERT(MUTEX_HELD(&dtrace_lock));
11025
11026         for (i = 0; i < enab->dten_ndesc; i++) {
11027                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11028
11029                 enab->dten_current = ep;
11030                 enab->dten_error = 0;
11031
11032                 /*
11033                  * If a provider failed to enable a probe then get out and
11034                  * let the consumer know we failed.
11035                  */
11036                 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
11037                         return (EBUSY);
11038
11039                 total_matched += matched;
11040
11041                 if (enab->dten_error != 0) {
11042                         /*
11043                          * If we get an error half-way through enabling the
11044                          * probes, we kick out -- perhaps with some number of
11045                          * them enabled.  Leaving enabled probes enabled may
11046                          * be slightly confusing for user-level, but we expect
11047                          * that no one will attempt to actually drive on in
11048                          * the face of such errors.  If this is an anonymous
11049                          * enabling (indicated with a NULL nmatched pointer),
11050                          * we cmn_err() a message.  We aren't expecting to
11051                          * get such an error -- such as it can exist at all,
11052                          * it would be a result of corrupted DOF in the driver
11053                          * properties.
11054                          */
11055                         if (nmatched == NULL) {
11056                                 cmn_err(CE_WARN, "dtrace_enabling_match() "
11057                                     "error on %p: %d", (void *)ep,
11058                                     enab->dten_error);
11059                         }
11060
11061                         return (enab->dten_error);
11062                 }
11063         }
11064
11065         enab->dten_probegen = dtrace_probegen;
11066         if (nmatched != NULL)
11067                 *nmatched = total_matched;
11068
11069         return (0);
11070 }
11071
11072 static void
11073 dtrace_enabling_matchall(void)
11074 {
11075         dtrace_enabling_t *enab;
11076
11077         mutex_enter(&cpu_lock);
11078         mutex_enter(&dtrace_lock);
11079
11080         /*
11081          * Iterate over all retained enablings to see if any probes match
11082          * against them.  We only perform this operation on enablings for which
11083          * we have sufficient permissions by virtue of being in the global zone
11084          * or in the same zone as the DTrace client.  Because we can be called
11085          * after dtrace_detach() has been called, we cannot assert that there
11086          * are retained enablings.  We can safely load from dtrace_retained,
11087          * however:  the taskq_destroy() at the end of dtrace_detach() will
11088          * block pending our completion.
11089          */
11090         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11091                 dtrace_cred_t *dcr = &enab->dten_vstate->dtvs_state->dts_cred;
11092                 cred_t *cr = dcr->dcr_cred;
11093                 zoneid_t zone = cr != NULL ? crgetzoneid(cr) : 0;
11094
11095                 if ((dcr->dcr_visible & DTRACE_CRV_ALLZONE) || (cr != NULL &&
11096                     (zone == GLOBAL_ZONEID || getzoneid() == zone)))
11097                         (void) dtrace_enabling_match(enab, NULL);
11098         }
11099
11100         mutex_exit(&dtrace_lock);
11101         mutex_exit(&cpu_lock);
11102 }
11103
11104 /*
11105  * If an enabling is to be enabled without having matched probes (that is, if
11106  * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11107  * enabling must be _primed_ by creating an ECB for every ECB description.
11108  * This must be done to assure that we know the number of speculations, the
11109  * number of aggregations, the minimum buffer size needed, etc. before we
11110  * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
11111  * enabling any probes, we create ECBs for every ECB decription, but with a
11112  * NULL probe -- which is exactly what this function does.
11113  */
11114 static void
11115 dtrace_enabling_prime(dtrace_state_t *state)
11116 {
11117         dtrace_enabling_t *enab;
11118         int i;
11119
11120         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11121                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11122
11123                 if (enab->dten_vstate->dtvs_state != state)
11124                         continue;
11125
11126                 /*
11127                  * We don't want to prime an enabling more than once, lest
11128                  * we allow a malicious user to induce resource exhaustion.
11129                  * (The ECBs that result from priming an enabling aren't
11130                  * leaked -- but they also aren't deallocated until the
11131                  * consumer state is destroyed.)
11132                  */
11133                 if (enab->dten_primed)
11134                         continue;
11135
11136                 for (i = 0; i < enab->dten_ndesc; i++) {
11137                         enab->dten_current = enab->dten_desc[i];
11138                         (void) dtrace_probe_enable(NULL, enab);
11139                 }
11140
11141                 enab->dten_primed = 1;
11142         }
11143 }
11144
11145 /*
11146  * Called to indicate that probes should be provided due to retained
11147  * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
11148  * must take an initial lap through the enabling calling the dtps_provide()
11149  * entry point explicitly to allow for autocreated probes.
11150  */
11151 static void
11152 dtrace_enabling_provide(dtrace_provider_t *prv)
11153 {
11154         int i, all = 0;
11155         dtrace_probedesc_t desc;
11156         dtrace_genid_t gen;
11157
11158         ASSERT(MUTEX_HELD(&dtrace_lock));
11159         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
11160
11161         if (prv == NULL) {
11162                 all = 1;
11163                 prv = dtrace_provider;
11164         }
11165
11166         do {
11167                 dtrace_enabling_t *enab;
11168                 void *parg = prv->dtpv_arg;
11169
11170 retry:
11171                 gen = dtrace_retained_gen;
11172                 for (enab = dtrace_retained; enab != NULL;
11173                     enab = enab->dten_next) {
11174                         for (i = 0; i < enab->dten_ndesc; i++) {
11175                                 desc = enab->dten_desc[i]->dted_probe;
11176                                 mutex_exit(&dtrace_lock);
11177                                 prv->dtpv_pops.dtps_provide(parg, &desc);
11178                                 mutex_enter(&dtrace_lock);
11179                                 /*
11180                                  * Process the retained enablings again if
11181                                  * they have changed while we weren't holding
11182                                  * dtrace_lock.
11183                                  */
11184                                 if (gen != dtrace_retained_gen)
11185                                         goto retry;
11186                         }
11187                 }
11188         } while (all && (prv = prv->dtpv_next) != NULL);
11189
11190         mutex_exit(&dtrace_lock);
11191         dtrace_probe_provide(NULL, all ? NULL : prv);
11192         mutex_enter(&dtrace_lock);
11193 }
11194
11195 /*
11196  * Called to reap ECBs that are attached to probes from defunct providers.
11197  */
11198 static void
11199 dtrace_enabling_reap(void)
11200 {
11201         dtrace_provider_t *prov;
11202         dtrace_probe_t *probe;
11203         dtrace_ecb_t *ecb;
11204         hrtime_t when;
11205         int i;
11206
11207         mutex_enter(&cpu_lock);
11208         mutex_enter(&dtrace_lock);
11209
11210         for (i = 0; i < dtrace_nprobes; i++) {
11211                 if ((probe = dtrace_probes[i]) == NULL)
11212                         continue;
11213
11214                 if (probe->dtpr_ecb == NULL)
11215                         continue;
11216
11217                 prov = probe->dtpr_provider;
11218
11219                 if ((when = prov->dtpv_defunct) == 0)
11220                         continue;
11221
11222                 /*
11223                  * We have ECBs on a defunct provider:  we want to reap these
11224                  * ECBs to allow the provider to unregister.  The destruction
11225                  * of these ECBs must be done carefully:  if we destroy the ECB
11226                  * and the consumer later wishes to consume an EPID that
11227                  * corresponds to the destroyed ECB (and if the EPID metadata
11228                  * has not been previously consumed), the consumer will abort
11229                  * processing on the unknown EPID.  To reduce (but not, sadly,
11230                  * eliminate) the possibility of this, we will only destroy an
11231                  * ECB for a defunct provider if, for the state that
11232                  * corresponds to the ECB:
11233                  *
11234                  *  (a) There is no speculative tracing (which can effectively
11235                  *      cache an EPID for an arbitrary amount of time).
11236                  *
11237                  *  (b) The principal buffers have been switched twice since the
11238                  *      provider became defunct.
11239                  *
11240                  *  (c) The aggregation buffers are of zero size or have been
11241                  *      switched twice since the provider became defunct.
11242                  *
11243                  * We use dts_speculates to determine (a) and call a function
11244                  * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
11245                  * that as soon as we've been unable to destroy one of the ECBs
11246                  * associated with the probe, we quit trying -- reaping is only
11247                  * fruitful in as much as we can destroy all ECBs associated
11248                  * with the defunct provider's probes.
11249                  */
11250                 while ((ecb = probe->dtpr_ecb) != NULL) {
11251                         dtrace_state_t *state = ecb->dte_state;
11252                         dtrace_buffer_t *buf = state->dts_buffer;
11253                         dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
11254
11255                         if (state->dts_speculates)
11256                                 break;
11257
11258                         if (!dtrace_buffer_consumed(buf, when))
11259                                 break;
11260
11261                         if (!dtrace_buffer_consumed(aggbuf, when))
11262                                 break;
11263
11264                         dtrace_ecb_disable(ecb);
11265                         ASSERT(probe->dtpr_ecb != ecb);
11266                         dtrace_ecb_destroy(ecb);
11267                 }
11268         }
11269
11270         mutex_exit(&dtrace_lock);
11271         mutex_exit(&cpu_lock);
11272 }
11273
11274 /*
11275  * DTrace DOF Functions
11276  */
11277 /*ARGSUSED*/
11278 static void
11279 dtrace_dof_error(dof_hdr_t *dof, const char *str)
11280 {
11281         if (dtrace_err_verbose)
11282                 cmn_err(CE_WARN, "failed to process DOF: %s", str);
11283
11284 #ifdef DTRACE_ERRDEBUG
11285         dtrace_errdebug(str);
11286 #endif
11287 }
11288
11289 /*
11290  * Create DOF out of a currently enabled state.  Right now, we only create
11291  * DOF containing the run-time options -- but this could be expanded to create
11292  * complete DOF representing the enabled state.
11293  */
11294 static dof_hdr_t *
11295 dtrace_dof_create(dtrace_state_t *state)
11296 {
11297         dof_hdr_t *dof;
11298         dof_sec_t *sec;
11299         dof_optdesc_t *opt;
11300         int i, len = sizeof (dof_hdr_t) +
11301             roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11302             sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11303
11304         ASSERT(MUTEX_HELD(&dtrace_lock));
11305
11306         dof = kmem_zalloc(len, KM_SLEEP);
11307         dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11308         dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11309         dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11310         dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11311
11312         dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11313         dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11314         dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11315         dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11316         dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11317         dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11318
11319         dof->dofh_flags = 0;
11320         dof->dofh_hdrsize = sizeof (dof_hdr_t);
11321         dof->dofh_secsize = sizeof (dof_sec_t);
11322         dof->dofh_secnum = 1;   /* only DOF_SECT_OPTDESC */
11323         dof->dofh_secoff = sizeof (dof_hdr_t);
11324         dof->dofh_loadsz = len;
11325         dof->dofh_filesz = len;
11326         dof->dofh_pad = 0;
11327
11328         /*
11329          * Fill in the option section header...
11330          */
11331         sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11332         sec->dofs_type = DOF_SECT_OPTDESC;
11333         sec->dofs_align = sizeof (uint64_t);
11334         sec->dofs_flags = DOF_SECF_LOAD;
11335         sec->dofs_entsize = sizeof (dof_optdesc_t);
11336
11337         opt = (dof_optdesc_t *)((uintptr_t)sec +
11338             roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11339
11340         sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11341         sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11342
11343         for (i = 0; i < DTRACEOPT_MAX; i++) {
11344                 opt[i].dofo_option = i;
11345                 opt[i].dofo_strtab = DOF_SECIDX_NONE;
11346                 opt[i].dofo_value = state->dts_options[i];
11347         }
11348
11349         return (dof);
11350 }
11351
11352 static dof_hdr_t *
11353 dtrace_dof_copyin(uintptr_t uarg, int *errp)
11354 {
11355         dof_hdr_t hdr, *dof;
11356
11357         ASSERT(!MUTEX_HELD(&dtrace_lock));
11358
11359         /*
11360          * First, we're going to copyin() the sizeof (dof_hdr_t).
11361          */
11362         if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
11363                 dtrace_dof_error(NULL, "failed to copyin DOF header");
11364                 *errp = EFAULT;
11365                 return (NULL);
11366         }
11367
11368         /*
11369          * Now we'll allocate the entire DOF and copy it in -- provided
11370          * that the length isn't outrageous.
11371          */
11372         if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
11373                 dtrace_dof_error(&hdr, "load size exceeds maximum");
11374                 *errp = E2BIG;
11375                 return (NULL);
11376         }
11377
11378         if (hdr.dofh_loadsz < sizeof (hdr)) {
11379                 dtrace_dof_error(&hdr, "invalid load size");
11380                 *errp = EINVAL;
11381                 return (NULL);
11382         }
11383
11384         dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
11385
11386         if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
11387             dof->dofh_loadsz != hdr.dofh_loadsz) {
11388                 kmem_free(dof, hdr.dofh_loadsz);
11389                 *errp = EFAULT;
11390                 return (NULL);
11391         }
11392
11393         return (dof);
11394 }
11395
11396 static dof_hdr_t *
11397 dtrace_dof_property(const char *name)
11398 {
11399         uchar_t *buf;
11400         uint64_t loadsz;
11401         unsigned int len, i;
11402         dof_hdr_t *dof;
11403
11404         /*
11405          * Unfortunately, array of values in .conf files are always (and
11406          * only) interpreted to be integer arrays.  We must read our DOF
11407          * as an integer array, and then squeeze it into a byte array.
11408          */
11409         if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11410             (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11411                 return (NULL);
11412
11413         for (i = 0; i < len; i++)
11414                 buf[i] = (uchar_t)(((int *)buf)[i]);
11415
11416         if (len < sizeof (dof_hdr_t)) {
11417                 ddi_prop_free(buf);
11418                 dtrace_dof_error(NULL, "truncated header");
11419                 return (NULL);
11420         }
11421
11422         if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11423                 ddi_prop_free(buf);
11424                 dtrace_dof_error(NULL, "truncated DOF");
11425                 return (NULL);
11426         }
11427
11428         if (loadsz >= dtrace_dof_maxsize) {
11429                 ddi_prop_free(buf);
11430                 dtrace_dof_error(NULL, "oversized DOF");
11431                 return (NULL);
11432         }
11433
11434         dof = kmem_alloc(loadsz, KM_SLEEP);
11435         bcopy(buf, dof, loadsz);
11436         ddi_prop_free(buf);
11437
11438         return (dof);
11439 }
11440
11441 static void
11442 dtrace_dof_destroy(dof_hdr_t *dof)
11443 {
11444         kmem_free(dof, dof->dofh_loadsz);
11445 }
11446
11447 /*
11448  * Return the dof_sec_t pointer corresponding to a given section index.  If the
11449  * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
11450  * a type other than DOF_SECT_NONE is specified, the header is checked against
11451  * this type and NULL is returned if the types do not match.
11452  */
11453 static dof_sec_t *
11454 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11455 {
11456         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
11457             ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11458
11459         if (i >= dof->dofh_secnum) {
11460                 dtrace_dof_error(dof, "referenced section index is invalid");
11461                 return (NULL);
11462         }
11463
11464         if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11465                 dtrace_dof_error(dof, "referenced section is not loadable");
11466                 return (NULL);
11467         }
11468
11469         if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11470                 dtrace_dof_error(dof, "referenced section is the wrong type");
11471                 return (NULL);
11472         }
11473
11474         return (sec);
11475 }
11476
11477 static dtrace_probedesc_t *
11478 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11479 {
11480         dof_probedesc_t *probe;
11481         dof_sec_t *strtab;
11482         uintptr_t daddr = (uintptr_t)dof;
11483         uintptr_t str;
11484         size_t size;
11485
11486         if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11487                 dtrace_dof_error(dof, "invalid probe section");
11488                 return (NULL);
11489         }
11490
11491         if (sec->dofs_align != sizeof (dof_secidx_t)) {
11492                 dtrace_dof_error(dof, "bad alignment in probe description");
11493                 return (NULL);
11494         }
11495
11496         if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11497                 dtrace_dof_error(dof, "truncated probe description");
11498                 return (NULL);
11499         }
11500
11501         probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11502         strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11503
11504         if (strtab == NULL)
11505                 return (NULL);
11506
11507         str = daddr + strtab->dofs_offset;
11508         size = strtab->dofs_size;
11509
11510         if (probe->dofp_provider >= strtab->dofs_size) {
11511                 dtrace_dof_error(dof, "corrupt probe provider");
11512                 return (NULL);
11513         }
11514
11515         (void) strncpy(desc->dtpd_provider,
11516             (char *)(str + probe->dofp_provider),
11517             MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11518
11519         if (probe->dofp_mod >= strtab->dofs_size) {
11520                 dtrace_dof_error(dof, "corrupt probe module");
11521                 return (NULL);
11522         }
11523
11524         (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11525             MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11526
11527         if (probe->dofp_func >= strtab->dofs_size) {
11528                 dtrace_dof_error(dof, "corrupt probe function");
11529                 return (NULL);
11530         }
11531
11532         (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11533             MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11534
11535         if (probe->dofp_name >= strtab->dofs_size) {
11536                 dtrace_dof_error(dof, "corrupt probe name");
11537                 return (NULL);
11538         }
11539
11540         (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11541             MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11542
11543         return (desc);
11544 }
11545
11546 static dtrace_difo_t *
11547 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11548     cred_t *cr)
11549 {
11550         dtrace_difo_t *dp;
11551         size_t ttl = 0;
11552         dof_difohdr_t *dofd;
11553         uintptr_t daddr = (uintptr_t)dof;
11554         size_t max = dtrace_difo_maxsize;
11555         int i, l, n;
11556
11557         static const struct {
11558                 int section;
11559                 int bufoffs;
11560                 int lenoffs;
11561                 int entsize;
11562                 int align;
11563                 const char *msg;
11564         } difo[] = {
11565                 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11566                 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11567                 sizeof (dif_instr_t), "multiple DIF sections" },
11568
11569                 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11570                 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11571                 sizeof (uint64_t), "multiple integer tables" },
11572
11573                 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11574                 offsetof(dtrace_difo_t, dtdo_strlen), 0,
11575                 sizeof (char), "multiple string tables" },
11576
11577                 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11578                 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11579                 sizeof (uint_t), "multiple variable tables" },
11580
11581                 { DOF_SECT_NONE, 0, 0, 0, NULL }
11582         };
11583
11584         if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11585                 dtrace_dof_error(dof, "invalid DIFO header section");
11586                 return (NULL);
11587         }
11588
11589         if (sec->dofs_align != sizeof (dof_secidx_t)) {
11590                 dtrace_dof_error(dof, "bad alignment in DIFO header");
11591                 return (NULL);
11592         }
11593
11594         if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11595             sec->dofs_size % sizeof (dof_secidx_t)) {
11596                 dtrace_dof_error(dof, "bad size in DIFO header");
11597                 return (NULL);
11598         }
11599
11600         dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11601         n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
11602
11603         dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11604         dp->dtdo_rtype = dofd->dofd_rtype;
11605
11606         for (l = 0; l < n; l++) {
11607                 dof_sec_t *subsec;
11608                 void **bufp;
11609                 uint32_t *lenp;
11610
11611                 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11612                     dofd->dofd_links[l])) == NULL)
11613                         goto err; /* invalid section link */
11614
11615                 if (ttl + subsec->dofs_size > max) {
11616                         dtrace_dof_error(dof, "exceeds maximum size");
11617                         goto err;
11618                 }
11619
11620                 ttl += subsec->dofs_size;
11621
11622                 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11623                         if (subsec->dofs_type != difo[i].section)
11624                                 continue;
11625
11626                         if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11627                                 dtrace_dof_error(dof, "section not loaded");
11628                                 goto err;
11629                         }
11630
11631                         if (subsec->dofs_align != difo[i].align) {
11632                                 dtrace_dof_error(dof, "bad alignment");
11633                                 goto err;
11634                         }
11635
11636                         bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11637                         lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11638
11639                         if (*bufp != NULL) {
11640                                 dtrace_dof_error(dof, difo[i].msg);
11641                                 goto err;
11642                         }
11643
11644                         if (difo[i].entsize != subsec->dofs_entsize) {
11645                                 dtrace_dof_error(dof, "entry size mismatch");
11646                                 goto err;
11647                         }
11648
11649                         if (subsec->dofs_entsize != 0 &&
11650                             (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11651                                 dtrace_dof_error(dof, "corrupt entry size");
11652                                 goto err;
11653                         }
11654
11655                         *lenp = subsec->dofs_size;
11656                         *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11657                         bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11658                             *bufp, subsec->dofs_size);
11659
11660                         if (subsec->dofs_entsize != 0)
11661                                 *lenp /= subsec->dofs_entsize;
11662
11663                         break;
11664                 }
11665
11666                 /*
11667                  * If we encounter a loadable DIFO sub-section that is not
11668                  * known to us, assume this is a broken program and fail.
11669                  */
11670                 if (difo[i].section == DOF_SECT_NONE &&
11671                     (subsec->dofs_flags & DOF_SECF_LOAD)) {
11672                         dtrace_dof_error(dof, "unrecognized DIFO subsection");
11673                         goto err;
11674                 }
11675         }
11676
11677         if (dp->dtdo_buf == NULL) {
11678                 /*
11679                  * We can't have a DIF object without DIF text.
11680                  */
11681                 dtrace_dof_error(dof, "missing DIF text");
11682                 goto err;
11683         }
11684
11685         /*
11686          * Before we validate the DIF object, run through the variable table
11687          * looking for the strings -- if any of their size are under, we'll set
11688          * their size to be the system-wide default string size.  Note that
11689          * this should _not_ happen if the "strsize" option has been set --
11690          * in this case, the compiler should have set the size to reflect the
11691          * setting of the option.
11692          */
11693         for (i = 0; i < dp->dtdo_varlen; i++) {
11694                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
11695                 dtrace_diftype_t *t = &v->dtdv_type;
11696
11697                 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11698                         continue;
11699
11700                 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11701                         t->dtdt_size = dtrace_strsize_default;
11702         }
11703
11704         if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11705                 goto err;
11706
11707         dtrace_difo_init(dp, vstate);
11708         return (dp);
11709
11710 err:
11711         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11712         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11713         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11714         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11715
11716         kmem_free(dp, sizeof (dtrace_difo_t));
11717         return (NULL);
11718 }
11719
11720 static dtrace_predicate_t *
11721 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11722     cred_t *cr)
11723 {
11724         dtrace_difo_t *dp;
11725
11726         if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11727                 return (NULL);
11728
11729         return (dtrace_predicate_create(dp));
11730 }
11731
11732 static dtrace_actdesc_t *
11733 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11734     cred_t *cr)
11735 {
11736         dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
11737         dof_actdesc_t *desc;
11738         dof_sec_t *difosec;
11739         size_t offs;
11740         uintptr_t daddr = (uintptr_t)dof;
11741         uint64_t arg;
11742         dtrace_actkind_t kind;
11743
11744         if (sec->dofs_type != DOF_SECT_ACTDESC) {
11745                 dtrace_dof_error(dof, "invalid action section");
11746                 return (NULL);
11747         }
11748
11749         if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
11750                 dtrace_dof_error(dof, "truncated action description");
11751                 return (NULL);
11752         }
11753
11754         if (sec->dofs_align != sizeof (uint64_t)) {
11755                 dtrace_dof_error(dof, "bad alignment in action description");
11756                 return (NULL);
11757         }
11758
11759         if (sec->dofs_size < sec->dofs_entsize) {
11760                 dtrace_dof_error(dof, "section entry size exceeds total size");
11761                 return (NULL);
11762         }
11763
11764         if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
11765                 dtrace_dof_error(dof, "bad entry size in action description");
11766                 return (NULL);
11767         }
11768
11769         if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
11770                 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
11771                 return (NULL);
11772         }
11773
11774         for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
11775                 desc = (dof_actdesc_t *)(daddr +
11776                     (uintptr_t)sec->dofs_offset + offs);
11777                 kind = (dtrace_actkind_t)desc->dofa_kind;
11778
11779                 if (DTRACEACT_ISPRINTFLIKE(kind) &&
11780                     (kind != DTRACEACT_PRINTA ||
11781                     desc->dofa_strtab != DOF_SECIDX_NONE)) {
11782                         dof_sec_t *strtab;
11783                         char *str, *fmt;
11784                         uint64_t i;
11785
11786                         /*
11787                          * printf()-like actions must have a format string.
11788                          */
11789                         if ((strtab = dtrace_dof_sect(dof,
11790                             DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
11791                                 goto err;
11792
11793                         str = (char *)((uintptr_t)dof +
11794                             (uintptr_t)strtab->dofs_offset);
11795
11796                         for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
11797                                 if (str[i] == '\0')
11798                                         break;
11799                         }
11800
11801                         if (i >= strtab->dofs_size) {
11802                                 dtrace_dof_error(dof, "bogus format string");
11803                                 goto err;
11804                         }
11805
11806                         if (i == desc->dofa_arg) {
11807                                 dtrace_dof_error(dof, "empty format string");
11808                                 goto err;
11809                         }
11810
11811                         i -= desc->dofa_arg;
11812                         fmt = kmem_alloc(i + 1, KM_SLEEP);
11813                         bcopy(&str[desc->dofa_arg], fmt, i + 1);
11814                         arg = (uint64_t)(uintptr_t)fmt;
11815                 } else {
11816                         if (kind == DTRACEACT_PRINTA) {
11817                                 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
11818                                 arg = 0;
11819                         } else {
11820                                 arg = desc->dofa_arg;
11821                         }
11822                 }
11823
11824                 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
11825                     desc->dofa_uarg, arg);
11826
11827                 if (last != NULL) {
11828                         last->dtad_next = act;
11829                 } else {
11830                         first = act;
11831                 }
11832
11833                 last = act;
11834
11835                 if (desc->dofa_difo == DOF_SECIDX_NONE)
11836                         continue;
11837
11838                 if ((difosec = dtrace_dof_sect(dof,
11839                     DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
11840                         goto err;
11841
11842                 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
11843
11844                 if (act->dtad_difo == NULL)
11845                         goto err;
11846         }
11847
11848         ASSERT(first != NULL);
11849         return (first);
11850
11851 err:
11852         for (act = first; act != NULL; act = next) {
11853                 next = act->dtad_next;
11854                 dtrace_actdesc_release(act, vstate);
11855         }
11856
11857         return (NULL);
11858 }
11859
11860 static dtrace_ecbdesc_t *
11861 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11862     cred_t *cr)
11863 {
11864         dtrace_ecbdesc_t *ep;
11865         dof_ecbdesc_t *ecb;
11866         dtrace_probedesc_t *desc;
11867         dtrace_predicate_t *pred = NULL;
11868
11869         if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
11870                 dtrace_dof_error(dof, "truncated ECB description");
11871                 return (NULL);
11872         }
11873
11874         if (sec->dofs_align != sizeof (uint64_t)) {
11875                 dtrace_dof_error(dof, "bad alignment in ECB description");
11876                 return (NULL);
11877         }
11878
11879         ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
11880         sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
11881
11882         if (sec == NULL)
11883                 return (NULL);
11884
11885         ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11886         ep->dted_uarg = ecb->dofe_uarg;
11887         desc = &ep->dted_probe;
11888
11889         if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
11890                 goto err;
11891
11892         if (ecb->dofe_pred != DOF_SECIDX_NONE) {
11893                 if ((sec = dtrace_dof_sect(dof,
11894                     DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
11895                         goto err;
11896
11897                 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
11898                         goto err;
11899
11900                 ep->dted_pred.dtpdd_predicate = pred;
11901         }
11902
11903         if (ecb->dofe_actions != DOF_SECIDX_NONE) {
11904                 if ((sec = dtrace_dof_sect(dof,
11905                     DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
11906                         goto err;
11907
11908                 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
11909
11910                 if (ep->dted_action == NULL)
11911                         goto err;
11912         }
11913
11914         return (ep);
11915
11916 err:
11917         if (pred != NULL)
11918                 dtrace_predicate_release(pred, vstate);
11919         kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11920         return (NULL);
11921 }
11922
11923 /*
11924  * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
11925  * specified DOF.  At present, this amounts to simply adding 'ubase' to the
11926  * site of any user SETX relocations to account for load object base address.
11927  * In the future, if we need other relocations, this function can be extended.
11928  */
11929 static int
11930 dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
11931 {
11932         uintptr_t daddr = (uintptr_t)dof;
11933         dof_relohdr_t *dofr =
11934             (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11935         dof_sec_t *ss, *rs, *ts;
11936         dof_relodesc_t *r;
11937         uint_t i, n;
11938
11939         if (sec->dofs_size < sizeof (dof_relohdr_t) ||
11940             sec->dofs_align != sizeof (dof_secidx_t)) {
11941                 dtrace_dof_error(dof, "invalid relocation header");
11942                 return (-1);
11943         }
11944
11945         ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
11946         rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
11947         ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
11948
11949         if (ss == NULL || rs == NULL || ts == NULL)
11950                 return (-1); /* dtrace_dof_error() has been called already */
11951
11952         if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
11953             rs->dofs_align != sizeof (uint64_t)) {
11954                 dtrace_dof_error(dof, "invalid relocation section");
11955                 return (-1);
11956         }
11957
11958         r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
11959         n = rs->dofs_size / rs->dofs_entsize;
11960
11961         for (i = 0; i < n; i++) {
11962                 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
11963
11964                 switch (r->dofr_type) {
11965                 case DOF_RELO_NONE:
11966                         break;
11967                 case DOF_RELO_SETX:
11968                         if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
11969                             sizeof (uint64_t) > ts->dofs_size) {
11970                                 dtrace_dof_error(dof, "bad relocation offset");
11971                                 return (-1);
11972                         }
11973
11974                         if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
11975                                 dtrace_dof_error(dof, "misaligned setx relo");
11976                                 return (-1);
11977                         }
11978
11979                         *(uint64_t *)taddr += ubase;
11980                         break;
11981                 default:
11982                         dtrace_dof_error(dof, "invalid relocation type");
11983                         return (-1);
11984                 }
11985
11986                 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
11987         }
11988
11989         return (0);
11990 }
11991
11992 /*
11993  * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
11994  * header:  it should be at the front of a memory region that is at least
11995  * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
11996  * size.  It need not be validated in any other way.
11997  */
11998 static int
11999 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12000     dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12001 {
12002         uint64_t len = dof->dofh_loadsz, seclen;
12003         uintptr_t daddr = (uintptr_t)dof;
12004         dtrace_ecbdesc_t *ep;
12005         dtrace_enabling_t *enab;
12006         uint_t i;
12007
12008         ASSERT(MUTEX_HELD(&dtrace_lock));
12009         ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12010
12011         /*
12012          * Check the DOF header identification bytes.  In addition to checking
12013          * valid settings, we also verify that unused bits/bytes are zeroed so
12014          * we can use them later without fear of regressing existing binaries.
12015          */
12016         if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12017             DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12018                 dtrace_dof_error(dof, "DOF magic string mismatch");
12019                 return (-1);
12020         }
12021
12022         if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12023             dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12024                 dtrace_dof_error(dof, "DOF has invalid data model");
12025                 return (-1);
12026         }
12027
12028         if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12029                 dtrace_dof_error(dof, "DOF encoding mismatch");
12030                 return (-1);
12031         }
12032
12033         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
12034             dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
12035                 dtrace_dof_error(dof, "DOF version mismatch");
12036                 return (-1);
12037         }
12038
12039         if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12040                 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12041                 return (-1);
12042         }
12043
12044         if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12045                 dtrace_dof_error(dof, "DOF uses too many integer registers");
12046                 return (-1);
12047         }
12048
12049         if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12050                 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12051                 return (-1);
12052         }
12053
12054         for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12055                 if (dof->dofh_ident[i] != 0) {
12056                         dtrace_dof_error(dof, "DOF has invalid ident byte set");
12057                         return (-1);
12058                 }
12059         }
12060
12061         if (dof->dofh_flags & ~DOF_FL_VALID) {
12062                 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12063                 return (-1);
12064         }
12065
12066         if (dof->dofh_secsize == 0) {
12067                 dtrace_dof_error(dof, "zero section header size");
12068                 return (-1);
12069         }
12070
12071         /*
12072          * Check that the section headers don't exceed the amount of DOF
12073          * data.  Note that we cast the section size and number of sections
12074          * to uint64_t's to prevent possible overflow in the multiplication.
12075          */
12076         seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12077
12078         if (dof->dofh_secoff > len || seclen > len ||
12079             dof->dofh_secoff + seclen > len) {
12080                 dtrace_dof_error(dof, "truncated section headers");
12081                 return (-1);
12082         }
12083
12084         if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12085                 dtrace_dof_error(dof, "misaligned section headers");
12086                 return (-1);
12087         }
12088
12089         if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12090                 dtrace_dof_error(dof, "misaligned section size");
12091                 return (-1);
12092         }
12093
12094         /*
12095          * Take an initial pass through the section headers to be sure that
12096          * the headers don't have stray offsets.  If the 'noprobes' flag is
12097          * set, do not permit sections relating to providers, probes, or args.
12098          */
12099         for (i = 0; i < dof->dofh_secnum; i++) {
12100                 dof_sec_t *sec = (dof_sec_t *)(daddr +
12101                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12102
12103                 if (noprobes) {
12104                         switch (sec->dofs_type) {
12105                         case DOF_SECT_PROVIDER:
12106                         case DOF_SECT_PROBES:
12107                         case DOF_SECT_PRARGS:
12108                         case DOF_SECT_PROFFS:
12109                                 dtrace_dof_error(dof, "illegal sections "
12110                                     "for enabling");
12111                                 return (-1);
12112                         }
12113                 }
12114
12115                 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
12116                     !(sec->dofs_flags & DOF_SECF_LOAD)) {
12117                         dtrace_dof_error(dof, "loadable section with load "
12118                             "flag unset");
12119                         return (-1);
12120                 }
12121
12122                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12123                         continue; /* just ignore non-loadable sections */
12124
12125                 if (sec->dofs_align & (sec->dofs_align - 1)) {
12126                         dtrace_dof_error(dof, "bad section alignment");
12127                         return (-1);
12128                 }
12129
12130                 if (sec->dofs_offset & (sec->dofs_align - 1)) {
12131                         dtrace_dof_error(dof, "misaligned section");
12132                         return (-1);
12133                 }
12134
12135                 if (sec->dofs_offset > len || sec->dofs_size > len ||
12136                     sec->dofs_offset + sec->dofs_size > len) {
12137                         dtrace_dof_error(dof, "corrupt section header");
12138                         return (-1);
12139                 }
12140
12141                 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12142                     sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12143                         dtrace_dof_error(dof, "non-terminating string table");
12144                         return (-1);
12145                 }
12146         }
12147
12148         /*
12149          * Take a second pass through the sections and locate and perform any
12150          * relocations that are present.  We do this after the first pass to
12151          * be sure that all sections have had their headers validated.
12152          */
12153         for (i = 0; i < dof->dofh_secnum; i++) {
12154                 dof_sec_t *sec = (dof_sec_t *)(daddr +
12155                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12156
12157                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12158                         continue; /* skip sections that are not loadable */
12159
12160                 switch (sec->dofs_type) {
12161                 case DOF_SECT_URELHDR:
12162                         if (dtrace_dof_relocate(dof, sec, ubase) != 0)
12163                                 return (-1);
12164                         break;
12165                 }
12166         }
12167
12168         if ((enab = *enabp) == NULL)
12169                 enab = *enabp = dtrace_enabling_create(vstate);
12170
12171         for (i = 0; i < dof->dofh_secnum; i++) {
12172                 dof_sec_t *sec = (dof_sec_t *)(daddr +
12173                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12174
12175                 if (sec->dofs_type != DOF_SECT_ECBDESC)
12176                         continue;
12177
12178                 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
12179                         dtrace_enabling_destroy(enab);
12180                         *enabp = NULL;
12181                         return (-1);
12182                 }
12183
12184                 dtrace_enabling_add(enab, ep);
12185         }
12186
12187         return (0);
12188 }
12189
12190 /*
12191  * Process DOF for any options.  This routine assumes that the DOF has been
12192  * at least processed by dtrace_dof_slurp().
12193  */
12194 static int
12195 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12196 {
12197         int i, rval;
12198         uint32_t entsize;
12199         size_t offs;
12200         dof_optdesc_t *desc;
12201
12202         for (i = 0; i < dof->dofh_secnum; i++) {
12203                 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12204                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12205
12206                 if (sec->dofs_type != DOF_SECT_OPTDESC)
12207                         continue;
12208
12209                 if (sec->dofs_align != sizeof (uint64_t)) {
12210                         dtrace_dof_error(dof, "bad alignment in "
12211                             "option description");
12212                         return (EINVAL);
12213                 }
12214
12215                 if ((entsize = sec->dofs_entsize) == 0) {
12216                         dtrace_dof_error(dof, "zeroed option entry size");
12217                         return (EINVAL);
12218                 }
12219
12220                 if (entsize < sizeof (dof_optdesc_t)) {
12221                         dtrace_dof_error(dof, "bad option entry size");
12222                         return (EINVAL);
12223                 }
12224
12225                 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12226                         desc = (dof_optdesc_t *)((uintptr_t)dof +
12227                             (uintptr_t)sec->dofs_offset + offs);
12228
12229                         if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12230                                 dtrace_dof_error(dof, "non-zero option string");
12231                                 return (EINVAL);
12232                         }
12233
12234                         if (desc->dofo_value == DTRACEOPT_UNSET) {
12235                                 dtrace_dof_error(dof, "unset option");
12236                                 return (EINVAL);
12237                         }
12238
12239                         if ((rval = dtrace_state_option(state,
12240                             desc->dofo_option, desc->dofo_value)) != 0) {
12241                                 dtrace_dof_error(dof, "rejected option");
12242                                 return (rval);
12243                         }
12244                 }
12245         }
12246
12247         return (0);
12248 }
12249
12250 /*
12251  * DTrace Consumer State Functions
12252  */
12253 int
12254 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12255 {
12256         size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
12257         void *base;
12258         uintptr_t limit;
12259         dtrace_dynvar_t *dvar, *next, *start;
12260         int i;
12261
12262         ASSERT(MUTEX_HELD(&dtrace_lock));
12263         ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12264
12265         bzero(dstate, sizeof (dtrace_dstate_t));
12266
12267         if ((dstate->dtds_chunksize = chunksize) == 0)
12268                 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12269
12270         if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12271                 size = min;
12272
12273         if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12274                 return (ENOMEM);
12275
12276         dstate->dtds_size = size;
12277         dstate->dtds_base = base;
12278         dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12279         bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
12280
12281         hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12282
12283         if (hashsize != 1 && (hashsize & 1))
12284                 hashsize--;
12285
12286         dstate->dtds_hashsize = hashsize;
12287         dstate->dtds_hash = dstate->dtds_base;
12288
12289         /*
12290          * Set all of our hash buckets to point to the single sink, and (if
12291          * it hasn't already been set), set the sink's hash value to be the
12292          * sink sentinel value.  The sink is needed for dynamic variable
12293          * lookups to know that they have iterated over an entire, valid hash
12294          * chain.
12295          */
12296         for (i = 0; i < hashsize; i++)
12297                 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12298
12299         if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12300                 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
12301
12302         /*
12303          * Determine number of active CPUs.  Divide free list evenly among
12304          * active CPUs.
12305          */
12306         start = (dtrace_dynvar_t *)
12307             ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12308         limit = (uintptr_t)base + size;
12309
12310         maxper = (limit - (uintptr_t)start) / NCPU;
12311         maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
12312
12313         for (i = 0; i < NCPU; i++) {
12314                 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
12315
12316                 /*
12317                  * If we don't even have enough chunks to make it once through
12318                  * NCPUs, we're just going to allocate everything to the first
12319                  * CPU.  And if we're on the last CPU, we're going to allocate
12320                  * whatever is left over.  In either case, we set the limit to
12321                  * be the limit of the dynamic variable space.
12322                  */
12323                 if (maxper == 0 || i == NCPU - 1) {
12324                         limit = (uintptr_t)base + size;
12325                         start = NULL;
12326                 } else {
12327                         limit = (uintptr_t)start + maxper;
12328                         start = (dtrace_dynvar_t *)limit;
12329                 }
12330
12331                 ASSERT(limit <= (uintptr_t)base + size);
12332
12333                 for (;;) {
12334                         next = (dtrace_dynvar_t *)((uintptr_t)dvar +
12335                             dstate->dtds_chunksize);
12336
12337                         if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
12338                                 break;
12339
12340                         dvar->dtdv_next = next;
12341                         dvar = next;
12342                 }
12343
12344                 if (maxper == 0)
12345                         break;
12346         }
12347
12348         return (0);
12349 }
12350
12351 void
12352 dtrace_dstate_fini(dtrace_dstate_t *dstate)
12353 {
12354         ASSERT(MUTEX_HELD(&cpu_lock));
12355
12356         if (dstate->dtds_base == NULL)
12357                 return;
12358
12359         kmem_free(dstate->dtds_base, dstate->dtds_size);
12360         kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
12361 }
12362
12363 static void
12364 dtrace_vstate_fini(dtrace_vstate_t *vstate)
12365 {
12366         /*
12367          * Logical XOR, where are you?
12368          */
12369         ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
12370
12371         if (vstate->dtvs_nglobals > 0) {
12372                 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
12373                     sizeof (dtrace_statvar_t *));
12374         }
12375
12376         if (vstate->dtvs_ntlocals > 0) {
12377                 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
12378                     sizeof (dtrace_difv_t));
12379         }
12380
12381         ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
12382
12383         if (vstate->dtvs_nlocals > 0) {
12384                 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
12385                     sizeof (dtrace_statvar_t *));
12386         }
12387 }
12388
12389 static void
12390 dtrace_state_clean(dtrace_state_t *state)
12391 {
12392         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
12393                 return;
12394
12395         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
12396         dtrace_speculation_clean(state);
12397 }
12398
12399 static void
12400 dtrace_state_deadman(dtrace_state_t *state)
12401 {
12402         hrtime_t now;
12403
12404         dtrace_sync();
12405
12406         now = dtrace_gethrtime();
12407
12408         if (state != dtrace_anon.dta_state &&
12409             now - state->dts_laststatus >= dtrace_deadman_user)
12410                 return;
12411
12412         /*
12413          * We must be sure that dts_alive never appears to be less than the
12414          * value upon entry to dtrace_state_deadman(), and because we lack a
12415          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
12416          * store INT64_MAX to it, followed by a memory barrier, followed by
12417          * the new value.  This assures that dts_alive never appears to be
12418          * less than its true value, regardless of the order in which the
12419          * stores to the underlying storage are issued.
12420          */
12421         state->dts_alive = INT64_MAX;
12422         dtrace_membar_producer();
12423         state->dts_alive = now;
12424 }
12425
12426 dtrace_state_t *
12427 dtrace_state_create(dev_t *devp, cred_t *cr)
12428 {
12429         minor_t minor;
12430         major_t major;
12431         char c[30];
12432         dtrace_state_t *state;
12433         dtrace_optval_t *opt;
12434         int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
12435
12436         ASSERT(MUTEX_HELD(&dtrace_lock));
12437         ASSERT(MUTEX_HELD(&cpu_lock));
12438
12439         minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12440             VM_BESTFIT | VM_SLEEP);
12441
12442         if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12443                 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12444                 return (NULL);
12445         }
12446
12447         state = ddi_get_soft_state(dtrace_softstate, minor);
12448         state->dts_epid = DTRACE_EPIDNONE + 1;
12449
12450         (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12451         state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12452             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12453
12454         if (devp != NULL) {
12455                 major = getemajor(*devp);
12456         } else {
12457                 major = ddi_driver_major(dtrace_devi);
12458         }
12459
12460         state->dts_dev = makedevice(major, minor);
12461
12462         if (devp != NULL)
12463                 *devp = state->dts_dev;
12464
12465         /*
12466          * We allocate NCPU buffers.  On the one hand, this can be quite
12467          * a bit of memory per instance (nearly 36K on a Starcat).  On the
12468          * other hand, it saves an additional memory reference in the probe
12469          * path.
12470          */
12471         state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12472         state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12473         state->dts_cleaner = CYCLIC_NONE;
12474         state->dts_deadman = CYCLIC_NONE;
12475         state->dts_vstate.dtvs_state = state;
12476
12477         for (i = 0; i < DTRACEOPT_MAX; i++)
12478                 state->dts_options[i] = DTRACEOPT_UNSET;
12479
12480         /*
12481          * Set the default options.
12482          */
12483         opt = state->dts_options;
12484         opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12485         opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12486         opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12487         opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12488         opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12489         opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12490         opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12491         opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12492         opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12493         opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12494         opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12495         opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12496         opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12497         opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12498
12499         state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12500
12501         /*
12502          * Depending on the user credentials, we set flag bits which alter probe
12503          * visibility or the amount of destructiveness allowed.  In the case of
12504          * actual anonymous tracing, or the possession of all privileges, all of
12505          * the normal checks are bypassed.
12506          */
12507         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12508                 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12509                 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12510         } else {
12511                 /*
12512                  * Set up the credentials for this instantiation.  We take a
12513                  * hold on the credential to prevent it from disappearing on
12514                  * us; this in turn prevents the zone_t referenced by this
12515                  * credential from disappearing.  This means that we can
12516                  * examine the credential and the zone from probe context.
12517                  */
12518                 crhold(cr);
12519                 state->dts_cred.dcr_cred = cr;
12520
12521                 /*
12522                  * CRA_PROC means "we have *some* privilege for dtrace" and
12523                  * unlocks the use of variables like pid, zonename, etc.
12524                  */
12525                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12526                     PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12527                         state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12528                 }
12529
12530                 /*
12531                  * dtrace_user allows use of syscall and profile providers.
12532                  * If the user also has proc_owner and/or proc_zone, we
12533                  * extend the scope to include additional visibility and
12534                  * destructive power.
12535                  */
12536                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12537                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12538                                 state->dts_cred.dcr_visible |=
12539                                     DTRACE_CRV_ALLPROC;
12540
12541                                 state->dts_cred.dcr_action |=
12542                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12543                         }
12544
12545                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12546                                 state->dts_cred.dcr_visible |=
12547                                     DTRACE_CRV_ALLZONE;
12548
12549                                 state->dts_cred.dcr_action |=
12550                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12551                         }
12552
12553                         /*
12554                          * If we have all privs in whatever zone this is,
12555                          * we can do destructive things to processes which
12556                          * have altered credentials.
12557                          */
12558                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12559                             cr->cr_zone->zone_privset)) {
12560                                 state->dts_cred.dcr_action |=
12561                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12562                         }
12563                 }
12564
12565                 /*
12566                  * Holding the dtrace_kernel privilege also implies that
12567                  * the user has the dtrace_user privilege from a visibility
12568                  * perspective.  But without further privileges, some
12569                  * destructive actions are not available.
12570                  */
12571                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12572                         /*
12573                          * Make all probes in all zones visible.  However,
12574                          * this doesn't mean that all actions become available
12575                          * to all zones.
12576                          */
12577                         state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12578                             DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12579
12580                         state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12581                             DTRACE_CRA_PROC;
12582                         /*
12583                          * Holding proc_owner means that destructive actions
12584                          * for *this* zone are allowed.
12585                          */
12586                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12587                                 state->dts_cred.dcr_action |=
12588                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12589
12590                         /*
12591                          * Holding proc_zone means that destructive actions
12592                          * for this user/group ID in all zones is allowed.
12593                          */
12594                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12595                                 state->dts_cred.dcr_action |=
12596                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12597
12598                         /*
12599                          * If we have all privs in whatever zone this is,
12600                          * we can do destructive things to processes which
12601                          * have altered credentials.
12602                          */
12603                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
12604                             cr->cr_zone->zone_privset)) {
12605                                 state->dts_cred.dcr_action |=
12606                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12607                         }
12608                 }
12609
12610                 /*
12611                  * Holding the dtrace_proc privilege gives control over fasttrap
12612                  * and pid providers.  We need to grant wider destructive
12613                  * privileges in the event that the user has proc_owner and/or
12614                  * proc_zone.
12615                  */
12616                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12617                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12618                                 state->dts_cred.dcr_action |=
12619                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12620
12621                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12622                                 state->dts_cred.dcr_action |=
12623                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12624                 }
12625         }
12626
12627         return (state);
12628 }
12629
12630 static int
12631 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12632 {
12633         dtrace_optval_t *opt = state->dts_options, size;
12634         processorid_t cpu;
12635         int flags = 0, rval, factor, divisor = 1;
12636
12637         ASSERT(MUTEX_HELD(&dtrace_lock));
12638         ASSERT(MUTEX_HELD(&cpu_lock));
12639         ASSERT(which < DTRACEOPT_MAX);
12640         ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12641             (state == dtrace_anon.dta_state &&
12642             state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12643
12644         if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12645                 return (0);
12646
12647         if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12648                 cpu = opt[DTRACEOPT_CPU];
12649
12650         if (which == DTRACEOPT_SPECSIZE)
12651                 flags |= DTRACEBUF_NOSWITCH;
12652
12653         if (which == DTRACEOPT_BUFSIZE) {
12654                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12655                         flags |= DTRACEBUF_RING;
12656
12657                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12658                         flags |= DTRACEBUF_FILL;
12659
12660                 if (state != dtrace_anon.dta_state ||
12661                     state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12662                         flags |= DTRACEBUF_INACTIVE;
12663         }
12664
12665         for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
12666                 /*
12667                  * The size must be 8-byte aligned.  If the size is not 8-byte
12668                  * aligned, drop it down by the difference.
12669                  */
12670                 if (size & (sizeof (uint64_t) - 1))
12671                         size -= size & (sizeof (uint64_t) - 1);
12672
12673                 if (size < state->dts_reserve) {
12674                         /*
12675                          * Buffers always must be large enough to accommodate
12676                          * their prereserved space.  We return E2BIG instead
12677                          * of ENOMEM in this case to allow for user-level
12678                          * software to differentiate the cases.
12679                          */
12680                         return (E2BIG);
12681                 }
12682
12683                 rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
12684
12685                 if (rval != ENOMEM) {
12686                         opt[which] = size;
12687                         return (rval);
12688                 }
12689
12690                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12691                         return (rval);
12692
12693                 for (divisor = 2; divisor < factor; divisor <<= 1)
12694                         continue;
12695         }
12696
12697         return (ENOMEM);
12698 }
12699
12700 static int
12701 dtrace_state_buffers(dtrace_state_t *state)
12702 {
12703         dtrace_speculation_t *spec = state->dts_speculations;
12704         int rval, i;
12705
12706         if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12707             DTRACEOPT_BUFSIZE)) != 0)
12708                 return (rval);
12709
12710         if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12711             DTRACEOPT_AGGSIZE)) != 0)
12712                 return (rval);
12713
12714         for (i = 0; i < state->dts_nspeculations; i++) {
12715                 if ((rval = dtrace_state_buffer(state,
12716                     spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12717                         return (rval);
12718         }
12719
12720         return (0);
12721 }
12722
12723 static void
12724 dtrace_state_prereserve(dtrace_state_t *state)
12725 {
12726         dtrace_ecb_t *ecb;
12727         dtrace_probe_t *probe;
12728
12729         state->dts_reserve = 0;
12730
12731         if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12732                 return;
12733
12734         /*
12735          * If our buffer policy is a "fill" buffer policy, we need to set the
12736          * prereserved space to be the space required by the END probes.
12737          */
12738         probe = dtrace_probes[dtrace_probeid_end - 1];
12739         ASSERT(probe != NULL);
12740
12741         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12742                 if (ecb->dte_state != state)
12743                         continue;
12744
12745                 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
12746         }
12747 }
12748
12749 static int
12750 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12751 {
12752         dtrace_optval_t *opt = state->dts_options, sz, nspec;
12753         dtrace_speculation_t *spec;
12754         dtrace_buffer_t *buf;
12755         cyc_handler_t hdlr;
12756         cyc_time_t when;
12757         int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
12758         dtrace_icookie_t cookie;
12759
12760         mutex_enter(&cpu_lock);
12761         mutex_enter(&dtrace_lock);
12762
12763         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12764                 rval = EBUSY;
12765                 goto out;
12766         }
12767
12768         /*
12769          * Before we can perform any checks, we must prime all of the
12770          * retained enablings that correspond to this state.
12771          */
12772         dtrace_enabling_prime(state);
12773
12774         if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
12775                 rval = EACCES;
12776                 goto out;
12777         }
12778
12779         dtrace_state_prereserve(state);
12780
12781         /*
12782          * What we now want to do is try to allocate our speculations.
12783          * We do not automatically resize the number of speculations; if
12784          * this fails, we will fail the operation.
12785          */
12786         nspec = opt[DTRACEOPT_NSPEC];
12787         ASSERT(nspec != DTRACEOPT_UNSET);
12788
12789         if (nspec > INT_MAX) {
12790                 rval = ENOMEM;
12791                 goto out;
12792         }
12793
12794         spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
12795             KM_NOSLEEP | KM_NORMALPRI);
12796
12797         if (spec == NULL) {
12798                 rval = ENOMEM;
12799                 goto out;
12800         }
12801
12802         state->dts_speculations = spec;
12803         state->dts_nspeculations = (int)nspec;
12804
12805         for (i = 0; i < nspec; i++) {
12806                 if ((buf = kmem_zalloc(bufsize,
12807                     KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
12808                         rval = ENOMEM;
12809                         goto err;
12810                 }
12811
12812                 spec[i].dtsp_buffer = buf;
12813         }
12814
12815         if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
12816                 if (dtrace_anon.dta_state == NULL) {
12817                         rval = ENOENT;
12818                         goto out;
12819                 }
12820
12821                 if (state->dts_necbs != 0) {
12822                         rval = EALREADY;
12823                         goto out;
12824                 }
12825
12826                 state->dts_anon = dtrace_anon_grab();
12827                 ASSERT(state->dts_anon != NULL);
12828                 state = state->dts_anon;
12829
12830                 /*
12831                  * We want "grabanon" to be set in the grabbed state, so we'll
12832                  * copy that option value from the grabbing state into the
12833                  * grabbed state.
12834                  */
12835                 state->dts_options[DTRACEOPT_GRABANON] =
12836                     opt[DTRACEOPT_GRABANON];
12837
12838                 *cpu = dtrace_anon.dta_beganon;
12839
12840                 /*
12841                  * If the anonymous state is active (as it almost certainly
12842                  * is if the anonymous enabling ultimately matched anything),
12843                  * we don't allow any further option processing -- but we
12844                  * don't return failure.
12845                  */
12846                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
12847                         goto out;
12848         }
12849
12850         if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
12851             opt[DTRACEOPT_AGGSIZE] != 0) {
12852                 if (state->dts_aggregations == NULL) {
12853                         /*
12854                          * We're not going to create an aggregation buffer
12855                          * because we don't have any ECBs that contain
12856                          * aggregations -- set this option to 0.
12857                          */
12858                         opt[DTRACEOPT_AGGSIZE] = 0;
12859                 } else {
12860                         /*
12861                          * If we have an aggregation buffer, we must also have
12862                          * a buffer to use as scratch.
12863                          */
12864                         if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
12865                             opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
12866                                 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
12867                         }
12868                 }
12869         }
12870
12871         if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
12872             opt[DTRACEOPT_SPECSIZE] != 0) {
12873                 if (!state->dts_speculates) {
12874                         /*
12875                          * We're not going to create speculation buffers
12876                          * because we don't have any ECBs that actually
12877                          * speculate -- set the speculation size to 0.
12878                          */
12879                         opt[DTRACEOPT_SPECSIZE] = 0;
12880                 }
12881         }
12882
12883         /*
12884          * The bare minimum size for any buffer that we're actually going to
12885          * do anything to is sizeof (uint64_t).
12886          */
12887         sz = sizeof (uint64_t);
12888
12889         if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
12890             (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
12891             (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
12892                 /*
12893                  * A buffer size has been explicitly set to 0 (or to a size
12894                  * that will be adjusted to 0) and we need the space -- we
12895                  * need to return failure.  We return ENOSPC to differentiate
12896                  * it from failing to allocate a buffer due to failure to meet
12897                  * the reserve (for which we return E2BIG).
12898                  */
12899                 rval = ENOSPC;
12900                 goto out;
12901         }
12902
12903         if ((rval = dtrace_state_buffers(state)) != 0)
12904                 goto err;
12905
12906         if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
12907                 sz = dtrace_dstate_defsize;
12908
12909         do {
12910                 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
12911
12912                 if (rval == 0)
12913                         break;
12914
12915                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12916                         goto err;
12917         } while (sz >>= 1);
12918
12919         opt[DTRACEOPT_DYNVARSIZE] = sz;
12920
12921         if (rval != 0)
12922                 goto err;
12923
12924         if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
12925                 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
12926
12927         if (opt[DTRACEOPT_CLEANRATE] == 0)
12928                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12929
12930         if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
12931                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
12932
12933         if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
12934                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
12935
12936         hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
12937         hdlr.cyh_arg = state;
12938         hdlr.cyh_level = CY_LOW_LEVEL;
12939
12940         when.cyt_when = 0;
12941         when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
12942
12943         state->dts_cleaner = cyclic_add(&hdlr, &when);
12944
12945         hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
12946         hdlr.cyh_arg = state;
12947         hdlr.cyh_level = CY_LOW_LEVEL;
12948
12949         when.cyt_when = 0;
12950         when.cyt_interval = dtrace_deadman_interval;
12951
12952         state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
12953         state->dts_deadman = cyclic_add(&hdlr, &when);
12954
12955         state->dts_activity = DTRACE_ACTIVITY_WARMUP;
12956
12957         /*
12958          * Now it's time to actually fire the BEGIN probe.  We need to disable
12959          * interrupts here both to record the CPU on which we fired the BEGIN
12960          * probe (the data from this CPU will be processed first at user
12961          * level) and to manually activate the buffer for this CPU.
12962          */
12963         cookie = dtrace_interrupt_disable();
12964         *cpu = CPU->cpu_id;
12965         ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
12966         state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
12967
12968         dtrace_probe(dtrace_probeid_begin,
12969             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
12970         dtrace_interrupt_enable(cookie);
12971         /*
12972          * We may have had an exit action from a BEGIN probe; only change our
12973          * state to ACTIVE if we're still in WARMUP.
12974          */
12975         ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
12976             state->dts_activity == DTRACE_ACTIVITY_DRAINING);
12977
12978         if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
12979                 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
12980
12981         /*
12982          * Regardless of whether or not now we're in ACTIVE or DRAINING, we
12983          * want each CPU to transition its principal buffer out of the
12984          * INACTIVE state.  Doing this assures that no CPU will suddenly begin
12985          * processing an ECB halfway down a probe's ECB chain; all CPUs will
12986          * atomically transition from processing none of a state's ECBs to
12987          * processing all of them.
12988          */
12989         dtrace_xcall(DTRACE_CPUALL,
12990             (dtrace_xcall_t)dtrace_buffer_activate, state);
12991         goto out;
12992
12993 err:
12994         dtrace_buffer_free(state->dts_buffer);
12995         dtrace_buffer_free(state->dts_aggbuffer);
12996
12997         if ((nspec = state->dts_nspeculations) == 0) {
12998                 ASSERT(state->dts_speculations == NULL);
12999                 goto out;
13000         }
13001
13002         spec = state->dts_speculations;
13003         ASSERT(spec != NULL);
13004
13005         for (i = 0; i < state->dts_nspeculations; i++) {
13006                 if ((buf = spec[i].dtsp_buffer) == NULL)
13007                         break;
13008
13009                 dtrace_buffer_free(buf);
13010                 kmem_free(buf, bufsize);
13011         }
13012
13013         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13014         state->dts_nspeculations = 0;
13015         state->dts_speculations = NULL;
13016
13017 out:
13018         mutex_exit(&dtrace_lock);
13019         mutex_exit(&cpu_lock);
13020
13021         return (rval);
13022 }
13023
13024 static int
13025 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13026 {
13027         dtrace_icookie_t cookie;
13028
13029         ASSERT(MUTEX_HELD(&dtrace_lock));
13030
13031         if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13032             state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13033                 return (EINVAL);
13034
13035         /*
13036          * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13037          * to be sure that every CPU has seen it.  See below for the details
13038          * on why this is done.
13039          */
13040         state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13041         dtrace_sync();
13042
13043         /*
13044          * By this point, it is impossible for any CPU to be still processing
13045          * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
13046          * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13047          * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
13048          * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13049          * iff we're in the END probe.
13050          */
13051         state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13052         dtrace_sync();
13053         ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13054
13055         /*
13056          * Finally, we can release the reserve and call the END probe.  We
13057          * disable interrupts across calling the END probe to allow us to
13058          * return the CPU on which we actually called the END probe.  This
13059          * allows user-land to be sure that this CPU's principal buffer is
13060          * processed last.
13061          */
13062         state->dts_reserve = 0;
13063
13064         cookie = dtrace_interrupt_disable();
13065         *cpu = CPU->cpu_id;
13066         dtrace_probe(dtrace_probeid_end,
13067             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13068         dtrace_interrupt_enable(cookie);
13069
13070         state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13071         dtrace_sync();
13072
13073         return (0);
13074 }
13075
13076 static int
13077 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13078     dtrace_optval_t val)
13079 {
13080         ASSERT(MUTEX_HELD(&dtrace_lock));
13081
13082         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13083                 return (EBUSY);
13084
13085         if (option >= DTRACEOPT_MAX)
13086                 return (EINVAL);
13087
13088         if (option != DTRACEOPT_CPU && val < 0)
13089                 return (EINVAL);
13090
13091         switch (option) {
13092         case DTRACEOPT_DESTRUCTIVE:
13093                 if (dtrace_destructive_disallow)
13094                         return (EACCES);
13095
13096                 state->dts_cred.dcr_destructive = 1;
13097                 break;
13098
13099         case DTRACEOPT_BUFSIZE:
13100         case DTRACEOPT_DYNVARSIZE:
13101         case DTRACEOPT_AGGSIZE:
13102         case DTRACEOPT_SPECSIZE:
13103         case DTRACEOPT_STRSIZE:
13104                 if (val < 0)
13105                         return (EINVAL);
13106
13107                 if (val >= LONG_MAX) {
13108                         /*
13109                          * If this is an otherwise negative value, set it to
13110                          * the highest multiple of 128m less than LONG_MAX.
13111                          * Technically, we're adjusting the size without
13112                          * regard to the buffer resizing policy, but in fact,
13113                          * this has no effect -- if we set the buffer size to
13114                          * ~LONG_MAX and the buffer policy is ultimately set to
13115                          * be "manual", the buffer allocation is guaranteed to
13116                          * fail, if only because the allocation requires two
13117                          * buffers.  (We set the the size to the highest
13118                          * multiple of 128m because it ensures that the size
13119                          * will remain a multiple of a megabyte when
13120                          * repeatedly halved -- all the way down to 15m.)
13121                          */
13122                         val = LONG_MAX - (1 << 27) + 1;
13123                 }
13124         }
13125
13126         state->dts_options[option] = val;
13127
13128         return (0);
13129 }
13130
13131 static void
13132 dtrace_state_destroy(dtrace_state_t *state)
13133 {
13134         dtrace_ecb_t *ecb;
13135         dtrace_vstate_t *vstate = &state->dts_vstate;
13136         minor_t minor = getminor(state->dts_dev);
13137         int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
13138         dtrace_speculation_t *spec = state->dts_speculations;
13139         int nspec = state->dts_nspeculations;
13140         uint32_t match;
13141
13142         ASSERT(MUTEX_HELD(&dtrace_lock));
13143         ASSERT(MUTEX_HELD(&cpu_lock));
13144
13145         /*
13146          * First, retract any retained enablings for this state.
13147          */
13148         dtrace_enabling_retract(state);
13149         ASSERT(state->dts_nretained == 0);
13150
13151         if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13152             state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13153                 /*
13154                  * We have managed to come into dtrace_state_destroy() on a
13155                  * hot enabling -- almost certainly because of a disorderly
13156                  * shutdown of a consumer.  (That is, a consumer that is
13157                  * exiting without having called dtrace_stop().) In this case,
13158                  * we're going to set our activity to be KILLED, and then
13159                  * issue a sync to be sure that everyone is out of probe
13160                  * context before we start blowing away ECBs.
13161                  */
13162                 state->dts_activity = DTRACE_ACTIVITY_KILLED;
13163                 dtrace_sync();
13164         }
13165
13166         /*
13167          * Release the credential hold we took in dtrace_state_create().
13168          */
13169         if (state->dts_cred.dcr_cred != NULL)
13170                 crfree(state->dts_cred.dcr_cred);
13171
13172         /*
13173          * Now we can safely disable and destroy any enabled probes.  Because
13174          * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13175          * (especially if they're all enabled), we take two passes through the
13176          * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13177          * in the second we disable whatever is left over.
13178          */
13179         for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13180                 for (i = 0; i < state->dts_necbs; i++) {
13181                         if ((ecb = state->dts_ecbs[i]) == NULL)
13182                                 continue;
13183
13184                         if (match && ecb->dte_probe != NULL) {
13185                                 dtrace_probe_t *probe = ecb->dte_probe;
13186                                 dtrace_provider_t *prov = probe->dtpr_provider;
13187
13188                                 if (!(prov->dtpv_priv.dtpp_flags & match))
13189                                         continue;
13190                         }
13191
13192                         dtrace_ecb_disable(ecb);
13193                         dtrace_ecb_destroy(ecb);
13194                 }
13195
13196                 if (!match)
13197                         break;
13198         }
13199
13200         /*
13201          * Before we free the buffers, perform one more sync to assure that
13202          * every CPU is out of probe context.
13203          */
13204         dtrace_sync();
13205
13206         dtrace_buffer_free(state->dts_buffer);
13207         dtrace_buffer_free(state->dts_aggbuffer);
13208
13209         for (i = 0; i < nspec; i++)
13210                 dtrace_buffer_free(spec[i].dtsp_buffer);
13211
13212         if (state->dts_cleaner != CYCLIC_NONE)
13213                 cyclic_remove(state->dts_cleaner);
13214
13215         if (state->dts_deadman != CYCLIC_NONE)
13216                 cyclic_remove(state->dts_deadman);
13217
13218         dtrace_dstate_fini(&vstate->dtvs_dynvars);
13219         dtrace_vstate_fini(vstate);
13220         kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
13221
13222         if (state->dts_aggregations != NULL) {
13223 #ifdef DEBUG
13224                 for (i = 0; i < state->dts_naggregations; i++)
13225                         ASSERT(state->dts_aggregations[i] == NULL);
13226 #endif
13227                 ASSERT(state->dts_naggregations > 0);
13228                 kmem_free(state->dts_aggregations,
13229                     state->dts_naggregations * sizeof (dtrace_aggregation_t *));
13230         }
13231
13232         kmem_free(state->dts_buffer, bufsize);
13233         kmem_free(state->dts_aggbuffer, bufsize);
13234
13235         for (i = 0; i < nspec; i++)
13236                 kmem_free(spec[i].dtsp_buffer, bufsize);
13237
13238         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13239
13240         dtrace_format_destroy(state);
13241
13242         vmem_destroy(state->dts_aggid_arena);
13243         ddi_soft_state_free(dtrace_softstate, minor);
13244         vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13245 }
13246
13247 /*
13248  * DTrace Anonymous Enabling Functions
13249  */
13250 static dtrace_state_t *
13251 dtrace_anon_grab(void)
13252 {
13253         dtrace_state_t *state;
13254
13255         ASSERT(MUTEX_HELD(&dtrace_lock));
13256
13257         if ((state = dtrace_anon.dta_state) == NULL) {
13258                 ASSERT(dtrace_anon.dta_enabling == NULL);
13259                 return (NULL);
13260         }
13261
13262         ASSERT(dtrace_anon.dta_enabling != NULL);
13263         ASSERT(dtrace_retained != NULL);
13264
13265         dtrace_enabling_destroy(dtrace_anon.dta_enabling);
13266         dtrace_anon.dta_enabling = NULL;
13267         dtrace_anon.dta_state = NULL;
13268
13269         return (state);
13270 }
13271
13272 static void
13273 dtrace_anon_property(void)
13274 {
13275         int i, rv;
13276         dtrace_state_t *state;
13277         dof_hdr_t *dof;
13278         char c[32];             /* enough for "dof-data-" + digits */
13279
13280         ASSERT(MUTEX_HELD(&dtrace_lock));
13281         ASSERT(MUTEX_HELD(&cpu_lock));
13282
13283         for (i = 0; ; i++) {
13284                 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
13285
13286                 dtrace_err_verbose = 1;
13287
13288                 if ((dof = dtrace_dof_property(c)) == NULL) {
13289                         dtrace_err_verbose = 0;
13290                         break;
13291                 }
13292
13293                 /*
13294                  * We want to create anonymous state, so we need to transition
13295                  * the kernel debugger to indicate that DTrace is active.  If
13296                  * this fails (e.g. because the debugger has modified text in
13297                  * some way), we won't continue with the processing.
13298                  */
13299                 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
13300                         cmn_err(CE_NOTE, "kernel debugger active; anonymous "
13301                             "enabling ignored.");
13302                         dtrace_dof_destroy(dof);
13303                         break;
13304                 }
13305
13306                 /*
13307                  * If we haven't allocated an anonymous state, we'll do so now.
13308                  */
13309                 if ((state = dtrace_anon.dta_state) == NULL) {
13310                         state = dtrace_state_create(NULL, NULL);
13311                         dtrace_anon.dta_state = state;
13312
13313                         if (state == NULL) {
13314                                 /*
13315                                  * This basically shouldn't happen:  the only
13316                                  * failure mode from dtrace_state_create() is a
13317                                  * failure of ddi_soft_state_zalloc() that
13318                                  * itself should never happen.  Still, the
13319                                  * interface allows for a failure mode, and
13320                                  * we want to fail as gracefully as possible:
13321                                  * we'll emit an error message and cease
13322                                  * processing anonymous state in this case.
13323                                  */
13324                                 cmn_err(CE_WARN, "failed to create "
13325                                     "anonymous state");
13326                                 dtrace_dof_destroy(dof);
13327                                 break;
13328                         }
13329                 }
13330
13331                 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
13332                     &dtrace_anon.dta_enabling, 0, B_TRUE);
13333
13334                 if (rv == 0)
13335                         rv = dtrace_dof_options(dof, state);
13336
13337                 dtrace_err_verbose = 0;
13338                 dtrace_dof_destroy(dof);
13339
13340                 if (rv != 0) {
13341                         /*
13342                          * This is malformed DOF; chuck any anonymous state
13343                          * that we created.
13344                          */
13345                         ASSERT(dtrace_anon.dta_enabling == NULL);
13346                         dtrace_state_destroy(state);
13347                         dtrace_anon.dta_state = NULL;
13348                         break;
13349                 }
13350
13351                 ASSERT(dtrace_anon.dta_enabling != NULL);
13352         }
13353
13354         if (dtrace_anon.dta_enabling != NULL) {
13355                 int rval;
13356
13357                 /*
13358                  * dtrace_enabling_retain() can only fail because we are
13359                  * trying to retain more enablings than are allowed -- but
13360                  * we only have one anonymous enabling, and we are guaranteed
13361                  * to be allowed at least one retained enabling; we assert
13362                  * that dtrace_enabling_retain() returns success.
13363                  */
13364                 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
13365                 ASSERT(rval == 0);
13366
13367                 dtrace_enabling_dump(dtrace_anon.dta_enabling);
13368         }
13369 }
13370
13371 /*
13372  * DTrace Helper Functions
13373  */
13374 static void
13375 dtrace_helper_trace(dtrace_helper_action_t *helper,
13376     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
13377 {
13378         uint32_t size, next, nnext, i;
13379         dtrace_helptrace_t *ent;
13380         uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13381
13382         if (!dtrace_helptrace_enabled)
13383                 return;
13384
13385         ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
13386
13387         /*
13388          * What would a tracing framework be without its own tracing
13389          * framework?  (Well, a hell of a lot simpler, for starters...)
13390          */
13391         size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
13392             sizeof (uint64_t) - sizeof (uint64_t);
13393
13394         /*
13395          * Iterate until we can allocate a slot in the trace buffer.
13396          */
13397         do {
13398                 next = dtrace_helptrace_next;
13399
13400                 if (next + size < dtrace_helptrace_bufsize) {
13401                         nnext = next + size;
13402                 } else {
13403                         nnext = size;
13404                 }
13405         } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
13406
13407         /*
13408          * We have our slot; fill it in.
13409          */
13410         if (nnext == size)
13411                 next = 0;
13412
13413         ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13414         ent->dtht_helper = helper;
13415         ent->dtht_where = where;
13416         ent->dtht_nlocals = vstate->dtvs_nlocals;
13417
13418         ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13419             mstate->dtms_fltoffs : -1;
13420         ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13421         ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
13422
13423         for (i = 0; i < vstate->dtvs_nlocals; i++) {
13424                 dtrace_statvar_t *svar;
13425
13426                 if ((svar = vstate->dtvs_locals[i]) == NULL)
13427                         continue;
13428
13429                 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
13430                 ent->dtht_locals[i] =
13431                     ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
13432         }
13433 }
13434
13435 static uint64_t
13436 dtrace_helper(int which, dtrace_mstate_t *mstate,
13437     dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13438 {
13439         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13440         uint64_t sarg0 = mstate->dtms_arg[0];
13441         uint64_t sarg1 = mstate->dtms_arg[1];
13442         uint64_t rval;
13443         dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13444         dtrace_helper_action_t *helper;
13445         dtrace_vstate_t *vstate;
13446         dtrace_difo_t *pred;
13447         int i, trace = dtrace_helptrace_enabled;
13448
13449         ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13450
13451         if (helpers == NULL)
13452                 return (0);
13453
13454         if ((helper = helpers->dthps_actions[which]) == NULL)
13455                 return (0);
13456
13457         vstate = &helpers->dthps_vstate;
13458         mstate->dtms_arg[0] = arg0;
13459         mstate->dtms_arg[1] = arg1;
13460
13461         /*
13462          * Now iterate over each helper.  If its predicate evaluates to 'true',
13463          * we'll call the corresponding actions.  Note that the below calls
13464          * to dtrace_dif_emulate() may set faults in machine state.  This is
13465          * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
13466          * the stored DIF offset with its own (which is the desired behavior).
13467          * Also, note the calls to dtrace_dif_emulate() may allocate scratch
13468          * from machine state; this is okay, too.
13469          */
13470         for (; helper != NULL; helper = helper->dtha_next) {
13471                 if ((pred = helper->dtha_predicate) != NULL) {
13472                         if (trace)
13473                                 dtrace_helper_trace(helper, mstate, vstate, 0);
13474
13475                         if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13476                                 goto next;
13477
13478                         if (*flags & CPU_DTRACE_FAULT)
13479                                 goto err;
13480                 }
13481
13482                 for (i = 0; i < helper->dtha_nactions; i++) {
13483                         if (trace)
13484                                 dtrace_helper_trace(helper,
13485                                     mstate, vstate, i + 1);
13486
13487                         rval = dtrace_dif_emulate(helper->dtha_actions[i],
13488                             mstate, vstate, state);
13489
13490                         if (*flags & CPU_DTRACE_FAULT)
13491                                 goto err;
13492                 }
13493
13494 next:
13495                 if (trace)
13496                         dtrace_helper_trace(helper, mstate, vstate,
13497                             DTRACE_HELPTRACE_NEXT);
13498         }
13499
13500         if (trace)
13501                 dtrace_helper_trace(helper, mstate, vstate,
13502                     DTRACE_HELPTRACE_DONE);
13503
13504         /*
13505          * Restore the arg0 that we saved upon entry.
13506          */
13507         mstate->dtms_arg[0] = sarg0;
13508         mstate->dtms_arg[1] = sarg1;
13509
13510         return (rval);
13511
13512 err:
13513         if (trace)
13514                 dtrace_helper_trace(helper, mstate, vstate,
13515                     DTRACE_HELPTRACE_ERR);
13516
13517         /*
13518          * Restore the arg0 that we saved upon entry.
13519          */
13520         mstate->dtms_arg[0] = sarg0;
13521         mstate->dtms_arg[1] = sarg1;
13522
13523         return (NULL);
13524 }
13525
13526 static void
13527 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13528     dtrace_vstate_t *vstate)
13529 {
13530         int i;
13531
13532         if (helper->dtha_predicate != NULL)
13533                 dtrace_difo_release(helper->dtha_predicate, vstate);
13534
13535         for (i = 0; i < helper->dtha_nactions; i++) {
13536                 ASSERT(helper->dtha_actions[i] != NULL);
13537                 dtrace_difo_release(helper->dtha_actions[i], vstate);
13538         }
13539
13540         kmem_free(helper->dtha_actions,
13541             helper->dtha_nactions * sizeof (dtrace_difo_t *));
13542         kmem_free(helper, sizeof (dtrace_helper_action_t));
13543 }
13544
13545 static int
13546 dtrace_helper_destroygen(int gen)
13547 {
13548         proc_t *p = curproc;
13549         dtrace_helpers_t *help = p->p_dtrace_helpers;
13550         dtrace_vstate_t *vstate;
13551         int i;
13552
13553         ASSERT(MUTEX_HELD(&dtrace_lock));
13554
13555         if (help == NULL || gen > help->dthps_generation)
13556                 return (EINVAL);
13557
13558         vstate = &help->dthps_vstate;
13559
13560         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13561                 dtrace_helper_action_t *last = NULL, *h, *next;
13562
13563                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
13564                         next = h->dtha_next;
13565
13566                         if (h->dtha_generation == gen) {
13567                                 if (last != NULL) {
13568                                         last->dtha_next = next;
13569                                 } else {
13570                                         help->dthps_actions[i] = next;
13571                                 }
13572
13573                                 dtrace_helper_action_destroy(h, vstate);
13574                         } else {
13575                                 last = h;
13576                         }
13577                 }
13578         }
13579
13580         /*
13581          * Interate until we've cleared out all helper providers with the
13582          * given generation number.
13583          */
13584         for (;;) {
13585                 dtrace_helper_provider_t *prov;
13586
13587                 /*
13588                  * Look for a helper provider with the right generation. We
13589                  * have to start back at the beginning of the list each time
13590                  * because we drop dtrace_lock. It's unlikely that we'll make
13591                  * more than two passes.
13592                  */
13593                 for (i = 0; i < help->dthps_nprovs; i++) {
13594                         prov = help->dthps_provs[i];
13595
13596                         if (prov->dthp_generation == gen)
13597                                 break;
13598                 }
13599
13600                 /*
13601                  * If there were no matches, we're done.
13602                  */
13603                 if (i == help->dthps_nprovs)
13604                         break;
13605
13606                 /*
13607                  * Move the last helper provider into this slot.
13608                  */
13609                 help->dthps_nprovs--;
13610                 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13611                 help->dthps_provs[help->dthps_nprovs] = NULL;
13612
13613                 mutex_exit(&dtrace_lock);
13614
13615                 /*
13616                  * If we have a meta provider, remove this helper provider.
13617                  */
13618                 mutex_enter(&dtrace_meta_lock);
13619                 if (dtrace_meta_pid != NULL) {
13620                         ASSERT(dtrace_deferred_pid == NULL);
13621                         dtrace_helper_provider_remove(&prov->dthp_prov,
13622                             p->p_pid);
13623                 }
13624                 mutex_exit(&dtrace_meta_lock);
13625
13626                 dtrace_helper_provider_destroy(prov);
13627
13628                 mutex_enter(&dtrace_lock);
13629         }
13630
13631         return (0);
13632 }
13633
13634 static int
13635 dtrace_helper_validate(dtrace_helper_action_t *helper)
13636 {
13637         int err = 0, i;
13638         dtrace_difo_t *dp;
13639
13640         if ((dp = helper->dtha_predicate) != NULL)
13641                 err += dtrace_difo_validate_helper(dp);
13642
13643         for (i = 0; i < helper->dtha_nactions; i++)
13644                 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13645
13646         return (err == 0);
13647 }
13648
13649 static int
13650 dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
13651 {
13652         dtrace_helpers_t *help;
13653         dtrace_helper_action_t *helper, *last;
13654         dtrace_actdesc_t *act;
13655         dtrace_vstate_t *vstate;
13656         dtrace_predicate_t *pred;
13657         int count = 0, nactions = 0, i;
13658
13659         if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13660                 return (EINVAL);
13661
13662         help = curproc->p_dtrace_helpers;
13663         last = help->dthps_actions[which];
13664         vstate = &help->dthps_vstate;
13665
13666         for (count = 0; last != NULL; last = last->dtha_next) {
13667                 count++;
13668                 if (last->dtha_next == NULL)
13669                         break;
13670         }
13671
13672         /*
13673          * If we already have dtrace_helper_actions_max helper actions for this
13674          * helper action type, we'll refuse to add a new one.
13675          */
13676         if (count >= dtrace_helper_actions_max)
13677                 return (ENOSPC);
13678
13679         helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13680         helper->dtha_generation = help->dthps_generation;
13681
13682         if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13683                 ASSERT(pred->dtp_difo != NULL);
13684                 dtrace_difo_hold(pred->dtp_difo);
13685                 helper->dtha_predicate = pred->dtp_difo;
13686         }
13687
13688         for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13689                 if (act->dtad_kind != DTRACEACT_DIFEXPR)
13690                         goto err;
13691
13692                 if (act->dtad_difo == NULL)
13693                         goto err;
13694
13695                 nactions++;
13696         }
13697
13698         helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13699             (helper->dtha_nactions = nactions), KM_SLEEP);
13700
13701         for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13702                 dtrace_difo_hold(act->dtad_difo);
13703                 helper->dtha_actions[i++] = act->dtad_difo;
13704         }
13705
13706         if (!dtrace_helper_validate(helper))
13707                 goto err;
13708
13709         if (last == NULL) {
13710                 help->dthps_actions[which] = helper;
13711         } else {
13712                 last->dtha_next = helper;
13713         }
13714
13715         if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
13716                 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13717                 dtrace_helptrace_next = 0;
13718         }
13719
13720         return (0);
13721 err:
13722         dtrace_helper_action_destroy(helper, vstate);
13723         return (EINVAL);
13724 }
13725
13726 static void
13727 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13728     dof_helper_t *dofhp)
13729 {
13730         ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
13731
13732         mutex_enter(&dtrace_meta_lock);
13733         mutex_enter(&dtrace_lock);
13734
13735         if (!dtrace_attached() || dtrace_meta_pid == NULL) {
13736                 /*
13737                  * If the dtrace module is loaded but not attached, or if
13738                  * there aren't isn't a meta provider registered to deal with
13739                  * these provider descriptions, we need to postpone creating
13740                  * the actual providers until later.
13741                  */
13742
13743                 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
13744                     dtrace_deferred_pid != help) {
13745                         help->dthps_deferred = 1;
13746                         help->dthps_pid = p->p_pid;
13747                         help->dthps_next = dtrace_deferred_pid;
13748                         help->dthps_prev = NULL;
13749                         if (dtrace_deferred_pid != NULL)
13750                                 dtrace_deferred_pid->dthps_prev = help;
13751                         dtrace_deferred_pid = help;
13752                 }
13753
13754                 mutex_exit(&dtrace_lock);
13755
13756         } else if (dofhp != NULL) {
13757                 /*
13758                  * If the dtrace module is loaded and we have a particular
13759                  * helper provider description, pass that off to the
13760                  * meta provider.
13761                  */
13762
13763                 mutex_exit(&dtrace_lock);
13764
13765                 dtrace_helper_provide(dofhp, p->p_pid);
13766
13767         } else {
13768                 /*
13769                  * Otherwise, just pass all the helper provider descriptions
13770                  * off to the meta provider.
13771                  */
13772
13773                 int i;
13774                 mutex_exit(&dtrace_lock);
13775
13776                 for (i = 0; i < help->dthps_nprovs; i++) {
13777                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
13778                             p->p_pid);
13779                 }
13780         }
13781
13782         mutex_exit(&dtrace_meta_lock);
13783 }
13784
13785 static int
13786 dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
13787 {
13788         dtrace_helpers_t *help;
13789         dtrace_helper_provider_t *hprov, **tmp_provs;
13790         uint_t tmp_maxprovs, i;
13791
13792         ASSERT(MUTEX_HELD(&dtrace_lock));
13793
13794         help = curproc->p_dtrace_helpers;
13795         ASSERT(help != NULL);
13796
13797         /*
13798          * If we already have dtrace_helper_providers_max helper providers,
13799          * we're refuse to add a new one.
13800          */
13801         if (help->dthps_nprovs >= dtrace_helper_providers_max)
13802                 return (ENOSPC);
13803
13804         /*
13805          * Check to make sure this isn't a duplicate.
13806          */
13807         for (i = 0; i < help->dthps_nprovs; i++) {
13808                 if (dofhp->dofhp_addr ==
13809                     help->dthps_provs[i]->dthp_prov.dofhp_addr)
13810                         return (EALREADY);
13811         }
13812
13813         hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
13814         hprov->dthp_prov = *dofhp;
13815         hprov->dthp_ref = 1;
13816         hprov->dthp_generation = gen;
13817
13818         /*
13819          * Allocate a bigger table for helper providers if it's already full.
13820          */
13821         if (help->dthps_maxprovs == help->dthps_nprovs) {
13822                 tmp_maxprovs = help->dthps_maxprovs;
13823                 tmp_provs = help->dthps_provs;
13824
13825                 if (help->dthps_maxprovs == 0)
13826                         help->dthps_maxprovs = 2;
13827                 else
13828                         help->dthps_maxprovs *= 2;
13829                 if (help->dthps_maxprovs > dtrace_helper_providers_max)
13830                         help->dthps_maxprovs = dtrace_helper_providers_max;
13831
13832                 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
13833
13834                 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
13835                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
13836
13837                 if (tmp_provs != NULL) {
13838                         bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
13839                             sizeof (dtrace_helper_provider_t *));
13840                         kmem_free(tmp_provs, tmp_maxprovs *
13841                             sizeof (dtrace_helper_provider_t *));
13842                 }
13843         }
13844
13845         help->dthps_provs[help->dthps_nprovs] = hprov;
13846         help->dthps_nprovs++;
13847
13848         return (0);
13849 }
13850
13851 static void
13852 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
13853 {
13854         mutex_enter(&dtrace_lock);
13855
13856         if (--hprov->dthp_ref == 0) {
13857                 dof_hdr_t *dof;
13858                 mutex_exit(&dtrace_lock);
13859                 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
13860                 dtrace_dof_destroy(dof);
13861                 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
13862         } else {
13863                 mutex_exit(&dtrace_lock);
13864         }
13865 }
13866
13867 static int
13868 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
13869 {
13870         uintptr_t daddr = (uintptr_t)dof;
13871         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
13872         dof_provider_t *provider;
13873         dof_probe_t *probe;
13874         uint8_t *arg;
13875         char *strtab, *typestr;
13876         dof_stridx_t typeidx;
13877         size_t typesz;
13878         uint_t nprobes, j, k;
13879
13880         ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
13881
13882         if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
13883                 dtrace_dof_error(dof, "misaligned section offset");
13884                 return (-1);
13885         }
13886
13887         /*
13888          * The section needs to be large enough to contain the DOF provider
13889          * structure appropriate for the given version.
13890          */
13891         if (sec->dofs_size <
13892             ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
13893             offsetof(dof_provider_t, dofpv_prenoffs) :
13894             sizeof (dof_provider_t))) {
13895                 dtrace_dof_error(dof, "provider section too small");
13896                 return (-1);
13897         }
13898
13899         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
13900         str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
13901         prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
13902         arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
13903         off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
13904
13905         if (str_sec == NULL || prb_sec == NULL ||
13906             arg_sec == NULL || off_sec == NULL)
13907                 return (-1);
13908
13909         enoff_sec = NULL;
13910
13911         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13912             provider->dofpv_prenoffs != DOF_SECT_NONE &&
13913             (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
13914             provider->dofpv_prenoffs)) == NULL)
13915                 return (-1);
13916
13917         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
13918
13919         if (provider->dofpv_name >= str_sec->dofs_size ||
13920             strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
13921                 dtrace_dof_error(dof, "invalid provider name");
13922                 return (-1);
13923         }
13924
13925         if (prb_sec->dofs_entsize == 0 ||
13926             prb_sec->dofs_entsize > prb_sec->dofs_size) {
13927                 dtrace_dof_error(dof, "invalid entry size");
13928                 return (-1);
13929         }
13930
13931         if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
13932                 dtrace_dof_error(dof, "misaligned entry size");
13933                 return (-1);
13934         }
13935
13936         if (off_sec->dofs_entsize != sizeof (uint32_t)) {
13937                 dtrace_dof_error(dof, "invalid entry size");
13938                 return (-1);
13939         }
13940
13941         if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
13942                 dtrace_dof_error(dof, "misaligned section offset");
13943                 return (-1);
13944         }
13945
13946         if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
13947                 dtrace_dof_error(dof, "invalid entry size");
13948                 return (-1);
13949         }
13950
13951         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
13952
13953         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
13954
13955         /*
13956          * Take a pass through the probes to check for errors.
13957          */
13958         for (j = 0; j < nprobes; j++) {
13959                 probe = (dof_probe_t *)(uintptr_t)(daddr +
13960                     prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
13961
13962                 if (probe->dofpr_func >= str_sec->dofs_size) {
13963                         dtrace_dof_error(dof, "invalid function name");
13964                         return (-1);
13965                 }
13966
13967                 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
13968                         dtrace_dof_error(dof, "function name too long");
13969                         return (-1);
13970                 }
13971
13972                 if (probe->dofpr_name >= str_sec->dofs_size ||
13973                     strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
13974                         dtrace_dof_error(dof, "invalid probe name");
13975                         return (-1);
13976                 }
13977
13978                 /*
13979                  * The offset count must not wrap the index, and the offsets
13980                  * must also not overflow the section's data.
13981                  */
13982                 if (probe->dofpr_offidx + probe->dofpr_noffs <
13983                     probe->dofpr_offidx ||
13984                     (probe->dofpr_offidx + probe->dofpr_noffs) *
13985                     off_sec->dofs_entsize > off_sec->dofs_size) {
13986                         dtrace_dof_error(dof, "invalid probe offset");
13987                         return (-1);
13988                 }
13989
13990                 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
13991                         /*
13992                          * If there's no is-enabled offset section, make sure
13993                          * there aren't any is-enabled offsets. Otherwise
13994                          * perform the same checks as for probe offsets
13995                          * (immediately above).
13996                          */
13997                         if (enoff_sec == NULL) {
13998                                 if (probe->dofpr_enoffidx != 0 ||
13999                                     probe->dofpr_nenoffs != 0) {
14000                                         dtrace_dof_error(dof, "is-enabled "
14001                                             "offsets with null section");
14002                                         return (-1);
14003                                 }
14004                         } else if (probe->dofpr_enoffidx +
14005                             probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14006                             (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14007                             enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14008                                 dtrace_dof_error(dof, "invalid is-enabled "
14009                                     "offset");
14010                                 return (-1);
14011                         }
14012
14013                         if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14014                                 dtrace_dof_error(dof, "zero probe and "
14015                                     "is-enabled offsets");
14016                                 return (-1);
14017                         }
14018                 } else if (probe->dofpr_noffs == 0) {
14019                         dtrace_dof_error(dof, "zero probe offsets");
14020                         return (-1);
14021                 }
14022
14023                 if (probe->dofpr_argidx + probe->dofpr_xargc <
14024                     probe->dofpr_argidx ||
14025                     (probe->dofpr_argidx + probe->dofpr_xargc) *
14026                     arg_sec->dofs_entsize > arg_sec->dofs_size) {
14027                         dtrace_dof_error(dof, "invalid args");
14028                         return (-1);
14029                 }
14030
14031                 typeidx = probe->dofpr_nargv;
14032                 typestr = strtab + probe->dofpr_nargv;
14033                 for (k = 0; k < probe->dofpr_nargc; k++) {
14034                         if (typeidx >= str_sec->dofs_size) {
14035                                 dtrace_dof_error(dof, "bad "
14036                                     "native argument type");
14037                                 return (-1);
14038                         }
14039
14040                         typesz = strlen(typestr) + 1;
14041                         if (typesz > DTRACE_ARGTYPELEN) {
14042                                 dtrace_dof_error(dof, "native "
14043                                     "argument type too long");
14044                                 return (-1);
14045                         }
14046                         typeidx += typesz;
14047                         typestr += typesz;
14048                 }
14049
14050                 typeidx = probe->dofpr_xargv;
14051                 typestr = strtab + probe->dofpr_xargv;
14052                 for (k = 0; k < probe->dofpr_xargc; k++) {
14053                         if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14054                                 dtrace_dof_error(dof, "bad "
14055                                     "native argument index");
14056                                 return (-1);
14057                         }
14058
14059                         if (typeidx >= str_sec->dofs_size) {
14060                                 dtrace_dof_error(dof, "bad "
14061                                     "translated argument type");
14062                                 return (-1);
14063                         }
14064
14065                         typesz = strlen(typestr) + 1;
14066                         if (typesz > DTRACE_ARGTYPELEN) {
14067                                 dtrace_dof_error(dof, "translated argument "
14068                                     "type too long");
14069                                 return (-1);
14070                         }
14071
14072                         typeidx += typesz;
14073                         typestr += typesz;
14074                 }
14075         }
14076
14077         return (0);
14078 }
14079
14080 static int
14081 dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
14082 {
14083         dtrace_helpers_t *help;
14084         dtrace_vstate_t *vstate;
14085         dtrace_enabling_t *enab = NULL;
14086         int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14087         uintptr_t daddr = (uintptr_t)dof;
14088
14089         ASSERT(MUTEX_HELD(&dtrace_lock));
14090
14091         if ((help = curproc->p_dtrace_helpers) == NULL)
14092                 help = dtrace_helpers_create(curproc);
14093
14094         vstate = &help->dthps_vstate;
14095
14096         if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14097             dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14098                 dtrace_dof_destroy(dof);
14099                 return (rv);
14100         }
14101
14102         /*
14103          * Look for helper providers and validate their descriptions.
14104          */
14105         if (dhp != NULL) {
14106                 for (i = 0; i < dof->dofh_secnum; i++) {
14107                         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14108                             dof->dofh_secoff + i * dof->dofh_secsize);
14109
14110                         if (sec->dofs_type != DOF_SECT_PROVIDER)
14111                                 continue;
14112
14113                         if (dtrace_helper_provider_validate(dof, sec) != 0) {
14114                                 dtrace_enabling_destroy(enab);
14115                                 dtrace_dof_destroy(dof);
14116                                 return (-1);
14117                         }
14118
14119                         nprovs++;
14120                 }
14121         }
14122
14123         /*
14124          * Now we need to walk through the ECB descriptions in the enabling.
14125          */
14126         for (i = 0; i < enab->dten_ndesc; i++) {
14127                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14128                 dtrace_probedesc_t *desc = &ep->dted_probe;
14129
14130                 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
14131                         continue;
14132
14133                 if (strcmp(desc->dtpd_mod, "helper") != 0)
14134                         continue;
14135
14136                 if (strcmp(desc->dtpd_func, "ustack") != 0)
14137                         continue;
14138
14139                 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
14140                     ep)) != 0) {
14141                         /*
14142                          * Adding this helper action failed -- we are now going
14143                          * to rip out the entire generation and return failure.
14144                          */
14145                         (void) dtrace_helper_destroygen(help->dthps_generation);
14146                         dtrace_enabling_destroy(enab);
14147                         dtrace_dof_destroy(dof);
14148                         return (-1);
14149                 }
14150
14151                 nhelpers++;
14152         }
14153
14154         if (nhelpers < enab->dten_ndesc)
14155                 dtrace_dof_error(dof, "unmatched helpers");
14156
14157         gen = help->dthps_generation++;
14158         dtrace_enabling_destroy(enab);
14159
14160         if (dhp != NULL && nprovs > 0) {
14161                 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14162                 if (dtrace_helper_provider_add(dhp, gen) == 0) {
14163                         mutex_exit(&dtrace_lock);
14164                         dtrace_helper_provider_register(curproc, help, dhp);
14165                         mutex_enter(&dtrace_lock);
14166
14167                         destroy = 0;
14168                 }
14169         }
14170
14171         if (destroy)
14172                 dtrace_dof_destroy(dof);
14173
14174         return (gen);
14175 }
14176
14177 static dtrace_helpers_t *
14178 dtrace_helpers_create(proc_t *p)
14179 {
14180         dtrace_helpers_t *help;
14181
14182         ASSERT(MUTEX_HELD(&dtrace_lock));
14183         ASSERT(p->p_dtrace_helpers == NULL);
14184
14185         help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14186         help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14187             DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14188
14189         p->p_dtrace_helpers = help;
14190         dtrace_helpers++;
14191
14192         return (help);
14193 }
14194
14195 static void
14196 dtrace_helpers_destroy(void)
14197 {
14198         dtrace_helpers_t *help;
14199         dtrace_vstate_t *vstate;
14200         proc_t *p = curproc;
14201         int i;
14202
14203         mutex_enter(&dtrace_lock);
14204
14205         ASSERT(p->p_dtrace_helpers != NULL);
14206         ASSERT(dtrace_helpers > 0);
14207
14208         help = p->p_dtrace_helpers;
14209         vstate = &help->dthps_vstate;
14210
14211         /*
14212          * We're now going to lose the help from this process.
14213          */
14214         p->p_dtrace_helpers = NULL;
14215         dtrace_sync();
14216
14217         /*
14218          * Destory the helper actions.
14219          */
14220         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14221                 dtrace_helper_action_t *h, *next;
14222
14223                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14224                         next = h->dtha_next;
14225                         dtrace_helper_action_destroy(h, vstate);
14226                         h = next;
14227                 }
14228         }
14229
14230         mutex_exit(&dtrace_lock);
14231
14232         /*
14233          * Destroy the helper providers.
14234          */
14235         if (help->dthps_maxprovs > 0) {
14236                 mutex_enter(&dtrace_meta_lock);
14237                 if (dtrace_meta_pid != NULL) {
14238                         ASSERT(dtrace_deferred_pid == NULL);
14239
14240                         for (i = 0; i < help->dthps_nprovs; i++) {
14241                                 dtrace_helper_provider_remove(
14242                                     &help->dthps_provs[i]->dthp_prov, p->p_pid);
14243                         }
14244                 } else {
14245                         mutex_enter(&dtrace_lock);
14246                         ASSERT(help->dthps_deferred == 0 ||
14247                             help->dthps_next != NULL ||
14248                             help->dthps_prev != NULL ||
14249                             help == dtrace_deferred_pid);
14250
14251                         /*
14252                          * Remove the helper from the deferred list.
14253                          */
14254                         if (help->dthps_next != NULL)
14255                                 help->dthps_next->dthps_prev = help->dthps_prev;
14256                         if (help->dthps_prev != NULL)
14257                                 help->dthps_prev->dthps_next = help->dthps_next;
14258                         if (dtrace_deferred_pid == help) {
14259                                 dtrace_deferred_pid = help->dthps_next;
14260                                 ASSERT(help->dthps_prev == NULL);
14261                         }
14262
14263                         mutex_exit(&dtrace_lock);
14264                 }
14265
14266                 mutex_exit(&dtrace_meta_lock);
14267
14268                 for (i = 0; i < help->dthps_nprovs; i++) {
14269                         dtrace_helper_provider_destroy(help->dthps_provs[i]);
14270                 }
14271
14272                 kmem_free(help->dthps_provs, help->dthps_maxprovs *
14273                     sizeof (dtrace_helper_provider_t *));
14274         }
14275
14276         mutex_enter(&dtrace_lock);
14277
14278         dtrace_vstate_fini(&help->dthps_vstate);
14279         kmem_free(help->dthps_actions,
14280             sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
14281         kmem_free(help, sizeof (dtrace_helpers_t));
14282
14283         --dtrace_helpers;
14284         mutex_exit(&dtrace_lock);
14285 }
14286
14287 static void
14288 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
14289 {
14290         dtrace_helpers_t *help, *newhelp;
14291         dtrace_helper_action_t *helper, *new, *last;
14292         dtrace_difo_t *dp;
14293         dtrace_vstate_t *vstate;
14294         int i, j, sz, hasprovs = 0;
14295
14296         mutex_enter(&dtrace_lock);
14297         ASSERT(from->p_dtrace_helpers != NULL);
14298         ASSERT(dtrace_helpers > 0);
14299
14300         help = from->p_dtrace_helpers;
14301         newhelp = dtrace_helpers_create(to);
14302         ASSERT(to->p_dtrace_helpers != NULL);
14303
14304         newhelp->dthps_generation = help->dthps_generation;
14305         vstate = &newhelp->dthps_vstate;
14306
14307         /*
14308          * Duplicate the helper actions.
14309          */
14310         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14311                 if ((helper = help->dthps_actions[i]) == NULL)
14312                         continue;
14313
14314                 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
14315                         new = kmem_zalloc(sizeof (dtrace_helper_action_t),
14316                             KM_SLEEP);
14317                         new->dtha_generation = helper->dtha_generation;
14318
14319                         if ((dp = helper->dtha_predicate) != NULL) {
14320                                 dp = dtrace_difo_duplicate(dp, vstate);
14321                                 new->dtha_predicate = dp;
14322                         }
14323
14324                         new->dtha_nactions = helper->dtha_nactions;
14325                         sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
14326                         new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
14327
14328                         for (j = 0; j < new->dtha_nactions; j++) {
14329                                 dtrace_difo_t *dp = helper->dtha_actions[j];
14330
14331                                 ASSERT(dp != NULL);
14332                                 dp = dtrace_difo_duplicate(dp, vstate);
14333                                 new->dtha_actions[j] = dp;
14334                         }
14335
14336                         if (last != NULL) {
14337                                 last->dtha_next = new;
14338                         } else {
14339                                 newhelp->dthps_actions[i] = new;
14340                         }
14341
14342                         last = new;
14343                 }
14344         }
14345
14346         /*
14347          * Duplicate the helper providers and register them with the
14348          * DTrace framework.
14349          */
14350         if (help->dthps_nprovs > 0) {
14351                 newhelp->dthps_nprovs = help->dthps_nprovs;
14352                 newhelp->dthps_maxprovs = help->dthps_nprovs;
14353                 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
14354                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14355                 for (i = 0; i < newhelp->dthps_nprovs; i++) {
14356                         newhelp->dthps_provs[i] = help->dthps_provs[i];
14357                         newhelp->dthps_provs[i]->dthp_ref++;
14358                 }
14359
14360                 hasprovs = 1;
14361         }
14362
14363         mutex_exit(&dtrace_lock);
14364
14365         if (hasprovs)
14366                 dtrace_helper_provider_register(to, newhelp, NULL);
14367 }
14368
14369 /*
14370  * DTrace Hook Functions
14371  */
14372 static void
14373 dtrace_module_loaded(struct modctl *ctl)
14374 {
14375         dtrace_provider_t *prv;
14376
14377         mutex_enter(&dtrace_provider_lock);
14378         mutex_enter(&mod_lock);
14379
14380         ASSERT(ctl->mod_busy);
14381
14382         /*
14383          * We're going to call each providers per-module provide operation
14384          * specifying only this module.
14385          */
14386         for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
14387                 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
14388
14389         mutex_exit(&mod_lock);
14390         mutex_exit(&dtrace_provider_lock);
14391
14392         /*
14393          * If we have any retained enablings, we need to match against them.
14394          * Enabling probes requires that cpu_lock be held, and we cannot hold
14395          * cpu_lock here -- it is legal for cpu_lock to be held when loading a
14396          * module.  (In particular, this happens when loading scheduling
14397          * classes.)  So if we have any retained enablings, we need to dispatch
14398          * our task queue to do the match for us.
14399          */
14400         mutex_enter(&dtrace_lock);
14401
14402         if (dtrace_retained == NULL) {
14403                 mutex_exit(&dtrace_lock);
14404                 return;
14405         }
14406
14407         (void) taskq_dispatch(dtrace_taskq,
14408             (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
14409
14410         mutex_exit(&dtrace_lock);
14411
14412         /*
14413          * And now, for a little heuristic sleaze:  in general, we want to
14414          * match modules as soon as they load.  However, we cannot guarantee
14415          * this, because it would lead us to the lock ordering violation
14416          * outlined above.  The common case, of course, is that cpu_lock is
14417          * _not_ held -- so we delay here for a clock tick, hoping that that's
14418          * long enough for the task queue to do its work.  If it's not, it's
14419          * not a serious problem -- it just means that the module that we
14420          * just loaded may not be immediately instrumentable.
14421          */
14422         delay(1);
14423 }
14424
14425 static void
14426 dtrace_module_unloaded(struct modctl *ctl)
14427 {
14428         dtrace_probe_t template, *probe, *first, *next;
14429         dtrace_provider_t *prov;
14430
14431         template.dtpr_mod = ctl->mod_modname;
14432
14433         mutex_enter(&dtrace_provider_lock);
14434         mutex_enter(&mod_lock);
14435         mutex_enter(&dtrace_lock);
14436
14437         if (dtrace_bymod == NULL) {
14438                 /*
14439                  * The DTrace module is loaded (obviously) but not attached;
14440                  * we don't have any work to do.
14441                  */
14442                 mutex_exit(&dtrace_provider_lock);
14443                 mutex_exit(&mod_lock);
14444                 mutex_exit(&dtrace_lock);
14445                 return;
14446         }
14447
14448         for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
14449             probe != NULL; probe = probe->dtpr_nextmod) {
14450                 if (probe->dtpr_ecb != NULL) {
14451                         mutex_exit(&dtrace_provider_lock);
14452                         mutex_exit(&mod_lock);
14453                         mutex_exit(&dtrace_lock);
14454
14455                         /*
14456                          * This shouldn't _actually_ be possible -- we're
14457                          * unloading a module that has an enabled probe in it.
14458                          * (It's normally up to the provider to make sure that
14459                          * this can't happen.)  However, because dtps_enable()
14460                          * doesn't have a failure mode, there can be an
14461                          * enable/unload race.  Upshot:  we don't want to
14462                          * assert, but we're not going to disable the
14463                          * probe, either.
14464                          */
14465                         if (dtrace_err_verbose) {
14466                                 cmn_err(CE_WARN, "unloaded module '%s' had "
14467                                     "enabled probes", ctl->mod_modname);
14468                         }
14469
14470                         return;
14471                 }
14472         }
14473
14474         probe = first;
14475
14476         for (first = NULL; probe != NULL; probe = next) {
14477                 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
14478
14479                 dtrace_probes[probe->dtpr_id - 1] = NULL;
14480
14481                 next = probe->dtpr_nextmod;
14482                 dtrace_hash_remove(dtrace_bymod, probe);
14483                 dtrace_hash_remove(dtrace_byfunc, probe);
14484                 dtrace_hash_remove(dtrace_byname, probe);
14485
14486                 if (first == NULL) {
14487                         first = probe;
14488                         probe->dtpr_nextmod = NULL;
14489                 } else {
14490                         probe->dtpr_nextmod = first;
14491                         first = probe;
14492                 }
14493         }
14494
14495         /*
14496          * We've removed all of the module's probes from the hash chains and
14497          * from the probe array.  Now issue a dtrace_sync() to be sure that
14498          * everyone has cleared out from any probe array processing.
14499          */
14500         dtrace_sync();
14501
14502         for (probe = first; probe != NULL; probe = first) {
14503                 first = probe->dtpr_nextmod;
14504                 prov = probe->dtpr_provider;
14505                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
14506                     probe->dtpr_arg);
14507                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
14508                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
14509                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
14510                 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
14511                 kmem_free(probe, sizeof (dtrace_probe_t));
14512         }
14513
14514         mutex_exit(&dtrace_lock);
14515         mutex_exit(&mod_lock);
14516         mutex_exit(&dtrace_provider_lock);
14517 }
14518
14519 void
14520 dtrace_suspend(void)
14521 {
14522         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
14523 }
14524
14525 void
14526 dtrace_resume(void)
14527 {
14528         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
14529 }
14530
14531 static int
14532 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
14533 {
14534         ASSERT(MUTEX_HELD(&cpu_lock));
14535         mutex_enter(&dtrace_lock);
14536
14537         switch (what) {
14538         case CPU_CONFIG: {
14539                 dtrace_state_t *state;
14540                 dtrace_optval_t *opt, rs, c;
14541
14542                 /*
14543                  * For now, we only allocate a new buffer for anonymous state.
14544                  */
14545                 if ((state = dtrace_anon.dta_state) == NULL)
14546                         break;
14547
14548                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14549                         break;
14550
14551                 opt = state->dts_options;
14552                 c = opt[DTRACEOPT_CPU];
14553
14554                 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
14555                         break;
14556
14557                 /*
14558                  * Regardless of what the actual policy is, we're going to
14559                  * temporarily set our resize policy to be manual.  We're
14560                  * also going to temporarily set our CPU option to denote
14561                  * the newly configured CPU.
14562                  */
14563                 rs = opt[DTRACEOPT_BUFRESIZE];
14564                 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
14565                 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
14566
14567                 (void) dtrace_state_buffers(state);
14568
14569                 opt[DTRACEOPT_BUFRESIZE] = rs;
14570                 opt[DTRACEOPT_CPU] = c;
14571
14572                 break;
14573         }
14574
14575         case CPU_UNCONFIG:
14576                 /*
14577                  * We don't free the buffer in the CPU_UNCONFIG case.  (The
14578                  * buffer will be freed when the consumer exits.)
14579                  */
14580                 break;
14581
14582         default:
14583                 break;
14584         }
14585
14586         mutex_exit(&dtrace_lock);
14587         return (0);
14588 }
14589
14590 static void
14591 dtrace_cpu_setup_initial(processorid_t cpu)
14592 {
14593         (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
14594 }
14595
14596 static void
14597 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
14598 {
14599         if (dtrace_toxranges >= dtrace_toxranges_max) {
14600                 int osize, nsize;
14601                 dtrace_toxrange_t *range;
14602
14603                 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14604
14605                 if (osize == 0) {
14606                         ASSERT(dtrace_toxrange == NULL);
14607                         ASSERT(dtrace_toxranges_max == 0);
14608                         dtrace_toxranges_max = 1;
14609                 } else {
14610                         dtrace_toxranges_max <<= 1;
14611                 }
14612
14613                 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
14614                 range = kmem_zalloc(nsize, KM_SLEEP);
14615
14616                 if (dtrace_toxrange != NULL) {
14617                         ASSERT(osize != 0);
14618                         bcopy(dtrace_toxrange, range, osize);
14619                         kmem_free(dtrace_toxrange, osize);
14620                 }
14621
14622                 dtrace_toxrange = range;
14623         }
14624
14625         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
14626         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
14627
14628         dtrace_toxrange[dtrace_toxranges].dtt_base = base;
14629         dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
14630         dtrace_toxranges++;
14631 }
14632
14633 /*
14634  * DTrace Driver Cookbook Functions
14635  */
14636 /*ARGSUSED*/
14637 static int
14638 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
14639 {
14640         dtrace_provider_id_t id;
14641         dtrace_state_t *state = NULL;
14642         dtrace_enabling_t *enab;
14643
14644         mutex_enter(&cpu_lock);
14645         mutex_enter(&dtrace_provider_lock);
14646         mutex_enter(&dtrace_lock);
14647
14648         if (ddi_soft_state_init(&dtrace_softstate,
14649             sizeof (dtrace_state_t), 0) != 0) {
14650                 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
14651                 mutex_exit(&cpu_lock);
14652                 mutex_exit(&dtrace_provider_lock);
14653                 mutex_exit(&dtrace_lock);
14654                 return (DDI_FAILURE);
14655         }
14656
14657         if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
14658             DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
14659             ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
14660             DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
14661                 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
14662                 ddi_remove_minor_node(devi, NULL);
14663                 ddi_soft_state_fini(&dtrace_softstate);
14664                 mutex_exit(&cpu_lock);
14665                 mutex_exit(&dtrace_provider_lock);
14666                 mutex_exit(&dtrace_lock);
14667                 return (DDI_FAILURE);
14668         }
14669
14670         ddi_report_dev(devi);
14671         dtrace_devi = devi;
14672
14673         dtrace_modload = dtrace_module_loaded;
14674         dtrace_modunload = dtrace_module_unloaded;
14675         dtrace_cpu_init = dtrace_cpu_setup_initial;
14676         dtrace_helpers_cleanup = dtrace_helpers_destroy;
14677         dtrace_helpers_fork = dtrace_helpers_duplicate;
14678         dtrace_cpustart_init = dtrace_suspend;
14679         dtrace_cpustart_fini = dtrace_resume;
14680         dtrace_debugger_init = dtrace_suspend;
14681         dtrace_debugger_fini = dtrace_resume;
14682
14683         register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
14684
14685         ASSERT(MUTEX_HELD(&cpu_lock));
14686
14687         dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
14688             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14689         dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
14690             UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
14691             VM_SLEEP | VMC_IDENTIFIER);
14692         dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
14693             1, INT_MAX, 0);
14694
14695         dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
14696             sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
14697             NULL, NULL, NULL, NULL, NULL, 0);
14698
14699         ASSERT(MUTEX_HELD(&cpu_lock));
14700         dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
14701             offsetof(dtrace_probe_t, dtpr_nextmod),
14702             offsetof(dtrace_probe_t, dtpr_prevmod));
14703
14704         dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
14705             offsetof(dtrace_probe_t, dtpr_nextfunc),
14706             offsetof(dtrace_probe_t, dtpr_prevfunc));
14707
14708         dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
14709             offsetof(dtrace_probe_t, dtpr_nextname),
14710             offsetof(dtrace_probe_t, dtpr_prevname));
14711
14712         if (dtrace_retain_max < 1) {
14713                 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
14714                     "setting to 1", dtrace_retain_max);
14715                 dtrace_retain_max = 1;
14716         }
14717
14718         /*
14719          * Now discover our toxic ranges.
14720          */
14721         dtrace_toxic_ranges(dtrace_toxrange_add);
14722
14723         /*
14724          * Before we register ourselves as a provider to our own framework,
14725          * we would like to assert that dtrace_provider is NULL -- but that's
14726          * not true if we were loaded as a dependency of a DTrace provider.
14727          * Once we've registered, we can assert that dtrace_provider is our
14728          * pseudo provider.
14729          */
14730         (void) dtrace_register("dtrace", &dtrace_provider_attr,
14731             DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
14732
14733         ASSERT(dtrace_provider != NULL);
14734         ASSERT((dtrace_provider_id_t)dtrace_provider == id);
14735
14736         dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
14737             dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
14738         dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
14739             dtrace_provider, NULL, NULL, "END", 0, NULL);
14740         dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
14741             dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
14742
14743         dtrace_anon_property();
14744         mutex_exit(&cpu_lock);
14745
14746         /*
14747          * If DTrace helper tracing is enabled, we need to allocate the
14748          * trace buffer and initialize the values.
14749          */
14750         if (dtrace_helptrace_enabled) {
14751                 ASSERT(dtrace_helptrace_buffer == NULL);
14752                 dtrace_helptrace_buffer =
14753                     kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
14754                 dtrace_helptrace_next = 0;
14755         }
14756
14757         /*
14758          * If there are already providers, we must ask them to provide their
14759          * probes, and then match any anonymous enabling against them.  Note
14760          * that there should be no other retained enablings at this time:
14761          * the only retained enablings at this time should be the anonymous
14762          * enabling.
14763          */
14764         if (dtrace_anon.dta_enabling != NULL) {
14765                 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
14766
14767                 dtrace_enabling_provide(NULL);
14768                 state = dtrace_anon.dta_state;
14769
14770                 /*
14771                  * We couldn't hold cpu_lock across the above call to
14772                  * dtrace_enabling_provide(), but we must hold it to actually
14773                  * enable the probes.  We have to drop all of our locks, pick
14774                  * up cpu_lock, and regain our locks before matching the
14775                  * retained anonymous enabling.
14776                  */
14777                 mutex_exit(&dtrace_lock);
14778                 mutex_exit(&dtrace_provider_lock);
14779
14780                 mutex_enter(&cpu_lock);
14781                 mutex_enter(&dtrace_provider_lock);
14782                 mutex_enter(&dtrace_lock);
14783
14784                 if ((enab = dtrace_anon.dta_enabling) != NULL)
14785                         (void) dtrace_enabling_match(enab, NULL);
14786
14787                 mutex_exit(&cpu_lock);
14788         }
14789
14790         mutex_exit(&dtrace_lock);
14791         mutex_exit(&dtrace_provider_lock);
14792
14793         if (state != NULL) {
14794                 /*
14795                  * If we created any anonymous state, set it going now.
14796                  */
14797                 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
14798         }
14799
14800         return (DDI_SUCCESS);
14801 }
14802
14803 /*ARGSUSED*/
14804 static int
14805 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
14806 {
14807         dtrace_state_t *state;
14808         uint32_t priv;
14809         uid_t uid;
14810         zoneid_t zoneid;
14811
14812         if (getminor(*devp) == DTRACEMNRN_HELPER)
14813                 return (0);
14814
14815         /*
14816          * If this wasn't an open with the "helper" minor, then it must be
14817          * the "dtrace" minor.
14818          */
14819         if (getminor(*devp) != DTRACEMNRN_DTRACE)
14820                 return (ENXIO);
14821
14822         /*
14823          * If no DTRACE_PRIV_* bits are set in the credential, then the
14824          * caller lacks sufficient permission to do anything with DTrace.
14825          */
14826         dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
14827         if (priv == DTRACE_PRIV_NONE)
14828                 return (EACCES);
14829
14830         /*
14831          * Ask all providers to provide all their probes.
14832          */
14833         mutex_enter(&dtrace_provider_lock);
14834         dtrace_probe_provide(NULL, NULL);
14835         mutex_exit(&dtrace_provider_lock);
14836
14837         mutex_enter(&cpu_lock);
14838         mutex_enter(&dtrace_lock);
14839         dtrace_opens++;
14840         dtrace_membar_producer();
14841
14842         /*
14843          * If the kernel debugger is active (that is, if the kernel debugger
14844          * modified text in some way), we won't allow the open.
14845          */
14846         if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14847                 dtrace_opens--;
14848                 mutex_exit(&cpu_lock);
14849                 mutex_exit(&dtrace_lock);
14850                 return (EBUSY);
14851         }
14852
14853         state = dtrace_state_create(devp, cred_p);
14854         mutex_exit(&cpu_lock);
14855
14856         if (state == NULL) {
14857                 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
14858                         (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14859                 mutex_exit(&dtrace_lock);
14860                 return (EAGAIN);
14861         }
14862
14863         mutex_exit(&dtrace_lock);
14864
14865         return (0);
14866 }
14867
14868 /*ARGSUSED*/
14869 static int
14870 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
14871 {
14872         minor_t minor = getminor(dev);
14873         dtrace_state_t *state;
14874
14875         if (minor == DTRACEMNRN_HELPER)
14876                 return (0);
14877
14878         state = ddi_get_soft_state(dtrace_softstate, minor);
14879
14880         mutex_enter(&cpu_lock);
14881         mutex_enter(&dtrace_lock);
14882
14883         if (state->dts_anon) {
14884                 /*
14885                  * There is anonymous state. Destroy that first.
14886                  */
14887                 ASSERT(dtrace_anon.dta_state == NULL);
14888                 dtrace_state_destroy(state->dts_anon);
14889         }
14890
14891         dtrace_state_destroy(state);
14892         ASSERT(dtrace_opens > 0);
14893
14894         /*
14895          * Only relinquish control of the kernel debugger interface when there
14896          * are no consumers and no anonymous enablings.
14897          */
14898         if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
14899                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
14900
14901         mutex_exit(&dtrace_lock);
14902         mutex_exit(&cpu_lock);
14903
14904         return (0);
14905 }
14906
14907 /*ARGSUSED*/
14908 static int
14909 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
14910 {
14911         int rval;
14912         dof_helper_t help, *dhp = NULL;
14913
14914         switch (cmd) {
14915         case DTRACEHIOC_ADDDOF:
14916                 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
14917                         dtrace_dof_error(NULL, "failed to copyin DOF helper");
14918                         return (EFAULT);
14919                 }
14920
14921                 dhp = &help;
14922                 arg = (intptr_t)help.dofhp_dof;
14923                 /*FALLTHROUGH*/
14924
14925         case DTRACEHIOC_ADD: {
14926                 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
14927
14928                 if (dof == NULL)
14929                         return (rval);
14930
14931                 mutex_enter(&dtrace_lock);
14932
14933                 /*
14934                  * dtrace_helper_slurp() takes responsibility for the dof --
14935                  * it may free it now or it may save it and free it later.
14936                  */
14937                 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
14938                         *rv = rval;
14939                         rval = 0;
14940                 } else {
14941                         rval = EINVAL;
14942                 }
14943
14944                 mutex_exit(&dtrace_lock);
14945                 return (rval);
14946         }
14947
14948         case DTRACEHIOC_REMOVE: {
14949                 mutex_enter(&dtrace_lock);
14950                 rval = dtrace_helper_destroygen(arg);
14951                 mutex_exit(&dtrace_lock);
14952
14953                 return (rval);
14954         }
14955
14956         default:
14957                 break;
14958         }
14959
14960         return (ENOTTY);
14961 }
14962
14963 /*ARGSUSED*/
14964 static int
14965 dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
14966 {
14967         minor_t minor = getminor(dev);
14968         dtrace_state_t *state;
14969         int rval;
14970
14971         if (minor == DTRACEMNRN_HELPER)
14972                 return (dtrace_ioctl_helper(cmd, arg, rv));
14973
14974         state = ddi_get_soft_state(dtrace_softstate, minor);
14975
14976         if (state->dts_anon) {
14977                 ASSERT(dtrace_anon.dta_state == NULL);
14978                 state = state->dts_anon;
14979         }
14980
14981         switch (cmd) {
14982         case DTRACEIOC_PROVIDER: {
14983                 dtrace_providerdesc_t pvd;
14984                 dtrace_provider_t *pvp;
14985
14986                 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
14987                         return (EFAULT);
14988
14989                 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
14990                 mutex_enter(&dtrace_provider_lock);
14991
14992                 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
14993                         if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
14994                                 break;
14995                 }
14996
14997                 mutex_exit(&dtrace_provider_lock);
14998
14999                 if (pvp == NULL)
15000                         return (ESRCH);
15001
15002                 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
15003                 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
15004                 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
15005                         return (EFAULT);
15006
15007                 return (0);
15008         }
15009
15010         case DTRACEIOC_EPROBE: {
15011                 dtrace_eprobedesc_t epdesc;
15012                 dtrace_ecb_t *ecb;
15013                 dtrace_action_t *act;
15014                 void *buf;
15015                 size_t size;
15016                 uintptr_t dest;
15017                 int nrecs;
15018
15019                 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
15020                         return (EFAULT);
15021
15022                 mutex_enter(&dtrace_lock);
15023
15024                 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
15025                         mutex_exit(&dtrace_lock);
15026                         return (EINVAL);
15027                 }
15028
15029                 if (ecb->dte_probe == NULL) {
15030                         mutex_exit(&dtrace_lock);
15031                         return (EINVAL);
15032                 }
15033
15034                 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
15035                 epdesc.dtepd_uarg = ecb->dte_uarg;
15036                 epdesc.dtepd_size = ecb->dte_size;
15037
15038                 nrecs = epdesc.dtepd_nrecs;
15039                 epdesc.dtepd_nrecs = 0;
15040                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15041                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15042                                 continue;
15043
15044                         epdesc.dtepd_nrecs++;
15045                 }
15046
15047                 /*
15048                  * Now that we have the size, we need to allocate a temporary
15049                  * buffer in which to store the complete description.  We need
15050                  * the temporary buffer to be able to drop dtrace_lock()
15051                  * across the copyout(), below.
15052                  */
15053                 size = sizeof (dtrace_eprobedesc_t) +
15054                     (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
15055
15056                 buf = kmem_alloc(size, KM_SLEEP);
15057                 dest = (uintptr_t)buf;
15058
15059                 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
15060                 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
15061
15062                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
15063                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
15064                                 continue;
15065
15066                         if (nrecs-- == 0)
15067                                 break;
15068
15069                         bcopy(&act->dta_rec, (void *)dest,
15070                             sizeof (dtrace_recdesc_t));
15071                         dest += sizeof (dtrace_recdesc_t);
15072                 }
15073
15074                 mutex_exit(&dtrace_lock);
15075
15076                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15077                         kmem_free(buf, size);
15078                         return (EFAULT);
15079                 }
15080
15081                 kmem_free(buf, size);
15082                 return (0);
15083         }
15084
15085         case DTRACEIOC_AGGDESC: {
15086                 dtrace_aggdesc_t aggdesc;
15087                 dtrace_action_t *act;
15088                 dtrace_aggregation_t *agg;
15089                 int nrecs;
15090                 uint32_t offs;
15091                 dtrace_recdesc_t *lrec;
15092                 void *buf;
15093                 size_t size;
15094                 uintptr_t dest;
15095
15096                 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
15097                         return (EFAULT);
15098
15099                 mutex_enter(&dtrace_lock);
15100
15101                 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
15102                         mutex_exit(&dtrace_lock);
15103                         return (EINVAL);
15104                 }
15105
15106                 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
15107
15108                 nrecs = aggdesc.dtagd_nrecs;
15109                 aggdesc.dtagd_nrecs = 0;
15110
15111                 offs = agg->dtag_base;
15112                 lrec = &agg->dtag_action.dta_rec;
15113                 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
15114
15115                 for (act = agg->dtag_first; ; act = act->dta_next) {
15116                         ASSERT(act->dta_intuple ||
15117                             DTRACEACT_ISAGG(act->dta_kind));
15118
15119                         /*
15120                          * If this action has a record size of zero, it
15121                          * denotes an argument to the aggregating action.
15122                          * Because the presence of this record doesn't (or
15123                          * shouldn't) affect the way the data is interpreted,
15124                          * we don't copy it out to save user-level the
15125                          * confusion of dealing with a zero-length record.
15126                          */
15127                         if (act->dta_rec.dtrd_size == 0) {
15128                                 ASSERT(agg->dtag_hasarg);
15129                                 continue;
15130                         }
15131
15132                         aggdesc.dtagd_nrecs++;
15133
15134                         if (act == &agg->dtag_action)
15135                                 break;
15136                 }
15137
15138                 /*
15139                  * Now that we have the size, we need to allocate a temporary
15140                  * buffer in which to store the complete description.  We need
15141                  * the temporary buffer to be able to drop dtrace_lock()
15142                  * across the copyout(), below.
15143                  */
15144                 size = sizeof (dtrace_aggdesc_t) +
15145                     (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
15146
15147                 buf = kmem_alloc(size, KM_SLEEP);
15148                 dest = (uintptr_t)buf;
15149
15150                 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
15151                 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
15152
15153                 for (act = agg->dtag_first; ; act = act->dta_next) {
15154                         dtrace_recdesc_t rec = act->dta_rec;
15155
15156                         /*
15157                          * See the comment in the above loop for why we pass
15158                          * over zero-length records.
15159                          */
15160                         if (rec.dtrd_size == 0) {
15161                                 ASSERT(agg->dtag_hasarg);
15162                                 continue;
15163                         }
15164
15165                         if (nrecs-- == 0)
15166                                 break;
15167
15168                         rec.dtrd_offset -= offs;
15169                         bcopy(&rec, (void *)dest, sizeof (rec));
15170                         dest += sizeof (dtrace_recdesc_t);
15171
15172                         if (act == &agg->dtag_action)
15173                                 break;
15174                 }
15175
15176                 mutex_exit(&dtrace_lock);
15177
15178                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
15179                         kmem_free(buf, size);
15180                         return (EFAULT);
15181                 }
15182
15183                 kmem_free(buf, size);
15184                 return (0);
15185         }
15186
15187         case DTRACEIOC_ENABLE: {
15188                 dof_hdr_t *dof;
15189                 dtrace_enabling_t *enab = NULL;
15190                 dtrace_vstate_t *vstate;
15191                 int err = 0;
15192
15193                 *rv = 0;
15194
15195                 /*
15196                  * If a NULL argument has been passed, we take this as our
15197                  * cue to reevaluate our enablings.
15198                  */
15199                 if (arg == NULL) {
15200                         dtrace_enabling_matchall();
15201
15202                         return (0);
15203                 }
15204
15205                 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
15206                         return (rval);
15207
15208                 mutex_enter(&cpu_lock);
15209                 mutex_enter(&dtrace_lock);
15210                 vstate = &state->dts_vstate;
15211
15212                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
15213                         mutex_exit(&dtrace_lock);
15214                         mutex_exit(&cpu_lock);
15215                         dtrace_dof_destroy(dof);
15216                         return (EBUSY);
15217                 }
15218
15219                 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
15220                         mutex_exit(&dtrace_lock);
15221                         mutex_exit(&cpu_lock);
15222                         dtrace_dof_destroy(dof);
15223                         return (EINVAL);
15224                 }
15225
15226                 if ((rval = dtrace_dof_options(dof, state)) != 0) {
15227                         dtrace_enabling_destroy(enab);
15228                         mutex_exit(&dtrace_lock);
15229                         mutex_exit(&cpu_lock);
15230                         dtrace_dof_destroy(dof);
15231                         return (rval);
15232                 }
15233
15234                 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
15235                         err = dtrace_enabling_retain(enab);
15236                 } else {
15237                         dtrace_enabling_destroy(enab);
15238                 }
15239
15240                 mutex_exit(&cpu_lock);
15241                 mutex_exit(&dtrace_lock);
15242                 dtrace_dof_destroy(dof);
15243
15244                 return (err);
15245         }
15246
15247         case DTRACEIOC_REPLICATE: {
15248                 dtrace_repldesc_t desc;
15249                 dtrace_probedesc_t *match = &desc.dtrpd_match;
15250                 dtrace_probedesc_t *create = &desc.dtrpd_create;
15251                 int err;
15252
15253                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15254                         return (EFAULT);
15255
15256                 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15257                 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15258                 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15259                 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15260
15261                 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15262                 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15263                 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15264                 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15265
15266                 mutex_enter(&dtrace_lock);
15267                 err = dtrace_enabling_replicate(state, match, create);
15268                 mutex_exit(&dtrace_lock);
15269
15270                 return (err);
15271         }
15272
15273         case DTRACEIOC_PROBEMATCH:
15274         case DTRACEIOC_PROBES: {
15275                 dtrace_probe_t *probe = NULL;
15276                 dtrace_probedesc_t desc;
15277                 dtrace_probekey_t pkey;
15278                 dtrace_id_t i;
15279                 int m = 0;
15280                 uint32_t priv;
15281                 uid_t uid;
15282                 zoneid_t zoneid;
15283
15284                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15285                         return (EFAULT);
15286
15287                 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
15288                 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
15289                 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
15290                 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
15291
15292                 /*
15293                  * Before we attempt to match this probe, we want to give
15294                  * all providers the opportunity to provide it.
15295                  */
15296                 if (desc.dtpd_id == DTRACE_IDNONE) {
15297                         mutex_enter(&dtrace_provider_lock);
15298                         dtrace_probe_provide(&desc, NULL);
15299                         mutex_exit(&dtrace_provider_lock);
15300                         desc.dtpd_id++;
15301                 }
15302
15303                 if (cmd == DTRACEIOC_PROBEMATCH)  {
15304                         dtrace_probekey(&desc, &pkey);
15305                         pkey.dtpk_id = DTRACE_IDNONE;
15306                 }
15307
15308                 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
15309
15310                 mutex_enter(&dtrace_lock);
15311
15312                 if (cmd == DTRACEIOC_PROBEMATCH) {
15313                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15314                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
15315                                     (m = dtrace_match_probe(probe, &pkey,
15316                                     priv, uid, zoneid)) != 0)
15317                                         break;
15318                         }
15319
15320                         if (m < 0) {
15321                                 mutex_exit(&dtrace_lock);
15322                                 return (EINVAL);
15323                         }
15324
15325                 } else {
15326                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
15327                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
15328                                     dtrace_match_priv(probe, priv, uid, zoneid))
15329                                         break;
15330                         }
15331                 }
15332
15333                 if (probe == NULL) {
15334                         mutex_exit(&dtrace_lock);
15335                         return (ESRCH);
15336                 }
15337
15338                 dtrace_probe_description(probe, &desc);
15339                 mutex_exit(&dtrace_lock);
15340
15341                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15342                         return (EFAULT);
15343
15344                 return (0);
15345         }
15346
15347         case DTRACEIOC_PROBEARG: {
15348                 dtrace_argdesc_t desc;
15349                 dtrace_probe_t *probe;
15350                 dtrace_provider_t *prov;
15351
15352                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15353                         return (EFAULT);
15354
15355                 if (desc.dtargd_id == DTRACE_IDNONE)
15356                         return (EINVAL);
15357
15358                 if (desc.dtargd_ndx == DTRACE_ARGNONE)
15359                         return (EINVAL);
15360
15361                 mutex_enter(&dtrace_provider_lock);
15362                 mutex_enter(&mod_lock);
15363                 mutex_enter(&dtrace_lock);
15364
15365                 if (desc.dtargd_id > dtrace_nprobes) {
15366                         mutex_exit(&dtrace_lock);
15367                         mutex_exit(&mod_lock);
15368                         mutex_exit(&dtrace_provider_lock);
15369                         return (EINVAL);
15370                 }
15371
15372                 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
15373                         mutex_exit(&dtrace_lock);
15374                         mutex_exit(&mod_lock);
15375                         mutex_exit(&dtrace_provider_lock);
15376                         return (EINVAL);
15377                 }
15378
15379                 mutex_exit(&dtrace_lock);
15380
15381                 prov = probe->dtpr_provider;
15382
15383                 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
15384                         /*
15385                          * There isn't any typed information for this probe.
15386                          * Set the argument number to DTRACE_ARGNONE.
15387                          */
15388                         desc.dtargd_ndx = DTRACE_ARGNONE;
15389                 } else {
15390                         desc.dtargd_native[0] = '\0';
15391                         desc.dtargd_xlate[0] = '\0';
15392                         desc.dtargd_mapping = desc.dtargd_ndx;
15393
15394                         prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
15395                             probe->dtpr_id, probe->dtpr_arg, &desc);
15396                 }
15397
15398                 mutex_exit(&mod_lock);
15399                 mutex_exit(&dtrace_provider_lock);
15400
15401                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15402                         return (EFAULT);
15403
15404                 return (0);
15405         }
15406
15407         case DTRACEIOC_GO: {
15408                 processorid_t cpuid;
15409                 rval = dtrace_state_go(state, &cpuid);
15410
15411                 if (rval != 0)
15412                         return (rval);
15413
15414                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15415                         return (EFAULT);
15416
15417                 return (0);
15418         }
15419
15420         case DTRACEIOC_STOP: {
15421                 processorid_t cpuid;
15422
15423                 mutex_enter(&dtrace_lock);
15424                 rval = dtrace_state_stop(state, &cpuid);
15425                 mutex_exit(&dtrace_lock);
15426
15427                 if (rval != 0)
15428                         return (rval);
15429
15430                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
15431                         return (EFAULT);
15432
15433                 return (0);
15434         }
15435
15436         case DTRACEIOC_DOFGET: {
15437                 dof_hdr_t hdr, *dof;
15438                 uint64_t len;
15439
15440                 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
15441                         return (EFAULT);
15442
15443                 mutex_enter(&dtrace_lock);
15444                 dof = dtrace_dof_create(state);
15445                 mutex_exit(&dtrace_lock);
15446
15447                 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
15448                 rval = copyout(dof, (void *)arg, len);
15449                 dtrace_dof_destroy(dof);
15450
15451                 return (rval == 0 ? 0 : EFAULT);
15452         }
15453
15454         case DTRACEIOC_AGGSNAP:
15455         case DTRACEIOC_BUFSNAP: {
15456                 dtrace_bufdesc_t desc;
15457                 caddr_t cached;
15458                 dtrace_buffer_t *buf;
15459
15460                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
15461                         return (EFAULT);
15462
15463                 if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
15464                         return (EINVAL);
15465
15466                 mutex_enter(&dtrace_lock);
15467
15468                 if (cmd == DTRACEIOC_BUFSNAP) {
15469                         buf = &state->dts_buffer[desc.dtbd_cpu];
15470                 } else {
15471                         buf = &state->dts_aggbuffer[desc.dtbd_cpu];
15472                 }
15473
15474                 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
15475                         size_t sz = buf->dtb_offset;
15476
15477                         if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
15478                                 mutex_exit(&dtrace_lock);
15479                                 return (EBUSY);
15480                         }
15481
15482                         /*
15483                          * If this buffer has already been consumed, we're
15484                          * going to indicate that there's nothing left here
15485                          * to consume.
15486                          */
15487                         if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
15488                                 mutex_exit(&dtrace_lock);
15489
15490                                 desc.dtbd_size = 0;
15491                                 desc.dtbd_drops = 0;
15492                                 desc.dtbd_errors = 0;
15493                                 desc.dtbd_oldest = 0;
15494                                 sz = sizeof (desc);
15495
15496                                 if (copyout(&desc, (void *)arg, sz) != 0)
15497                                         return (EFAULT);
15498
15499                                 return (0);
15500                         }
15501
15502                         /*
15503                          * If this is a ring buffer that has wrapped, we want
15504                          * to copy the whole thing out.
15505                          */
15506                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
15507                                 dtrace_buffer_polish(buf);
15508                                 sz = buf->dtb_size;
15509                         }
15510
15511                         if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
15512                                 mutex_exit(&dtrace_lock);
15513                                 return (EFAULT);
15514                         }
15515
15516                         desc.dtbd_size = sz;
15517                         desc.dtbd_drops = buf->dtb_drops;
15518                         desc.dtbd_errors = buf->dtb_errors;
15519                         desc.dtbd_oldest = buf->dtb_xamot_offset;
15520
15521                         mutex_exit(&dtrace_lock);
15522
15523                         if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15524                                 return (EFAULT);
15525
15526                         buf->dtb_flags |= DTRACEBUF_CONSUMED;
15527
15528                         return (0);
15529                 }
15530
15531                 if (buf->dtb_tomax == NULL) {
15532                         ASSERT(buf->dtb_xamot == NULL);
15533                         mutex_exit(&dtrace_lock);
15534                         return (ENOENT);
15535                 }
15536
15537                 cached = buf->dtb_tomax;
15538                 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
15539
15540                 dtrace_xcall(desc.dtbd_cpu,
15541                     (dtrace_xcall_t)dtrace_buffer_switch, buf);
15542
15543                 state->dts_errors += buf->dtb_xamot_errors;
15544
15545                 /*
15546                  * If the buffers did not actually switch, then the cross call
15547                  * did not take place -- presumably because the given CPU is
15548                  * not in the ready set.  If this is the case, we'll return
15549                  * ENOENT.
15550                  */
15551                 if (buf->dtb_tomax == cached) {
15552                         ASSERT(buf->dtb_xamot != cached);
15553                         mutex_exit(&dtrace_lock);
15554                         return (ENOENT);
15555                 }
15556
15557                 ASSERT(cached == buf->dtb_xamot);
15558
15559                 /*
15560                  * We have our snapshot; now copy it out.
15561                  */
15562                 if (copyout(buf->dtb_xamot, desc.dtbd_data,
15563                     buf->dtb_xamot_offset) != 0) {
15564                         mutex_exit(&dtrace_lock);
15565                         return (EFAULT);
15566                 }
15567
15568                 desc.dtbd_size = buf->dtb_xamot_offset;
15569                 desc.dtbd_drops = buf->dtb_xamot_drops;
15570                 desc.dtbd_errors = buf->dtb_xamot_errors;
15571                 desc.dtbd_oldest = 0;
15572
15573                 mutex_exit(&dtrace_lock);
15574
15575                 /*
15576                  * Finally, copy out the buffer description.
15577                  */
15578                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
15579                         return (EFAULT);
15580
15581                 return (0);
15582         }
15583
15584         case DTRACEIOC_CONF: {
15585                 dtrace_conf_t conf;
15586
15587                 bzero(&conf, sizeof (conf));
15588                 conf.dtc_difversion = DIF_VERSION;
15589                 conf.dtc_difintregs = DIF_DIR_NREGS;
15590                 conf.dtc_diftupregs = DIF_DTR_NREGS;
15591                 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
15592
15593                 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
15594                         return (EFAULT);
15595
15596                 return (0);
15597         }
15598
15599         case DTRACEIOC_STATUS: {
15600                 dtrace_status_t stat;
15601                 dtrace_dstate_t *dstate;
15602                 int i, j;
15603                 uint64_t nerrs;
15604
15605                 /*
15606                  * See the comment in dtrace_state_deadman() for the reason
15607                  * for setting dts_laststatus to INT64_MAX before setting
15608                  * it to the correct value.
15609                  */
15610                 state->dts_laststatus = INT64_MAX;
15611                 dtrace_membar_producer();
15612                 state->dts_laststatus = dtrace_gethrtime();
15613
15614                 bzero(&stat, sizeof (stat));
15615
15616                 mutex_enter(&dtrace_lock);
15617
15618                 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
15619                         mutex_exit(&dtrace_lock);
15620                         return (ENOENT);
15621                 }
15622
15623                 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
15624                         stat.dtst_exiting = 1;
15625
15626                 nerrs = state->dts_errors;
15627                 dstate = &state->dts_vstate.dtvs_dynvars;
15628
15629                 for (i = 0; i < NCPU; i++) {
15630                         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
15631
15632                         stat.dtst_dyndrops += dcpu->dtdsc_drops;
15633                         stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
15634                         stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
15635
15636                         if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
15637                                 stat.dtst_filled++;
15638
15639                         nerrs += state->dts_buffer[i].dtb_errors;
15640
15641                         for (j = 0; j < state->dts_nspeculations; j++) {
15642                                 dtrace_speculation_t *spec;
15643                                 dtrace_buffer_t *buf;
15644
15645                                 spec = &state->dts_speculations[j];
15646                                 buf = &spec->dtsp_buffer[i];
15647                                 stat.dtst_specdrops += buf->dtb_xamot_drops;
15648                         }
15649                 }
15650
15651                 stat.dtst_specdrops_busy = state->dts_speculations_busy;
15652                 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
15653                 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
15654                 stat.dtst_dblerrors = state->dts_dblerrors;
15655                 stat.dtst_killed =
15656                     (state->dts_activity == DTRACE_ACTIVITY_KILLED);
15657                 stat.dtst_errors = nerrs;
15658
15659                 mutex_exit(&dtrace_lock);
15660
15661                 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
15662                         return (EFAULT);
15663
15664                 return (0);
15665         }
15666
15667         case DTRACEIOC_FORMAT: {
15668                 dtrace_fmtdesc_t fmt;
15669                 char *str;
15670                 int len;
15671
15672                 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
15673                         return (EFAULT);
15674
15675                 mutex_enter(&dtrace_lock);
15676
15677                 if (fmt.dtfd_format == 0 ||
15678                     fmt.dtfd_format > state->dts_nformats) {
15679                         mutex_exit(&dtrace_lock);
15680                         return (EINVAL);
15681                 }
15682
15683                 /*
15684                  * Format strings are allocated contiguously and they are
15685                  * never freed; if a format index is less than the number
15686                  * of formats, we can assert that the format map is non-NULL
15687                  * and that the format for the specified index is non-NULL.
15688                  */
15689                 ASSERT(state->dts_formats != NULL);
15690                 str = state->dts_formats[fmt.dtfd_format - 1];
15691                 ASSERT(str != NULL);
15692
15693                 len = strlen(str) + 1;
15694
15695                 if (len > fmt.dtfd_length) {
15696                         fmt.dtfd_length = len;
15697
15698                         if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
15699                                 mutex_exit(&dtrace_lock);
15700                                 return (EINVAL);
15701                         }
15702                 } else {
15703                         if (copyout(str, fmt.dtfd_string, len) != 0) {
15704                                 mutex_exit(&dtrace_lock);
15705                                 return (EINVAL);
15706                         }
15707                 }
15708
15709                 mutex_exit(&dtrace_lock);
15710                 return (0);
15711         }
15712
15713         default:
15714                 break;
15715         }
15716
15717         return (ENOTTY);
15718 }
15719
/*
 * dtrace_detach() -- detach entry point of the DTrace pseudo driver
 * (installed in the "detach" slot of dtrace_ops).
 *
 * cmd handling:
 *	DDI_DETACH	perform the full teardown described below
 *	DDI_SUSPEND	nothing to do; DDI_SUCCESS is returned immediately
 *	anything else	DDI_FAILURE
 *
 * For DDI_DETACH the routine fails with DDI_FAILURE, leaving the framework
 * state in place, if dtrace_helpers is greater than zero or if
 * dtrace_unregister() of dtrace_provider returns non-zero.  Otherwise it
 * destroys any anonymous state, clears the dtrace_* pointers listed below,
 * frees the probe array, the three probe hashes, the state cache, the minor
 * and DTrace arenas, the toxic range table, the minor node and the soft
 * state, destroys the task queue, and returns DDI_SUCCESS.
 *
 * Locking: cpu_lock, dtrace_provider_lock and dtrace_lock are acquired in
 * that order.  cpu_lock is dropped after the CPU setup callback has been
 * unregistered and the pointers have been cleared; dtrace_lock and
 * dtrace_provider_lock are held until just before taskq_destroy().
 *
 * The dip argument is not referenced (hence the ARGSUSED lint directive).
 */
15720 /*ARGSUSED*/
15721 static int
15722 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
15723 {
15724         dtrace_state_t *state;
15725
              /*
               * Only DDI_DETACH falls out of this switch into the teardown
               * code; every other command returns from here.
               */
15726         switch (cmd) {
15727         case DDI_DETACH:
15728                 break;
15729
15730         case DDI_SUSPEND:
15731                 return (DDI_SUCCESS);
15732
15733         default:
15734                 return (DDI_FAILURE);
15735         }
15736
15737         mutex_enter(&cpu_lock);
15738         mutex_enter(&dtrace_provider_lock);
15739         mutex_enter(&dtrace_lock);
15740
15741         ASSERT(dtrace_opens == 0);
15742
              /*
               * Refuse to detach while dtrace_helpers is non-zero.
               * NOTE(review): presumably this counts outstanding helper
               * users -- confirm against where it is incremented.
               */
15743         if (dtrace_helpers > 0) {
15744                 mutex_exit(&dtrace_provider_lock);
15745                 mutex_exit(&dtrace_lock);
15746                 mutex_exit(&cpu_lock);
15747                 return (DDI_FAILURE);
15748         }
15749
              /*
               * If dtrace_provider cannot be unregistered, release all three
               * locks and fail the detach without having torn anything down.
               */
15750         if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
15751                 mutex_exit(&dtrace_provider_lock);
15752                 mutex_exit(&dtrace_lock);
15753                 mutex_exit(&cpu_lock);
15754                 return (DDI_FAILURE);
15755         }
15756
15757         dtrace_provider = NULL;
15758
15759         if ((state = dtrace_anon_grab()) != NULL) {
15760                 /*
15761                  * If there were ECBs on this state, the provider should
15762                  * have not been allowed to detach; assert that there is
15763                  * none.
15764                  */
15765                 ASSERT(state->dts_necbs == 0);
15766                 dtrace_state_destroy(state);
15767
15768                 /*
15769                  * If we're being detached with anonymous state, we need to
15770                  * indicate to the kernel debugger that DTrace is now inactive.
15771                  */
15772                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15773         }
15774
              /*
               * While cpu_lock is still held: zero the anonymous-state record,
               * unregister dtrace_cpu_setup via unregister_cpu_setup_func(),
               * and reset the pointers below to NULL.
               * NOTE(review): these pointers look like hooks consulted by
               * code outside this function -- confirm who reads them.
               */
15775         bzero(&dtrace_anon, sizeof (dtrace_anon_t));
15776         unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
15777         dtrace_cpu_init = NULL;
15778         dtrace_helpers_cleanup = NULL;
15779         dtrace_helpers_fork = NULL;
15780         dtrace_cpustart_init = NULL;
15781         dtrace_cpustart_fini = NULL;
15782         dtrace_debugger_init = NULL;
15783         dtrace_debugger_fini = NULL;
15784         dtrace_modload = NULL;
15785         dtrace_modunload = NULL;
15786
              /*
               * cpu_lock is released here; dtrace_provider_lock and
               * dtrace_lock stay held for the remainder of the teardown.
               */
15787         mutex_exit(&cpu_lock);
15788
              /*
               * The helper trace buffer is freed only when
               * dtrace_helptrace_enabled is set.
               * NOTE(review): presumably the buffer is allocated under the
               * same condition elsewhere -- confirm.
               */
15789         if (dtrace_helptrace_enabled) {
15790                 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
15791                 dtrace_helptrace_buffer = NULL;
15792         }
15793
              /*
               * Free the probe array, sized as dtrace_nprobes pointers, and
               * reset its pointer and count.
               */
15794         kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
15795         dtrace_probes = NULL;
15796         dtrace_nprobes = 0;
15797
              /*
               * Destroy the by-module, by-function and by-name hashes and
               * clear the globals that referred to them.
               */
15798         dtrace_hash_destroy(dtrace_bymod);
15799         dtrace_hash_destroy(dtrace_byfunc);
15800         dtrace_hash_destroy(dtrace_byname);
15801         dtrace_bymod = NULL;
15802         dtrace_byfunc = NULL;
15803         dtrace_byname = NULL;
15804
15805         kmem_cache_destroy(dtrace_state_cache);
15806         vmem_destroy(dtrace_minor);
15807         vmem_destroy(dtrace_arena);
15808
              /*
               * The toxic range table is optional; when present it is freed
               * using its capacity (dtrace_toxranges_max), and both the
               * in-use count and the capacity are reset to zero.
               */
15809         if (dtrace_toxrange != NULL) {
15810                 kmem_free(dtrace_toxrange,
15811                     dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
15812                 dtrace_toxrange = NULL;
15813                 dtrace_toxranges = 0;
15814                 dtrace_toxranges_max = 0;
15815         }
15816
              /*
               * Remove the minor node(s) from the saved dtrace_devi (a NULL
               * name is passed) and forget the devinfo pointer.
               */
15817         ddi_remove_minor_node(dtrace_devi, NULL);
15818         dtrace_devi = NULL;
15819
15820         ddi_soft_state_fini(&dtrace_softstate);
15821
              /*
               * Sanity checks (ASSERT only): nothing may still reference the
               * framework at this point.
               */
15822         ASSERT(dtrace_vtime_references == 0);
15823         ASSERT(dtrace_opens == 0);
15824         ASSERT(dtrace_retained == NULL);
15825
15826         mutex_exit(&dtrace_lock);
15827         mutex_exit(&dtrace_provider_lock);
15828
15829         /*
15830          * We don't destroy the task queue until after we have dropped our
15831          * locks (taskq_destroy() may block on running tasks).  To prevent
15832          * attempting to do work after we have effectively detached but before
15833          * the task queue has been destroyed, all tasks dispatched via the
15834          * task queue must check that DTrace is still attached before
15835          * performing any operation.
15836          */
15837         taskq_destroy(dtrace_taskq);
15838         dtrace_taskq = NULL;
15839
15840         return (DDI_SUCCESS);
15841 }
15842
/*
 * dtrace_info() -- device information entry point of the DTrace pseudo
 * driver (installed in the "get_dev_info" slot of dtrace_ops).
 *
 * infocmd handling:
 *	DDI_INFO_DEVT2DEVINFO	*result is set to dtrace_devi; DDI_SUCCESS
 *	DDI_INFO_DEVT2INSTANCE	*result is set to (void *)0;   DDI_SUCCESS
 *	anything else		*result is left untouched;     DDI_FAILURE
 *
 * The dip and arg arguments are not referenced (hence the ARGSUSED lint
 * directive); the answer does not depend on which dev_t was asked about.
 */
15843 /*ARGSUSED*/
15844 static int
15845 dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
15846 {
15847         int error;
15848
15849         switch (infocmd) {
15850         case DDI_INFO_DEVT2DEVINFO:
15851                 *result = (void *)dtrace_devi;
15852                 error = DDI_SUCCESS;
15853                 break;
15854         case DDI_INFO_DEVT2INSTANCE:
                      /* The instance is reported as 0 unconditionally. */
15855                 *result = (void *)0;
15856                 error = DDI_SUCCESS;
15857                 break;
15858         default:
                      /* Unrecognized query: *result is not written. */
15859                 error = DDI_FAILURE;
15860         }
15861         return (error);
15862 }
15863
/*
 * Character/block entry point table for the DTrace pseudo device,
 * initialized positionally (the per-line comments name each slot).
 *
 * Only the open, close and ioctl slots point at DTrace routines
 * (dtrace_open, dtrace_close, dtrace_ioctl).  Every other operation slot is
 * filled with one of the nulldev, nodev or nochpoll stubs, property
 * operations go to ddi_prop_op, and the streamtab slot is 0.  The
 * compatibility flags are D_NEW | D_MP.
 */
15864 static struct cb_ops dtrace_cb_ops = {
15865         dtrace_open,            /* open */
15866         dtrace_close,           /* close */
15867         nulldev,                /* strategy */
15868         nulldev,                /* print */
15869         nodev,                  /* dump */
15870         nodev,                  /* read */
15871         nodev,                  /* write */
15872         dtrace_ioctl,           /* ioctl */
15873         nodev,                  /* devmap */
15874         nodev,                  /* mmap */
15875         nodev,                  /* segmap */
15876         nochpoll,               /* poll */
15877         ddi_prop_op,            /* cb_prop_op */
15878         0,                      /* streamtab  */
15879         D_NEW | D_MP            /* Driver compatibility flag */
15880 };
15881
/*
 * Device operations table for the DTrace pseudo driver, initialized
 * positionally (the per-line comments name each slot).
 *
 * It wires up dtrace_info, dtrace_attach and dtrace_detach, and refers to
 * dtrace_cb_ops for the driver operations.  There are no bus operations
 * (NULL); identify and probe use nulldev, reset and power use nodev, and the
 * quiesce slot is ddi_quiesce_not_needed.
 */
15882 static struct dev_ops dtrace_ops = {
15883         DEVO_REV,               /* devo_rev */
15884         0,                      /* refcnt */
15885         dtrace_info,            /* get_dev_info */
15886         nulldev,                /* identify */
15887         nulldev,                /* probe */
15888         dtrace_attach,          /* attach */
15889         dtrace_detach,          /* detach */
15890         nodev,                  /* reset */
15891         &dtrace_cb_ops,         /* driver operations */
15892         NULL,                   /* bus operations */
15893         nodev,                  /* dev power */
15894         ddi_quiesce_not_needed,         /* quiesce */
15895 };
15896
/*
 * Loadable-driver linkage record: identifies this module as a driver
 * (mod_driverops), gives it the name "Dynamic Tracing", and points at
 * dtrace_ops for its device operations.
 */
15897 static struct modldrv modldrv = {
15898         &mod_driverops,         /* module type (this is a pseudo driver) */
15899         "Dynamic Tracing",      /* name of module */
15900         &dtrace_ops,            /* driver ops */
15901 };
15902
/*
 * Module linkage handed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini().  It carries the revision MODREV_1 and a
 * NULL-terminated list whose only entry is modldrv.
 */
15903 static struct modlinkage modlinkage = {
15904         MODREV_1,
15905         (void *)&modldrv,
15906         NULL
15907 };
15908
/*
 * _init() -- installs the module described by modlinkage through
 * mod_install() and returns mod_install()'s result unchanged.  No other
 * setup is performed here.
 */
15909 int
15910 _init(void)
15911 {
15912         return (mod_install(&modlinkage));
15913 }
15914
/*
 * _info() -- reports information about the module described by modlinkage
 * into the caller-supplied modinfop through mod_info(), returning
 * mod_info()'s result unchanged.
 */
15915 int
15916 _info(struct modinfo *modinfop)
15917 {
15918         return (mod_info(&modlinkage, modinfop));
15919 }
15920
/*
 * _fini() -- removes the module described by modlinkage through
 * mod_remove() and returns mod_remove()'s result unchanged.  No cleanup is
 * done here; the teardown of framework state lives in dtrace_detach().
 * NOTE(review): presumably mod_remove() fails while the driver is still
 * attached, so no explicit busy check is needed here -- confirm.
 */
15921 int
15922 _fini(void)
15923 {
15924         return (mod_remove(&modlinkage));
15925 }