kernel/os/exec.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25
  26 /*      Copyright (c) 1988 AT&T */
  27 /*        All Rights Reserved   */
  28 /*
  29  * Copyright 2017 Joyent, Inc.
  30  */
  31
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/sysmacros.h>
  35 #include <sys/systm.h>
  36 #include <sys/signal.h>
  37 #include <sys/cred_impl.h>
  38 #include <sys/policy.h>
  39 #include <sys/user.h>
  40 #include <sys/errno.h>
  41 #include <sys/file.h>
  42 #include <sys/vfs.h>
  43 #include <sys/vnode.h>
  44 #include <sys/mman.h>
  45 #include <sys/acct.h>
  46 #include <sys/cpuvar.h>
  47 #include <sys/proc.h>
  48 #include <sys/cmn_err.h>
  49 #include <sys/debug.h>
  50 #include <sys/pathname.h>
  51 #include <sys/vm.h>
  52 #include <sys/lgrp.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/exec.h>
  55 #include <sys/exechdr.h>
  56 #include <sys/kmem.h>
  57 #include <sys/prsystm.h>
  58 #include <sys/modctl.h>
  59 #include <sys/vmparam.h>
  60 #include <sys/door.h>
  61 #include <sys/schedctl.h>
  62 #include <sys/utrap.h>
  63 #include <sys/systeminfo.h>
  64 #include <sys/stack.h>
  65 #include <sys/rctl.h>
  66 #include <sys/dtrace.h>
  67 #include <sys/lwpchan_impl.h>
  68 #include <sys/pool.h>
  69 #include <sys/sdt.h>
  70 #include <sys/brand.h>
  71 #include <sys/klpd.h>
  72 #include <sys/random.h>
  73
  74 #include <c2/audit.h>
  75
  76 #include <vm/hat.h>
  77 #include <vm/anon.h>
  78 #include <vm/as.h>
  79 #include <vm/seg.h>
  80 #include <vm/seg_vn.h>
  81 #include <vm/seg_hole.h>
  82
     /*
      * Flag bits describing how credentials must change across an exec.
      * gexec() receives them from execsetid() in its privflags variable and
      * may OR in further bits itself.  The value 0x10 is not assigned here.
      */
  83 #define PRIV_RESET              0x01    /* needs to reset privs */
  84 #define PRIV_SETID              0x02    /* needs to change uids */
  85 #define PRIV_SETUGID            0x04    /* is setuid/setgid/forced privs */
  86 #define PRIV_INCREASE           0x08    /* child runs with more privs */
  87 #define PRIV_FORCED             0x20    /* has forced privileges */
  88
     /* Forward declarations of file-local helpers. */
  89 static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *,
  90     priv_set_t *, cred_t *, const char *);
  91 static int hold_execsw(struct execsw *);
  92
  93 uint_t auxv_hwcap = 0;  /* auxv AT_SUN_HWCAP value; determined on the fly */
  94 uint_t auxv_hwcap_2 = 0;        /* AT_SUN_HWCAP2 */
  95 #if defined(_SYSCALL32_IMPL)
  96 uint_t auxv_hwcap32 = 0;        /* 32-bit version of auxv_hwcap */
  97 uint_t auxv_hwcap32_2 = 0;      /* 32-bit version of auxv_hwcap2 */
  98 #endif
  99
     /*
      * Process flags that gexec() clears from p_flag at level 0, remembering
      * the previous setting so that it can be restored if an error occurs.
      */
 100 #define PSUIDFLAGS              (SNOCD|SUGID)
 101
 102 /*
 103  * These are consumed within the specific exec modules, but are defined here
 104  * because
 105  *
 106  * 1) The exec modules are unloadable, which would make this near useless.
 107  *
 108  * 2) We want them to be common across all of them, should more than ELF come
 109  *    to support them.
 110  *
 111  * All must be powers of 2.
 112  */
 113 size_t aslr_max_brk_skew = 16 * 1024 * 1024; /* 16MB */
 114 #pragma weak exec_stackgap = aslr_max_stack_skew /* Old, compatible name */
 115 size_t aslr_max_stack_skew = 64 * 1024; /* 64KB */
 116
 117 /*
 118  * Size of guard segment for 64-bit processes and minimum size it can be shrunk
 119  * to in the case of grow() operations.  These are kept as variables in case
 120  * they need to be tuned in an emergency.
 121  */
 122 size_t stack_guard_seg_sz = 256 * 1024 * 1024;
 123 size_t stack_guard_min_sz = 64 * 1024 * 1024;
 124
 125 /*
 126  * exece() - entry point for the exec system call.  Runs exec_common()
 127  * with no brand action; a failure is reported through set_errno().
 128  */
 129 int
 130 exece(const char *fname, const char **argp, const char **envp)
 131 {
 132         int rc = exec_common(fname, argp, envp, EBA_NONE);
 133
 134         return (rc == 0 ? 0 : set_errno(rc));
 135 }
 136
     /*
      * exec_common() - common exec path; exece() calls it with EBA_NONE.
      *
      * Looks up fname (a user-space path), applies the exec policy checks and
      * hands the resulting vnode to gexec() together with argp and envp.  When
      * gexec() succeeds, state left over from the old program is reset before
      * returning: signal handling, saved resource limits, profiling,
      * close-on-exec files, and the lwp directory and tid hash.
      *
      * brand_action is EBA_NONE when no explicit brand change is requested.
      * Any other value is rejected with ENOTSUP unless the process's zone is
      * branded; EBA_NATIVE additionally requires a branded process, and the
      * remaining values require an unbranded one.
      *
      * Returns 0 on success or an errno value on failure.  ENOTSUP is also
      * returned when the caller is the /proc agent lwp.
      */
 137 int
 138 exec_common(const char *fname, const char **argp, const char **envp,
 139     int brand_action)
 140 {
 141         vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
 142         proc_t *p = ttoproc(curthread);
 143         klwp_t *lwp = ttolwp(curthread);
 144         struct user *up = PTOU(p);
 145         long execsz;            /* temporary count of exec size */
 146         int i;
 147         int error;
 148         char exec_file[MAXCOMLEN+1];
 149         struct pathname pn;
 150         struct pathname resolvepn;
 151         struct uarg args;
 152         struct execa ua;
 153         k_sigset_t savedmask;
 154         lwpdir_t *lwpdir = NULL;
 155         tidhash_t *tidhash;
 156         lwpdir_t *old_lwpdir = NULL;
 157         uint_t old_lwpdir_sz;
 158         tidhash_t *old_tidhash;
 159         uint_t old_tidhash_sz;
 160         ret_tidhash_t *ret_tidhash;
 161         lwpent_t *lep;
 162         boolean_t brandme = B_FALSE;
 163
 164         /*
 165          * exec() is not supported for the /proc agent lwp.
 166          */
 167         if (curthread == p->p_agenttp)
 168                 return (ENOTSUP);
 169
             /*
              * Validate an explicit brand action, or work out whether this
              * process has to be branded as part of the exec (brandme).
              */
 170         if (brand_action != EBA_NONE) {
 171                 /*
 172                  * Brand actions are not supported for processes that are not
 173                  * running in a branded zone.
 174                  */
 175                 if (!ZONE_IS_BRANDED(p->p_zone))
 176                         return (ENOTSUP);
 177
 178                 if (brand_action == EBA_NATIVE) {
 179                         /* Only branded processes can be unbranded */
 180                         if (!PROC_IS_BRANDED(p))
 181                                 return (ENOTSUP);
 182                 } else {
 183                         /* Only unbranded processes can be branded */
 184                         if (PROC_IS_BRANDED(p))
 185                                 return (ENOTSUP);
 186                         brandme = B_TRUE;
 187                 }
 188         } else {
 189                 /*
 190                  * If this is a native zone, or if the process is already
 191                  * branded, then we don't need to do anything.  If this is
 192                  * a native process in a branded zone, we need to brand the
 193                  * process as it exec()s the new binary.
 194                  */
 195                 if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
 196                         brandme = B_TRUE;
 197         }
 198
 199         /*
 200          * Inform /proc that an exec() has started.
 201          * Hold signals that are ignored by default so that we will
 202          * not be interrupted by a signal that will be ignored after
 203          * successful completion of gexec().
 204          */
 205         mutex_enter(&p->p_lock);
 206         prexecstart();
 207         schedctl_finish_sigblock(curthread);
 208         savedmask = curthread->t_hold;
 209         sigorset(&curthread->t_hold, &ignoredefault);
 210         mutex_exit(&p->p_lock);
 211
 212         /*
 213          * Look up path name and remember last component for later.
 214          * To help coreadm expand its %d token, we attempt to save
 215          * the directory containing the executable in p_execdir. The
 216          * first call to lookuppn() may fail and return EINVAL because
 217          * dirvpp is non-NULL. In that case, we make a second call to
 218          * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
 219          * but coreadm is allowed to expand %d to the empty string and
 220          * there are other cases in which that failure may occur.
 221          */
 222         if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
 223                 goto out;
 224         pn_alloc(&resolvepn);
 225         if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
 226                 pn_free(&resolvepn);
 227                 pn_free(&pn);
 228                 if (error != EINVAL)
 229                         goto out;
 230
                     /*
                      * EINVAL from the first lookup: fetch the path again and
                      * retry without asking for the containing directory.
                      */
 231                 dir = NULL;
 232                 if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
 233                         goto out;
 234                 pn_alloc(&resolvepn);
 235                 if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
 236                     &vp)) != 0) {
 237                         pn_free(&resolvepn);
 238                         pn_free(&pn);
 239                         goto out;
 240                 }
 241         }
 242         if (vp == NULL) {
 243                 if (dir != NULL)
 244                         VN_RELE(dir);
 245                 error = ENOENT;
 246                 pn_free(&resolvepn);
 247                 pn_free(&pn);
 248                 goto out;
 249         }
 250
 251         if ((error = secpolicy_basic_exec(CRED(), vp)) != 0) {
 252                 if (dir != NULL)
 253                         VN_RELE(dir);
 254                 pn_free(&resolvepn);
 255                 pn_free(&pn);
 256                 VN_RELE(vp);
 257                 goto out;
 258         }
 259
 260         /*
 261          * We do not allow executing files in attribute directories.
 262          * We test this by determining whether the resolved path
 263          * contains a "/" when we're in an attribute directory;
 264          * only if the pathname does not contain a "/" the resolved path
 265          * points to a file in the current working (attribute) directory.
 266          */
 267         if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
 268             strchr(resolvepn.pn_path, '/') == NULL) {
 269                 if (dir != NULL)
 270                         VN_RELE(dir);
 271                 error = EACCES;
 272                 pn_free(&resolvepn);
 273                 pn_free(&pn);
 274                 VN_RELE(vp);
 275                 goto out;
 276         }
 277
             /*
              * exec_file is zeroed first and at most MAXCOMLEN bytes are
              * copied into its MAXCOMLEN+1 bytes, so it stays NUL-terminated.
              */
 278         bzero(exec_file, MAXCOMLEN+1);
 279         (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
 280         bzero(&args, sizeof (args));
 281         args.pathname = resolvepn.pn_path;
 282         /* don't free resolvepn until we are done with args */
 283         pn_free(&pn);
 284
 285         /*
 286          * If we're running in a profile shell, then call pfexecd.
 287          */
 288         if ((CR_FLAGS(p->p_cred) & PRIV_PFEXEC) != 0) {
 289                 error = pfexec_call(p->p_cred, &resolvepn, &args.pfcred,
 290                     &args.scrubenv);
 291
 292                 /* Returning errno in case we're not allowed to execute. */
 293                 if (error > 0) {
 294                         if (dir != NULL)
 295                                 VN_RELE(dir);
 296                         pn_free(&resolvepn);
 297                         VN_RELE(vp);
 298                         goto out;
 299                 }
 300
 301                 /* Don't change the credentials when using old ptrace. */
 302                 if (args.pfcred != NULL &&
 303                     (p->p_proc_flag & P_PR_PTRACE) != 0) {
 304                         crfree(args.pfcred);
 305                         args.pfcred = NULL;
 306                         args.scrubenv = B_FALSE;
 307                 }
 308         }
 309
 310         /*
 311          * Specific exec handlers, or policies determined via
 312          * /etc/system may override the historical default.
 313          */
 314         args.stk_prot = PROT_ZFOD;
 315         args.dat_prot = PROT_ZFOD;
 316
 317         CPU_STATS_ADD_K(sys, sysexec, 1);
 318         DTRACE_PROC1(exec, char *, args.pathname);
 319
 320         ua.fname = fname;
 321         ua.argp = argp;
 322         ua.envp = envp;
 323
 324         /* If necessary, brand this process before we start the exec. */
 325         if (brandme)
 326                 brand_setbrand(p);
 327
             /* On failure, undo the branding done just above before bailing. */
 328         if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
 329             exec_file, p->p_cred, brand_action)) != 0) {
 330                 if (brandme)
 331                         brand_clearbrand(p, B_FALSE);
 332                 VN_RELE(vp);
 333                 if (dir != NULL)
 334                         VN_RELE(dir);
 335                 pn_free(&resolvepn);
 336                 goto fail;
 337         }
 338
 339         /*
 340          * Free floating point registers (sun4u only)
 341          */
 342         ASSERT(lwp != NULL);
 343         lwp_freeregs(lwp, 1);
 344
 345         /*
 346          * Free thread and process context ops.
 347          */
 348         if (curthread->t_ctx)
 349                 freectx(curthread, 1);
 350         if (p->p_pctx)
 351                 freepctx(p, 1);
 352
 353         /*
 354          * Remember file name for accounting; clear any cached DTrace predicate.
 355          */
 356         up->u_acflag &= ~AFORK;
 357         bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
 358         curthread->t_predcache = 0;
 359
 360         /*
 361          * Clear contract template state
 362          */
 363         lwp_ctmpl_clear(lwp);
 364
 365         /*
 366          * Save the directory in which we found the executable for expanding
 367          * the %d token used in core file patterns.
              * p_execdir takes its own hold here; the reference on dir obtained
              * from the lookup is released separately further down, and the
              * previous p_execdir is released once p_lock has been dropped.
 368          */
 369         mutex_enter(&p->p_lock);
 370         tmpvp = p->p_execdir;
 371         p->p_execdir = dir;
 372         if (p->p_execdir != NULL)
 373                 VN_HOLD(p->p_execdir);
 374         mutex_exit(&p->p_lock);
 375
 376         if (tmpvp != NULL)
 377                 VN_RELE(tmpvp);
 378
 379         /*
 380          * Reset stack state to the user stack, clear set of signals
 381          * caught on the signal stack, and reset list of signals that
 382          * restart system calls; the new program's environment should
 383          * not be affected by detritus from the old program.  Any
 384          * pending held signals remain held, so don't clear t_hold.
 385          */
 386         mutex_enter(&p->p_lock);
 387         lwp->lwp_oldcontext = 0;
 388         lwp->lwp_ustack = 0;
 389         lwp->lwp_old_stk_ctl = 0;
 390         sigemptyset(&up->u_signodefer);
 391         sigemptyset(&up->u_sigonstack);
 392         sigemptyset(&up->u_sigresethand);
 393         lwp->lwp_sigaltstack.ss_sp = 0;
 394         lwp->lwp_sigaltstack.ss_size = 0;
 395         lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
 396
 397         /*
 398          * Make saved resource limit == current resource limit.
 399          */
 400         for (i = 0; i < RLIM_NLIMITS; i++) {
 401                 /*CONSTCOND*/
 402                 if (RLIM_SAVED(i)) {
 403                         (void) rctl_rlimit_get(rctlproc_legacy[i], p,
 404                             &up->u_saved_rlimit[i]);
 405                 }
 406         }
 407
 408         /*
 409          * If the action was to catch the signal, then the action
 410          * must be reset to SIG_DFL.
 411          */
 412         sigdefault(p);
 413         p->p_flag &= ~(SNOWAIT|SJCTL);
 414         p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
 415         up->u_signal[SIGCLD - 1] = SIG_DFL;
 416
 417         /*
 418          * Delete the dot4 sigqueues/signotifies.
 419          */
 420         sigqfree(p);
 421
 422         mutex_exit(&p->p_lock);
 423
             /* Clear the profiling state, which is guarded by p_pflock. */
 424         mutex_enter(&p->p_pflock);
 425         p->p_prof.pr_base = NULL;
 426         p->p_prof.pr_size = 0;
 427         p->p_prof.pr_off = 0;
 428         p->p_prof.pr_scale = 0;
 429         p->p_prof.pr_samples = 0;
 430         mutex_exit(&p->p_pflock);
 431
 432         ASSERT(curthread->t_schedctl == NULL);
 433
 434
 435         /*
 436          * Close all close-on-exec files.
 437          */
 438         close_exec(P_FINFO(p));
 439         TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
 440
 441         /* Unbrand ourself if necessary. */
 442         if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE))
 443                 brand_clearbrand(p, B_FALSE);
 444
 445         setregs(&args);
 446
 447         /* Mark this as an executable vnode */
 448         mutex_enter(&vp->v_lock);
 449         vp->v_flag |= VVMEXEC;
 450         mutex_exit(&vp->v_lock);
 451
             /* Done with the lookup results: drop the holds and the path. */
 452         VN_RELE(vp);
 453         if (dir != NULL)
 454                 VN_RELE(dir);
 455         pn_free(&resolvepn);
 456
 457         /*
 458          * Allocate a new lwp directory and lwpid hash table if necessary.
              * The allocations use KM_SLEEP and are made before p_lock is
              * taken below.  The current thread's existing lwpent_t is reused
              * when the process already has an lwp directory.
 459          */
 460         if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
 461                 lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
 462                 lwpdir->ld_next = lwpdir + 1;
 463                 tidhash = kmem_zalloc(2 * sizeof (tidhash_t), KM_SLEEP);
 464                 if (p->p_lwpdir != NULL)
 465                         lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
 466                 else
 467                         lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
 468         }
 469
 470         if (PROC_IS_BRANDED(p))
 471                 BROP(p)->b_exec();
 472
 473         mutex_enter(&p->p_lock);
 474         prbarrier(p);
 475
 476         /*
 477          * Reset lwp id to the default value of 1.
 478          * This is a single-threaded process now
 479          * and lwp #1 is lwp_wait()able by default.
 480          * The t_unpark flag should not be inherited.
 481          */
 482         ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
 483         curthread->t_tid = 1;
 484         kpreempt_disable();
 485         ASSERT(curthread->t_lpl != NULL);
 486         p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
 487         kpreempt_enable();
 488         if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
 489                 lgrp_update_trthr_migrations(1);
 490         }
 491         curthread->t_unpark = 0;
 492         curthread->t_proc_flag |= TP_TWAIT;
 493         curthread->t_proc_flag &= ~TP_DAEMON;   /* daemons shouldn't exec */
 494         p->p_lwpdaemon = 0;                     /* but oh well ... */
 495         p->p_lwpid = 1;
 496
 497         /*
 498          * Install the newly-allocated lwp directory and lwpid hash table
 499          * and insert the current thread into the new hash table.
              * lwpdir is non-NULL only if the allocation block above ran, and
              * that block is the only place tidhash and lep are assigned.
 500          */
 501         if (lwpdir != NULL) {
 502                 old_lwpdir = p->p_lwpdir;
 503                 old_lwpdir_sz = p->p_lwpdir_sz;
 504                 old_tidhash = p->p_tidhash;
 505                 old_tidhash_sz = p->p_tidhash_sz;
 506                 p->p_lwpdir = p->p_lwpfree = lwpdir;
 507                 p->p_lwpdir_sz = 2;
 508                 lep->le_thread = curthread;
 509                 lep->le_lwpid = curthread->t_tid;
 510                 lep->le_start = curthread->t_start;
 511                 lwp_hash_in(p, lep, tidhash, 2, 0);
 512                 p->p_tidhash = tidhash;
 513                 p->p_tidhash_sz = 2;
 514         }
             /* Detach the retired tid hash list; it is freed after unlock. */
 515         ret_tidhash = p->p_ret_tidhash;
 516         p->p_ret_tidhash = NULL;
 517
 518         /*
 519          * Restore the saved signal mask and
 520          * inform /proc that the exec() has finished.
 521          */
 522         curthread->t_hold = savedmask;
 523         prexecend();
 524         mutex_exit(&p->p_lock);
             /* The replaced tables are freed only after p_lock is dropped. */
 525         if (old_lwpdir) {
 526                 kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
 527                 kmem_free(old_tidhash, old_tidhash_sz * sizeof (tidhash_t));
 528         }
 529         while (ret_tidhash != NULL) {
 530                 ret_tidhash_t *next = ret_tidhash->rth_next;
 531                 kmem_free(ret_tidhash->rth_tidhash,
 532                     ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
 533                 kmem_free(ret_tidhash, sizeof (*ret_tidhash));
 534                 ret_tidhash = next;
 535         }
 536
 537         ASSERT(error == 0);
 538         DTRACE_PROC(exec__success);
 539         return (0);
 540
     /*
      * fail is reached only from the gexec() failure path, i.e. after the
      * exec probe has fired; it fires the matching exec-failure probe and
      * falls through to the common error return.
      */
 541 fail:
 542         DTRACE_PROC1(exec__failure, int, error);
 543 out:            /* error return */
 544         mutex_enter(&p->p_lock);
 545         curthread->t_hold = savedmask;
 546         prexecend();
 547         mutex_exit(&p->p_lock);
 548         ASSERT(error != 0);
 549         return (error);
 550 }
 551
 552
 553 /*
 554  * Perform generic exec duties and switch out to the object-file
 555  * specific handler.
 556  */
 557 int
 558 gexec(
 559         struct vnode **vpp,
 560         struct execa *uap,
 561         struct uarg *args,
 562         struct intpdata *idatap,
 563         int level,
 564         long *execsz,
 565         caddr_t exec_file,
 566         struct cred *cred,
 567         int brand_action)
 568 {
 569         struct vnode *vp, *execvp = NULL;
 570         proc_t *pp = ttoproc(curthread);
 571         struct execsw *eswp;
 572         int error = 0;
 573         int suidflags = 0;
 574         ssize_t resid;
 575         uid_t uid, gid;
 576         struct vattr vattr;
 577         char magbuf[MAGIC_BYTES];
 578         int setid;
 579         cred_t *oldcred, *newcred = NULL;
 580         int privflags = 0;
 581         int setidfl;
 582         priv_set_t fset;
 583         secflagset_t old_secflags;
 584
 585         secflags_copy(&old_secflags, &pp->p_secflags.psf_effective);
 586
 587         /*
 588          * If the SNOCD or SUGID flag is set, turn it off and remember the
 589          * previous setting so we can restore it if we encounter an error.
 590          */
 591         if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
 592                 mutex_enter(&pp->p_lock);
 593                 suidflags = pp->p_flag & PSUIDFLAGS;
 594                 pp->p_flag &= ~PSUIDFLAGS;
 595                 mutex_exit(&pp->p_lock);
 596         }
 597
 598         if ((error = execpermissions(*vpp, &vattr, args)) != 0)
 599                 goto bad_noclose;
 600
 601         /* need to open vnode for stateful file systems */
 602         if ((error = fop_open(vpp, FREAD, CRED(), NULL)) != 0)
 603                 goto bad_noclose;
 604         vp = *vpp;
 605
 606         /*
 607          * Note: to support binary compatibility with SunOS a.out
 608          * executables, we read in the first four bytes, as the
 609          * magic number is in bytes 2-3.
 610          */
 611         if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
 612             0, UIO_SYSSPACE, 0, 0, CRED(), &resid))
 613                 goto bad;
 614         if (resid != 0)
 615                 goto bad;
 616
 617         if ((eswp = findexec_by_hdr(magbuf)) == NULL)
 618                 goto bad;
 619
 620         if (level == 0 &&
 621             (privflags = execsetid(vp, &vattr, &uid, &gid, &fset,
 622             args->pfcred == NULL ? cred : args->pfcred, args->pathname)) != 0) {
 623
 624                 /* Pfcred is a credential with a ref count of 1 */
 625
 626                 if (args->pfcred != NULL) {
 627                         privflags |= PRIV_INCREASE|PRIV_RESET;
 628                         newcred = cred = args->pfcred;
 629                 } else {
 630                         newcred = cred = crdup(cred);
 631                 }
 632
 633                 /* If we can, drop the PA bit */
 634                 if ((privflags & PRIV_RESET) != 0)
 635                         priv_adjust_PA(cred);
 636
 637                 if (privflags & PRIV_SETID) {
 638                         cred->cr_uid = uid;
 639                         cred->cr_gid = gid;
 640                         cred->cr_suid = uid;
 641                         cred->cr_sgid = gid;
 642                 }
 643
 644                 /*
 645                  * Implement the privilege updates:
 646                  *
 647                  * Restrict with L:
 648                  *
 649                  *      I' = I & L
 650                  *
 651                  *      E' = P' = (I' + F) & A
 652                  *
 653                  * But if running under ptrace, we cap I and F with P.
 654                  */
 655                 if ((privflags & (PRIV_RESET|PRIV_FORCED)) != 0) {
 656                         if ((privflags & PRIV_INCREASE) != 0 &&
 657                             (pp->p_proc_flag & P_PR_PTRACE) != 0) {
 658                                 priv_intersect(&CR_OPPRIV(cred),
 659                                     &CR_IPRIV(cred));
 660                                 priv_intersect(&CR_OPPRIV(cred), &fset);
 661                         }
 662                         priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
 663                         CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
 664                         if (privflags & PRIV_FORCED) {
 665                                 priv_set_PA(cred);
 666                                 priv_union(&fset, &CR_EPRIV(cred));
 667                                 priv_union(&fset, &CR_PPRIV(cred));
 668                         }
 669                         priv_adjust_PA(cred);
 670                 }
 671         } else if (level == 0 && args->pfcred != NULL) {
 672                 newcred = cred = args->pfcred;
 673                 privflags |= PRIV_INCREASE;
 674                 /* pfcred is not forced to adhere to these settings */
 675                 priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
 676                 CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
 677                 priv_adjust_PA(cred);
 678         }
 679
 680         /* The new image gets the inheritable secflags as its secflags */
 681         secflags_promote(pp);
 682
 683         /* SunOS 4.x buy-back */
 684         if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
 685             (vattr.va_mode & (VSUID|VSGID))) {
 686                 char path[MAXNAMELEN];
 687                 refstr_t *mntpt = NULL;
 688                 int ret = -1;
 689
 690                 bzero(path, sizeof (path));
 691                 zone_hold(pp->p_zone);
 692
 693                 ret = vnodetopath(pp->p_zone->zone_rootvp, vp, path,
 694                     sizeof (path), cred);
 695
 696                 /* fallback to mountpoint if a path can't be found */
 697                 if ((ret != 0) || (ret == 0 && path[0] == '\0'))
 698                         mntpt = vfs_getmntpoint(vp->v_vfsp);
 699
 700                 if (mntpt == NULL)
 701                         zcmn_err(pp->p_zone->zone_id, CE_NOTE,
 702                             "!uid %d: setuid execution not allowed, "
 703                             "file=%s", cred->cr_uid, path);
 704                 else
 705                         zcmn_err(pp->p_zone->zone_id, CE_NOTE,
 706                             "!uid %d: setuid execution not allowed, "
 707                             "fs=%s, file=%s", cred->cr_uid,
 708                             ZONE_PATH_TRANSLATE(refstr_value(mntpt),
 709                             pp->p_zone), exec_file);
 710
 711                 if (!INGLOBALZONE(pp)) {
 712                         /* zone_rootpath always has trailing / */
 713                         if (mntpt == NULL)
 714                                 cmn_err(CE_NOTE, "!zone: %s, uid: %d "
 715                                     "setuid execution not allowed, file=%s%s",
 716                                     pp->p_zone->zone_name, cred->cr_uid,
 717                                     pp->p_zone->zone_rootpath, path + 1);
 718                         else
 719                                 cmn_err(CE_NOTE, "!zone: %s, uid: %d "
 720                                     "setuid execution not allowed, fs=%s, "
 721                                     "file=%s", pp->p_zone->zone_name,
 722                                     cred->cr_uid, refstr_value(mntpt),
 723                                     exec_file);
 724                 }
 725
 726                 if (mntpt != NULL)
 727                         refstr_rele(mntpt);
 728
 729                 zone_rele(pp->p_zone);
 730         }
 731
 732         /*
 733          * execsetid() told us whether or not we had to change the
 734          * credentials of the process.  In privflags, it told us
 735          * whether we gained any privileges or executed a set-uid executable.
 736          */
 737         setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE|PRIV_FORCED));
 738
 739         /*
 740          * Use /etc/system variable to determine if the stack
 741          * should be marked as executable by default.
 742          */
 743         if ((noexec_user_stack != 0) ||
 744             secflag_enabled(pp, PROC_SEC_NOEXECSTACK))
 745                 args->stk_prot &= ~PROT_EXEC;
 746
 747         args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
 748         args->ex_vp = vp;
 749
 750         /*
 751          * Traditionally, the setid flags told the sub processes whether
 752          * the file just executed was set-uid or set-gid; this caused
 753          * some confusion as the 'setid' flag did not match the SUGID
 754          * process flag which is only set when the uids/gids do not match.
 755          * A script set-gid/set-uid to the real uid/gid would start with
 756          * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
 757          * Now we flag those cases where the calling process cannot
 758          * be trusted to influence the newly exec'ed process, either
 759          * because it runs with more privileges or when the uids/gids
 760          * do in fact not match.
 761          * This also makes the runtime linker agree with the on exec
 762          * values of SNOCD and SUGID.
 763          */
 764         setidfl = 0;
 765         if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
 766             !supgroupmember(cred->cr_gid, cred))) {
 767                 setidfl |= EXECSETID_UGIDS;
 768         }
 769         if (setid & PRIV_SETUGID)
 770                 setidfl |= EXECSETID_SETID;
 771         if (setid & PRIV_FORCED)
 772                 setidfl |= EXECSETID_PRIVS;
 773
 774         execvp = pp->p_exec;
 775         if (execvp)
 776                 VN_HOLD(execvp);
 777
 778         error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
 779             setidfl, exec_file, cred, brand_action);
 780         rw_exit(eswp->exec_lock);
 781         if (error != 0) {
 782                 if (execvp)
 783                         VN_RELE(execvp);
 784                 /*
 785                  * If this process's p_exec has been set to the vp of
 786                  * the executable by exec_func, we will return without
 787                  * calling fop_close because proc_exit will close it
 788                  * on exit.
 789                  */
 790                 if (pp->p_exec == vp)
 791                         goto bad_noclose;
 792                 else
 793                         goto bad;
 794         }
 795
 796         if (level == 0) {
 797                 uid_t oruid;
 798
 799                 if (execvp != NULL) {
 800                         /*
 801                          * Close the previous executable only if we are
 802                          * at level 0.
 803                          */
 804                         (void) fop_close(execvp, FREAD, 1, 0,
 805                             cred, NULL);
 806                 }
 807
 808                 mutex_enter(&pp->p_crlock);
 809
 810                 oruid = pp->p_cred->cr_ruid;
 811
 812                 if (newcred != NULL) {
 813                         /*
 814                          * Free the old credentials, and set the new ones.
 815                          * Do this for both the process and the (single) thread.
 816                          */
 817                         crfree(pp->p_cred);
 818                         pp->p_cred = cred;      /* cred already held for proc */
 819                         crhold(cred);           /* hold new cred for thread */
 820                         /*
 821                          * DTrace accesses t_cred in probe context.  t_cred
 822                          * must always be either NULL, or point to a valid,
 823                          * allocated cred structure.
 824                          */
 825                         oldcred = curthread->t_cred;
 826                         curthread->t_cred = cred;
 827                         crfree(oldcred);
 828
 829                         if (priv_basic_test >= 0 &&
 830                             !PRIV_ISMEMBER(&CR_IPRIV(newcred),
 831                             priv_basic_test)) {
 832                                 pid_t pid = pp->p_pid;
 833                                 char *fn = PTOU(pp)->u_comm;
 834
 835                                 cmn_err(CE_WARN, "%s[%d]: exec: basic_test "
 836                                     "privilege removed from E/I", fn, pid);
 837                         }
 838                 }
 839                 /*
 840                  * On emerging from a successful exec(), the saved
 841                  * uid and gid equal the effective uid and gid.
 842                  */
 843                 cred->cr_suid = cred->cr_uid;
 844                 cred->cr_sgid = cred->cr_gid;
 845
 846                 /*
 847                  * If the real and effective ids do not match, this
 848                  * is a setuid process that should not dump core.
 849                  * The group comparison is tricky; we prevent the code
 850                  * from flagging SNOCD when executing with an effective gid
 851                  * which is a supplementary group.
 852                  */
 853                 if (cred->cr_ruid != cred->cr_uid ||
 854                     (cred->cr_rgid != cred->cr_gid &&
 855                     !supgroupmember(cred->cr_gid, cred)) ||
 856                     (privflags & PRIV_INCREASE) != 0)
 857                         suidflags = PSUIDFLAGS;
 858                 else
 859                         suidflags = 0;
 860
 861                 mutex_exit(&pp->p_crlock);
 862                 if (newcred != NULL && oruid != newcred->cr_ruid) {
 863                         /* Note that the process remains in the same zone. */
 864                         mutex_enter(&pidlock);
 865                         upcount_dec(oruid, crgetzoneid(newcred));
 866                         upcount_inc(newcred->cr_ruid, crgetzoneid(newcred));
 867                         mutex_exit(&pidlock);
 868                 }
 869                 if (suidflags) {
 870                         mutex_enter(&pp->p_lock);
 871                         pp->p_flag |= suidflags;
 872                         mutex_exit(&pp->p_lock);
 873                 }
 874                 if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
 875                         /*
 876                          * If process is traced via /proc, arrange to
 877                          * invalidate the associated /proc vnode.
 878                          */
 879                         if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
 880                                 args->traceinval = 1;
 881                 }
 882                 if (pp->p_proc_flag & P_PR_PTRACE)
 883                         psignal(pp, SIGTRAP);
 884                 if (args->traceinval)
 885                         prinvalidate(&pp->p_user);
 886         }
 887         if (execvp)
 888                 VN_RELE(execvp);
 889         return (0);
 890
 891 bad:
 892         (void) fop_close(vp, FREAD, 1, 0, cred, NULL);
 893
 894 bad_noclose:
 895         if (newcred != NULL)
 896                 crfree(newcred);
 897         if (error == 0)
 898                 error = ENOEXEC;
 899
 900         mutex_enter(&pp->p_lock);
 901         if (suidflags) {
 902                 pp->p_flag |= suidflags;
 903         }
 904         /*
 905          * Restore the effective secflags, to maintain the invariant they
 906          * never change for a given process
 907          */
 908         secflags_copy(&pp->p_secflags.psf_effective, &old_secflags);
 909         mutex_exit(&pp->p_lock);
 910
 911         return (error);
 912 }
 913
 914 extern char *execswnames[];
 915
 916 struct execsw *
 917 allocate_execsw(char *name, char *magic, size_t magic_size)
 918 {
 919         int i, j;
 920         char *ename;
 921         char *magicp;
 922
 923         mutex_enter(&execsw_lock);
 924         for (i = 0; i < nexectype; i++) {
 925                 if (execswnames[i] == NULL) {
 926                         ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
 927                         (void) strcpy(ename, name);
 928                         execswnames[i] = ename;
 929                         /*
 930                          * Set the magic number last so that we
 931                          * don't need to hold the execsw_lock in
 932                          * findexectype().
 933                          */
 934                         magicp = kmem_alloc(magic_size, KM_SLEEP);
 935                         for (j = 0; j < magic_size; j++)
 936                                 magicp[j] = magic[j];
 937                         execsw[i].exec_magic = magicp;
 938                         mutex_exit(&execsw_lock);
 939                         return (&execsw[i]);
 940                 }
 941         }
 942         mutex_exit(&execsw_lock);
 943         return (NULL);
 944 }
 945
 946 /*
 947  * Find the exec switch table entry with the corresponding magic string.
 948  */
 949 struct execsw *
 950 findexecsw(char *magic)
 951 {
 952         struct execsw *eswp;
 953
 954         for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
 955                 ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
 956                 if (magic && eswp->exec_maglen != 0 &&
 957                     bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
 958                         return (eswp);
 959         }
 960         return (NULL);
 961 }
 962
 963 /*
 964  * Find the execsw[] index for the given exec header string by looking for the
 965  * magic string at a specified offset and length for each kind of executable
 966  * file format until one matches.  If no execsw[] entry is found, try to
 967  * autoload a module for this magic string.
 968  */
 969 struct execsw *
 970 findexec_by_hdr(char *header)
 971 {
 972         struct execsw *eswp;
 973
 974         for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
 975                 ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
 976                 if (header && eswp->exec_maglen != 0 &&
 977                     bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
 978                     eswp->exec_maglen) == 0) {
 979                         if (hold_execsw(eswp) != 0)
 980                                 return (NULL);
 981                         return (eswp);
 982                 }
 983         }
 984         return (NULL);  /* couldn't find the type */
 985 }
 986
 987 /*
 988  * Find the execsw[] index for the given magic string.  If no execsw[] entry
 989  * is found, try to autoload a module for this magic string.
 990  */
 991 struct execsw *
 992 findexec_by_magic(char *magic)
 993 {
 994         struct execsw *eswp;
 995
 996         for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
 997                 ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
 998                 if (magic && eswp->exec_maglen != 0 &&
 999                     bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
1000                         if (hold_execsw(eswp) != 0)
1001                                 return (NULL);
1002                         return (eswp);
1003                 }
1004         }
1005         return (NULL);  /* couldn't find the type */
1006 }
1007
1008 static int
1009 hold_execsw(struct execsw *eswp)
1010 {
1011         char *name;
1012
1013         rw_enter(eswp->exec_lock, RW_READER);
1014         while (!LOADED_EXEC(eswp)) {
1015                 rw_exit(eswp->exec_lock);
1016                 name = execswnames[eswp-execsw];
1017                 ASSERT(name);
1018                 if (modload("exec", name) == -1)
1019                         return (-1);
1020                 rw_enter(eswp->exec_lock, RW_READER);
1021         }
1022         return (0);
1023 }
1024
1025 static int
1026 execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp,
1027     priv_set_t *fset, cred_t *cr, const char *pathname)
1028 {
1029         proc_t *pp = ttoproc(curthread);
1030         uid_t uid, gid;
1031         int privflags = 0;
1032
1033         /*
1034          * Remember credentials.
1035          */
1036         uid = cr->cr_uid;
1037         gid = cr->cr_gid;
1038
1039         /* Will try to reset the PRIV_AWARE bit later. */
1040         if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
1041                 privflags |= PRIV_RESET;
1042
1043         if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
1044                 /*
1045                  * If it's a set-uid root program we perform the
1046                  * forced privilege look-aside. This has three possible
1047                  * outcomes:
1048                  *      no look aside information -> treat as before
1049                  *      look aside in Limit set -> apply forced privs
1050                  *      look aside not in Limit set -> ignore set-uid root
1051                  *
1052                  * Ordinary set-uid root execution only allowed if the limit
1053                  * set holds all unsafe privileges.
1054                  */
1055                 if (vattrp->va_mode & VSUID) {
1056                         if (vattrp->va_uid == 0) {
1057                                 int res = get_forced_privs(cr, pathname, fset);
1058
1059                                 switch (res) {
1060                                 case -1:
1061                                         if (priv_issubset(&priv_unsafe,
1062                                             &CR_LPRIV(cr))) {
1063                                                 uid = vattrp->va_uid;
1064                                                 privflags |= PRIV_SETUGID;
1065                                         }
1066                                         break;
1067                                 case 0:
1068                                         privflags |= PRIV_FORCED|PRIV_INCREASE;
1069                                         break;
1070                                 default:
1071                                         break;
1072                                 }
1073                         } else {
1074                                 uid = vattrp->va_uid;
1075                                 privflags |= PRIV_SETUGID;
1076                         }
1077                 }
1078                 if (vattrp->va_mode & VSGID) {
1079                         gid = vattrp->va_gid;
1080                         privflags |= PRIV_SETUGID;
1081                 }
1082         }
1083
1084         /*
1085          * Do we need to change our credential anyway?
1086          * This is the case when E != I or P != I, as
1087          * we need to do the assignments (with F empty and A full)
1088          * Or when I is not a subset of L; in that case we need to
1089          * enforce L.
1090          *
1091          *              I' = L & I
1092          *
1093          *              E' = P' = (I' + F) & A
1094          * or
1095          *              E' = P' = I'
1096          */
1097         if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
1098             !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
1099             !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
1100                 privflags |= PRIV_RESET;
1101
1102         /* Child has more privileges than parent */
1103         if (!priv_issubset(&CR_IPRIV(cr), &CR_PPRIV(cr)))
1104                 privflags |= PRIV_INCREASE;
1105
1106         /*
1107          * Set setuid/setgid protections if no ptrace() compatibility.
1108          * For privileged processes, honor setuid/setgid even in
1109          * the presence of ptrace() compatibility.
1110          */
1111         if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
1112             PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
1113             (cr->cr_uid != uid ||
1114             cr->cr_gid != gid ||
1115             cr->cr_suid != uid ||
1116             cr->cr_sgid != gid)) {
1117                 *uidp = uid;
1118                 *gidp = gid;
1119                 privflags |= PRIV_SETID;
1120         }
1121         return (privflags);
1122 }
1123
1124 int
1125 execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
1126 {
1127         int error;
1128         proc_t *p = ttoproc(curthread);
1129
1130         vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
1131         if (error = fop_getattr(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
1132                 return (error);
1133         /*
1134          * Check the access mode.
1135          * If VPROC, ask /proc if the file is an object file.
1136          */
1137         if ((error = fop_access(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
1138             !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
1139             (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
1140             (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
1141                 if (error == 0)
1142                         error = EACCES;
1143                 return (error);
1144         }
1145
1146         if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
1147             (error = fop_access(vp, VREAD, 0, p->p_cred, NULL))) {
1148                 /*
1149                  * If process is under ptrace(2) compatibility,
1150                  * fail the exec(2).
1151                  */
1152                 if (p->p_proc_flag & P_PR_PTRACE)
1153                         goto bad;
1154                 /*
1155                  * Process is traced via /proc.
1156                  * Arrange to invalidate the /proc vnode.
1157                  */
1158                 args->traceinval = 1;
1159         }
1160         return (0);
1161 bad:
1162         if (error == 0)
1163                 error = ENOEXEC;
1164         return (error);
1165 }
1166
1167 /*
1168  * Map a section of an executable file into the user's
1169  * address space.
1170  */
1171 int
1172 execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
1173     off_t offset, int prot, int page, uint_t szc)
1174 {
1175         int error = 0;
1176         off_t oldoffset;
1177         caddr_t zfodbase, oldaddr;
1178         size_t end, oldlen;
1179         size_t zfoddiff;
1180         label_t ljb;
1181         proc_t *p = ttoproc(curthread);
1182
1183         oldaddr = addr;
1184         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1185         if (len) {
1186                 oldlen = len;
1187                 len += ((size_t)oldaddr - (size_t)addr);
1188                 oldoffset = offset;
1189                 offset = (off_t)((uintptr_t)offset & PAGEMASK);
1190                 if (page) {
1191                         spgcnt_t  prefltmem, availm, npages;
1192                         int preread;
1193                         uint_t mflag = MAP_PRIVATE | MAP_FIXED;
1194
1195                         if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
1196                                 mflag |= MAP_TEXT;
1197                         } else {
1198                                 mflag |= MAP_INITDATA;
1199                         }
1200
1201                         if (valid_usr_range(addr, len, prot, p->p_as,
1202                             p->p_as->a_userlimit) != RANGE_OKAY) {
1203                                 error = ENOMEM;
1204                                 goto bad;
1205                         }
1206                         if (error = fop_map(vp, (offset_t)offset,
1207                             p->p_as, &addr, len, prot, PROT_ALL,
1208                             mflag, CRED(), NULL))
1209                                 goto bad;
1210
1211                         /*
1212                          * If the segment can fit, then we prefault
1213                          * the entire segment in.  This is based on the
1214                          * model that says the best working set of a
1215                          * small program is all of its pages.
1216                          */
1217                         npages = (spgcnt_t)btopr(len);
1218                         prefltmem = freemem - desfree;
1219                         preread =
1220                             (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
1221
1222                         /*
1223                          * If we aren't prefaulting the segment,
1224                          * increment "deficit", if necessary to ensure
1225                          * that pages will become available when this
1226                          * process starts executing.
1227                          */
1228                         availm = freemem - lotsfree;
1229                         if (preread == 0 && npages > availm &&
1230                             deficit < lotsfree) {
1231                                 deficit += MIN((pgcnt_t)(npages - availm),
1232                                     lotsfree - deficit);
1233                         }
1234
1235                         if (preread) {
1236                                 TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
1237                                     "execmap preread:freemem %d size %lu",
1238                                     freemem, len);
1239                                 (void) as_fault(p->p_as->a_hat, p->p_as,
1240                                     (caddr_t)addr, len, F_INVAL, S_READ);
1241                         }
1242                 } else {
1243                         if (valid_usr_range(addr, len, prot, p->p_as,
1244                             p->p_as->a_userlimit) != RANGE_OKAY) {
1245                                 error = ENOMEM;
1246                                 goto bad;
1247                         }
1248
1249                         if (error = as_map(p->p_as, addr, len,
1250                             segvn_create, zfod_argsp))
1251                                 goto bad;
1252                         /*
1253                          * Read in the segment in one big chunk.
1254                          */
1255                         if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1256                             oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1257                             0, CRED(), NULL))
1258                                 goto bad;
1259                         /*
1260                          * Now set protections.
1261                          */
1262                         if (prot != PROT_ZFOD) {
1263                                 (void) as_setprot(p->p_as, (caddr_t)addr,
1264                                     len, prot);
1265                         }
1266                 }
1267         }
1268
1269         if (zfodlen) {
1270                 struct as *as = curproc->p_as;
1271                 struct seg *seg;
1272                 uint_t zprot = 0;
1273
1274                 end = (size_t)addr + len;
1275                 zfodbase = (caddr_t)roundup(end, PAGESIZE);
1276                 zfoddiff = (uintptr_t)zfodbase - end;
1277                 if (zfoddiff) {
1278                         /*
1279                          * Before we go to zero the remaining space on the last
1280                          * page, make sure we have write permission.
1281                          *
1282                          * Normal illumos binaries don't even hit the case
1283                          * where we have to change permission on the last page
1284                          * since their protection is typically either
1285                          *    PROT_USER | PROT_WRITE | PROT_READ
1286                          * or
1287                          *    PROT_ZFOD (same as PROT_ALL).
1288                          *
1289                          * We need to be careful how we zero-fill the last page
1290                          * if the segment protection does not include
1291                          * PROT_WRITE. Using as_setprot() can cause the VM
1292                          * segment code to call segvn_vpage(), which must
1293                          * allocate a page struct for each page in the segment.
1294                          * If we have a very large segment, this may fail, so
1295                          * we have to check for that, even though we ignore
1296                          * other return values from as_setprot.
1297                          */
1298
1299                         AS_LOCK_ENTER(as, RW_READER);
1300                         seg = as_segat(curproc->p_as, (caddr_t)end);
1301                         if (seg != NULL)
1302                                 (void) segop_getprot(seg, (caddr_t)end,
1303                                     zfoddiff - 1, &zprot);
1304                         AS_LOCK_EXIT(as);
1305
1306                         if (seg != NULL && (zprot & PROT_WRITE) == 0) {
1307                                 if (as_setprot(as, (caddr_t)end, zfoddiff - 1,
1308                                     zprot | PROT_WRITE) == ENOMEM) {
1309                                         error = ENOMEM;
1310                                         goto bad;
1311                                 }
1312                         }
1313
1314                         if (on_fault(&ljb)) {
1315                                 no_fault();
1316                                 if (seg != NULL && (zprot & PROT_WRITE) == 0)
1317                                         (void) as_setprot(as, (caddr_t)end,
1318                                             zfoddiff - 1, zprot);
1319                                 error = EFAULT;
1320                                 goto bad;
1321                         }
1322                         uzero((void *)end, zfoddiff);
1323                         no_fault();
1324                         if (seg != NULL && (zprot & PROT_WRITE) == 0)
1325                                 (void) as_setprot(as, (caddr_t)end,
1326                                     zfoddiff - 1, zprot);
1327                 }
1328                 if (zfodlen > zfoddiff) {
1329                         struct segvn_crargs crargs =
1330                             SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1331
1332                         zfodlen -= zfoddiff;
1333                         if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1334                             p->p_as->a_userlimit) != RANGE_OKAY) {
1335                                 error = ENOMEM;
1336                                 goto bad;
1337                         }
1338                         if (szc > 0) {
1339                                 /*
1340                                  * ASSERT alignment because the mapelfexec()
1341                                  * caller for the szc > 0 case extended zfod
1342                                  * so it's end is pgsz aligned.
1343                                  */
1344                                 size_t pgsz = page_get_pagesize(szc);
1345                                 ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));
1346
1347                                 if (IS_P2ALIGNED(zfodbase, pgsz)) {
1348                                         crargs.szc = szc;
1349                                 } else {
1350                                         crargs.szc = AS_MAP_HEAP;
1351                                 }
1352                         } else {
1353                                 crargs.szc = AS_MAP_NO_LPOOB;
1354                         }
1355                         if (error = as_map(p->p_as, (caddr_t)zfodbase,
1356                             zfodlen, segvn_create, &crargs))
1357                                 goto bad;
1358                         if (prot != PROT_ZFOD) {
1359                                 (void) as_setprot(p->p_as, (caddr_t)zfodbase,
1360                                     zfodlen, prot);
1361                         }
1362                 }
1363         }
1364         return (0);
1365 bad:
1366         return (error);
1367 }
1368
1369 void
1370 setexecenv(struct execenv *ep)
1371 {
1372         proc_t *p = ttoproc(curthread);
1373         klwp_t *lwp = ttolwp(curthread);
1374         struct vnode *vp;
1375
1376         p->p_bssbase = ep->ex_bssbase;
1377         p->p_brkbase = ep->ex_brkbase;
1378         p->p_brksize = ep->ex_brksize;
1379         if (p->p_exec)
1380                 VN_RELE(p->p_exec);     /* out with the old */
1381         vp = p->p_exec = ep->ex_vp;
1382         if (vp != NULL)
1383                 VN_HOLD(vp);            /* in with the new */
1384
1385         lwp->lwp_sigaltstack.ss_sp = 0;
1386         lwp->lwp_sigaltstack.ss_size = 0;
1387         lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1388 }
1389
1390 int
1391 execopen(struct vnode **vpp, int *fdp)
1392 {
1393         struct vnode *vp = *vpp;
1394         file_t *fp;
1395         int error = 0;
1396         int filemode = FREAD;
1397
1398         VN_HOLD(vp);            /* open reference */
1399         if (error = falloc(NULL, filemode, &fp, fdp)) {
1400                 VN_RELE(vp);
1401                 *fdp = -1;      /* just in case falloc changed value */
1402                 return (error);
1403         }
1404         if (error = fop_open(&vp, filemode, CRED(), NULL)) {
1405                 VN_RELE(vp);
1406                 setf(*fdp, NULL);
1407                 unfalloc(fp);
1408                 *fdp = -1;
1409                 return (error);
1410         }
1411         *vpp = vp;              /* vnode should not have changed */
1412         fp->f_vnode = vp;
1413         mutex_exit(&fp->f_tlock);
1414         setf(*fdp, fp);
1415         return (0);
1416 }
1417
1418 int
1419 execclose(int fd)
1420 {
1421         return (closeandsetf(fd, NULL));
1422 }
1423
1424
1425 /*
1426  * noexec stub function.
1427  */
1428 /*ARGSUSED*/
1429 int
1430 noexec(
1431     struct vnode *vp,
1432     struct execa *uap,
1433     struct uarg *args,
1434     struct intpdata *idatap,
1435     int level,
1436     long *execsz,
1437     int setid,
1438     caddr_t exec_file,
1439     struct cred *cred)
1440 {
1441         cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1442         return (ENOEXEC);
1443 }
1444
1445 /*
1446  * Support routines for building a user stack.
1447  *
1448  * execve(path, argv, envp) must construct a new stack with the specified
1449  * arguments and environment variables (see exec_args() for a description
1450  * of the user stack layout).  To do this, we copy the arguments and
1451  * environment variables from the old user address space into the kernel,
1452  * free the old as, create the new as, and copy our buffered information
1453  * to the new stack.  Our kernel buffer has the following structure:
1454  *
1455  *      +-----------------------+ <--- stk_base + stk_size
1456  *      | string offsets        |
1457  *      +-----------------------+ <--- stk_offp
1458  *      |                       |
1459  *      | STK_AVAIL() space     |
1460  *      |                       |
1461  *      +-----------------------+ <--- stk_strp
1462  *      | strings               |
1463  *      +-----------------------+ <--- stk_base
1464  *
1465  * When we add a string, we store the string's contents (including the null
1466  * terminator) at stk_strp, and we store the offset of the string relative to
1467  * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1468  * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1469  * the difference between these pointers.  If we run out of space, we return
1470  * an error and exec_args() starts all over again with a buffer twice as large.
1471  * When we're all done, the kernel buffer looks like this:
1472  *
1473  *      +-----------------------+ <--- stk_base + stk_size
1474  *      | argv[0] offset        |
1475  *      +-----------------------+
1476  *      | ...                   |
1477  *      +-----------------------+
1478  *      | argv[argc-1] offset   |
1479  *      +-----------------------+
1480  *      | envp[0] offset        |
1481  *      +-----------------------+
1482  *      | ...                   |
1483  *      +-----------------------+
1484  *      | envp[envc-1] offset   |
1485  *      +-----------------------+
1486  *      | AT_SUN_PLATFORM offset|
1487  *      +-----------------------+
1488  *      | AT_SUN_EXECNAME offset|
1489  *      +-----------------------+ <--- stk_offp
1490  *      |                       |
1491  *      | STK_AVAIL() space     |
1492  *      |                       |
1493  *      +-----------------------+ <--- stk_strp
1494  *      | AT_SUN_EXECNAME string|
1495  *      +-----------------------+
1496  *      | AT_SUN_PLATFORM string|
1497  *      +-----------------------+
1498  *      | envp[envc-1] string   |
1499  *      +-----------------------+
1500  *      | ...                   |
1501  *      +-----------------------+
1502  *      | envp[0] string        |
1503  *      +-----------------------+
1504  *      | argv[argc-1] string   |
1505  *      +-----------------------+
1506  *      | ...                   |
1507  *      +-----------------------+
1508  *      | argv[0] string        |
1509  *      +-----------------------+ <--- stk_base
1510  */
1511
1512 #define STK_AVAIL(args)         ((char *)(args)->stk_offp - (args)->stk_strp)
1513
1514 /*
1515  * Add a string to the stack.
1516  */
1517 static int
1518 stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1519 {
1520         int error;
1521         size_t len;
1522
1523         if (STK_AVAIL(args) < sizeof (int))
1524                 return (E2BIG);
1525         *--args->stk_offp = args->stk_strp - args->stk_base;
1526
1527         if (segflg == UIO_USERSPACE) {
1528                 error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1529                 if (error != 0)
1530                         return (error);
1531         } else {
1532                 len = strlen(sp) + 1;
1533                 if (len > STK_AVAIL(args))
1534                         return (E2BIG);
1535                 bcopy(sp, args->stk_strp, len);
1536         }
1537
1538         args->stk_strp += len;
1539
1540         return (0);
1541 }
1542
1543 static int
1544 stk_getptr(uarg_t *args, char *src, char **dst)
1545 {
1546         int error;
1547
1548         if (args->from_model == DATAMODEL_NATIVE) {
1549                 ulong_t ptr;
1550                 error = fulword(src, &ptr);
1551                 *dst = (caddr_t)ptr;
1552         } else {
1553                 uint32_t ptr;
1554                 error = fuword32(src, &ptr);
1555                 *dst = (caddr_t)(uintptr_t)ptr;
1556         }
1557         return (error);
1558 }
1559
1560 static int
1561 stk_putptr(uarg_t *args, char *addr, char *value)
1562 {
1563         if (args->to_model == DATAMODEL_NATIVE)
1564                 return (sulword(addr, (ulong_t)value));
1565         else
1566                 return (suword32(addr, (uint32_t)(uintptr_t)value));
1567 }
1568
1569 static int
1570 stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1571 {
1572         char *sp;
1573         int argc, error;
1574         int argv_empty = 0;
1575         size_t ptrsize = args->from_ptrsize;
1576         size_t size, pad;
1577         char *argv = (char *)uap->argp;
1578         char *envp = (char *)uap->envp;
1579
1580         /*
1581          * Copy interpreter's name and argument to argv[0] and argv[1].
1582          * In the rare case that we have nested interpreters then those names
1583          * and arguments are also copied to the subsequent slots in argv.
1584          */
1585         if (intp != NULL && intp->intp_name[0] != NULL) {
1586                 int i;
1587
1588                 for (i = 0; i < INTP_MAXDEPTH; i++) {
1589                         if (intp->intp_name[i] == NULL)
1590                                 break;
1591                         error = stk_add(args, intp->intp_name[i], UIO_SYSSPACE);
1592                         if (error != 0)
1593                                 return (error);
1594                         if (intp->intp_arg[i] != NULL) {
1595                                 error = stk_add(args, intp->intp_arg[i],
1596                                     UIO_SYSSPACE);
1597                                 if (error != 0)
1598                                         return (error);
1599                         }
1600                 }
1601
1602                 if (args->fname != NULL)
1603                         error = stk_add(args, args->fname, UIO_SYSSPACE);
1604                 else
1605                         error = stk_add(args, uap->fname, UIO_USERSPACE);
1606                 if (error)
1607                         return (error);
1608
1609                 /*
1610                  * Check for an empty argv[].
1611                  */
1612                 if (stk_getptr(args, argv, &sp))
1613                         return (EFAULT);
1614                 if (sp == NULL)
1615                         argv_empty = 1;
1616
1617                 argv += ptrsize;                /* ignore original argv[0] */
1618         }
1619
1620         if (argv_empty == 0) {
1621                 /*
1622                  * Add argv[] strings to the stack.
1623                  */
1624                 for (;;) {
1625                         if (stk_getptr(args, argv, &sp))
1626                                 return (EFAULT);
1627                         if (sp == NULL)
1628                                 break;
1629                         if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1630                                 return (error);
1631                         argv += ptrsize;
1632                 }
1633         }
1634         argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1635         args->arglen = args->stk_strp - args->stk_base;
1636
1637         /*
1638          * Add environ[] strings to the stack.
1639          */
1640         if (envp != NULL) {
1641                 for (;;) {
1642                         char *tmp = args->stk_strp;
1643                         if (stk_getptr(args, envp, &sp))
1644                                 return (EFAULT);
1645                         if (sp == NULL)
1646                                 break;
1647                         if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1648                                 return (error);
1649                         if (args->scrubenv && strncmp(tmp, "LD_", 3) == 0) {
1650                                 /* Undo the copied string */
1651                                 args->stk_strp = tmp;
1652                                 *(args->stk_offp++) = 0;
1653                         }
1654                         envp += ptrsize;
1655                 }
1656         }
1657         args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1658         args->ne = args->na - argc;
1659
1660         /*
1661          * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and
1662          * AT_SUN_EMULATOR strings to the stack.
1663          */
1664         if (auxvpp != NULL && *auxvpp != NULL) {
1665                 if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1666                         return (error);
1667                 if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1668                         return (error);
1669                 if (args->brandname != NULL &&
1670                     (error = stk_add(args, args->brandname, UIO_SYSSPACE)) != 0)
1671                         return (error);
1672                 if (args->emulator != NULL &&
1673                     (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
1674                         return (error);
1675         }
1676
1677         /*
1678          * Compute the size of the stack.  This includes all the pointers,
1679          * the space reserved for the aux vector, and all the strings.
1680          * The total number of pointers is args->na (which is argc + envc)
1681          * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1682          * after the last argument (i.e. argv[argc]); (3) the NULL after the
1683          * last environment variable (i.e. envp[envc]); and (4) the NULL after
1684          * all the strings, at the very top of the stack.
1685          */
1686         size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1687             (args->stk_strp - args->stk_base);
1688
1689         /*
1690          * Pad the string section with zeroes to align the stack size.
1691          */
1692         pad = P2NPHASE(size, args->stk_align);
1693
1694         if (STK_AVAIL(args) < pad)
1695                 return (E2BIG);
1696
1697         args->usrstack_size = size + pad;
1698
1699         while (pad-- != 0)
1700                 *args->stk_strp++ = 0;
1701
1702         args->nc = args->stk_strp - args->stk_base;
1703
1704         return (0);
1705 }
1706
1707 static int
1708 stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1709 {
1710         size_t ptrsize = args->to_ptrsize;
1711         ssize_t pslen;
1712         char *kstrp = args->stk_base;
1713         char *ustrp = usrstack - args->nc - ptrsize;
1714         char *usp = usrstack - args->usrstack_size;
1715         int *offp = (int *)(args->stk_base + args->stk_size);
1716         int envc = args->ne;
1717         int argc = args->na - envc;
1718         int i;
1719
1720         /*
1721          * Record argc for /proc.
1722          */
1723         up->u_argc = argc;
1724
1725         /*
1726          * Put argc on the stack.  Note that even though it's an int,
1727          * it always consumes ptrsize bytes (for alignment).
1728          */
1729         if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1730                 return (-1);
1731
1732         /*
1733          * Add argc space (ptrsize) to usp and record argv for /proc.
1734          */
1735         up->u_argv = (uintptr_t)(usp += ptrsize);
1736
1737         /*
1738          * Put the argv[] pointers on the stack.
1739          */
1740         for (i = 0; i < argc; i++, usp += ptrsize)
1741                 if (stk_putptr(args, usp, &ustrp[*--offp]))
1742                         return (-1);
1743
1744         /*
1745          * Copy arguments to u_psargs.
1746          */
1747         pslen = MIN(args->arglen, PSARGSZ) - 1;
1748         for (i = 0; i < pslen; i++)
1749                 up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1750         while (i < PSARGSZ)
1751                 up->u_psargs[i++] = '\0';
1752
1753         /*
1754          * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1755          * record envp for /proc.
1756          */
1757         up->u_envp = (uintptr_t)(usp += ptrsize);
1758
1759         /*
1760          * Put the envp[] pointers on the stack.
1761          */
1762         for (i = 0; i < envc; i++, usp += ptrsize)
1763                 if (stk_putptr(args, usp, &ustrp[*--offp]))
1764                         return (-1);
1765
1766         /*
1767          * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1768          * remember where the stack ends, which is also where auxv begins.
1769          */
1770         args->stackend = usp += ptrsize;
1771
1772         /*
1773          * Put all the argv[], envp[], and auxv strings on the stack.
1774          */
1775         if (copyout(args->stk_base, ustrp, args->nc))
1776                 return (-1);
1777
1778         /*
1779          * Fill in the aux vector now that we know the user stack addresses
1780          * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and
1781          * AT_SUN_EMULATOR strings.
1782          */
1783         if (auxvpp != NULL && *auxvpp != NULL) {
1784                 if (args->to_model == DATAMODEL_NATIVE) {
1785                         auxv_t **a = (auxv_t **)auxvpp;
1786                         ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1787                         ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1788                         if (args->brandname != NULL)
1789                                 ADDAUX(*a,
1790                                     AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
1791                         if (args->emulator != NULL)
1792                                 ADDAUX(*a,
1793                                     AT_SUN_EMULATOR, (long)&ustrp[*--offp])
1794                 } else {
1795                         auxv32_t **a = (auxv32_t **)auxvpp;
1796                         ADDAUX(*a,
1797                             AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1798                         ADDAUX(*a,
1799                             AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
1800                         if (args->brandname != NULL)
1801                                 ADDAUX(*a, AT_SUN_BRANDNAME,
1802                                     (int)(uintptr_t)&ustrp[*--offp])
1803                         if (args->emulator != NULL)
1804                                 ADDAUX(*a, AT_SUN_EMULATOR,
1805                                     (int)(uintptr_t)&ustrp[*--offp])
1806                 }
1807         }
1808
1809         return (0);
1810 }
1811
1812 /*
1813  * Though the actual stack base is constant, slew the %sp by a random aligned
1814  * amount in [0,aslr_max_stack_skew).  Mostly, this makes life slightly more
1815  * complicated for buffer overflows hoping to overwrite the return address.
1816  *
1817  * On some platforms this helps avoid cache thrashing when identical processes
1818  * simultaneously share caches that don't provide enough associativity
1819  * (e.g. sun4v systems). In this case stack slewing makes the same hot stack
1820  * variables in different processes live in different cache sets increasing
1821  * effective associativity.
1822  */
1823 size_t
1824 exec_get_spslew(void)
1825 {
1826 #ifdef sun4v
1827         static uint_t sp_color_stride = 16;
1828         static uint_t sp_color_mask = 0x1f;
1829         static uint_t sp_current_color = (uint_t)-1;
1830 #endif
1831         size_t off;
1832
1833         ASSERT(ISP2(aslr_max_stack_skew));
1834
1835         if ((aslr_max_stack_skew == 0) ||
1836             !secflag_enabled(curproc, PROC_SEC_ASLR)) {
1837 #ifdef sun4v
1838                 uint_t spcolor = atomic_inc_32_nv(&sp_current_color);
1839                 return ((size_t)((spcolor & sp_color_mask) *
1840                     SA(sp_color_stride)));
1841 #else
1842                 return (0);
1843 #endif
1844         }
1845
1846         (void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1847         return (SA(P2PHASE(off, aslr_max_stack_skew)));
1848 }
1849
1850 /*
1851  * Initialize a new user stack with the specified arguments and environment.
1852  * The initial user stack layout is as follows:
1853  *
1854  *      User Stack
1855  *      +---------------+
1856  *      |               |
1857  *      | stack guard   |
1858  *      | (64-bit only) |
1859  *      |               |
1860  *      +...............+ <--- stack limit (base - curproc->p_stk_ctl)
1861  *      .               .
1862  *      .               .
1863  *      .               .
1864  *      +---------------+ <--- curproc->p_usrstack
1865  *      |               |
1866  *      | slew          |
1867  *      |               |
1868  *      +---------------+
1869  *      | NULL          |
1870  *      +---------------+
1871  *      |               |
1872  *      | auxv strings  |
1873  *      |               |
1874  *      +---------------+
1875  *      |               |
1876  *      | envp strings  |
1877  *      |               |
1878  *      +---------------+
1879  *      |               |
1880  *      | argv strings  |
1881  *      |               |
1882  *      +---------------+ <--- ustrp
1883  *      |               |
1884  *      | aux vector    |
1885  *      |               |
1886  *      +---------------+ <--- auxv
1887  *      | NULL          |
1888  *      +---------------+
1889  *      | envp[envc-1]  |
1890  *      +---------------+
1891  *      | ...           |
1892  *      +---------------+
1893  *      | envp[0]       |
1894  *      +---------------+ <--- envp[]
1895  *      | NULL          |
1896  *      +---------------+
1897  *      | argv[argc-1]  |
1898  *      +---------------+
1899  *      | ...           |
1900  *      +---------------+
1901  *      | argv[0]       |
1902  *      +---------------+ <--- argv[]
1903  *      | argc          |
1904  *      +---------------+ <--- stack base
1905  *
1906  * In 64-bit processes, a stack guard segment is allocated at the address
1907  * immediately below where the stack limit ends.  This protects new library
1908  * mappings (such as the linker) from being placed in relatively dangerous
1909  * proximity to the stack.
1910  */
1911 int
1912 exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1913 {
1914         size_t size;
1915         int error;
1916         proc_t *p = ttoproc(curthread);
1917         user_t *up = PTOU(p);
1918         char *usrstack;
1919         rctl_entity_p_t e;
1920         struct as *as;
1921         extern int use_stk_lpg;
1922         size_t sp_slew;
1923 #if defined(_LP64)
1924         const size_t sg_sz = (stack_guard_seg_sz & PAGEMASK);
1925 #endif /* defined(_LP64) */
1926
1927         args->from_model = p->p_model;
1928         if (p->p_model == DATAMODEL_NATIVE) {
1929                 args->from_ptrsize = sizeof (long);
1930         } else {
1931                 args->from_ptrsize = sizeof (int32_t);
1932         }
1933
1934         if (args->to_model == DATAMODEL_NATIVE) {
1935                 args->to_ptrsize = sizeof (long);
1936                 args->ncargs = NCARGS;
1937                 args->stk_align = STACK_ALIGN;
1938                 if (args->addr32)
1939                         usrstack = (char *)USRSTACK64_32;
1940                 else
1941                         usrstack = (char *)USRSTACK;
1942         } else {
1943                 args->to_ptrsize = sizeof (int32_t);
1944                 args->ncargs = NCARGS32;
1945                 args->stk_align = STACK_ALIGN32;
1946                 usrstack = (char *)USRSTACK32;
1947         }
1948
1949         ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1950
1951
1952         for (size = PAGESIZE; ; size *= 2) {
1953                 args->stk_size = size;
1954                 args->stk_base = kmem_alloc(size, KM_SLEEP);
1955                 args->stk_strp = args->stk_base;
1956                 args->stk_offp = (int *)(args->stk_base + size);
1957                 error = stk_copyin(uap, args, intp, auxvpp);
1958                 if (error == 0)
1959                         break;
1960                 kmem_free(args->stk_base, size);
1961                 if (error != E2BIG && error != ENAMETOOLONG)
1962                         return (error);
1963                 if (size >= args->ncargs)
1964                         return (E2BIG);
1965         }
1966
1967         size = args->usrstack_size;
1968
1969         ASSERT(error == 0);
1970         ASSERT(P2PHASE(size, args->stk_align) == 0);
1971         ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1972
1973         if (size > args->ncargs) {
1974                 kmem_free(args->stk_base, args->stk_size);
1975                 return (E2BIG);
1976         }
1977
1978         /*
1979          * Leave only the current lwp and force the other lwps to exit.
1980          * If another lwp beat us to the punch by calling exit(), bail out.
1981          */
1982         if ((error = exitlwps(0)) != 0) {
1983                 kmem_free(args->stk_base, args->stk_size);
1984                 return (error);
1985         }
1986
1987         /*
1988          * Revoke any doors created by the process.
1989          */
1990         if (p->p_door_list)
1991                 door_exit();
1992
1993         /*
1994          * Release schedctl data structures.
1995          */
1996         if (p->p_pagep)
1997                 schedctl_proc_cleanup();
1998
1999         /*
2000          * Clean up any DTrace helpers for the process.
2001          */
2002         if (p->p_dtrace_helpers != NULL) {
2003                 ASSERT(dtrace_helpers_cleanup != NULL);
2004                 (*dtrace_helpers_cleanup)(p);
2005         }
2006
2007         mutex_enter(&p->p_lock);
2008         /*
2009          * Cleanup the DTrace provider associated with this process.
2010          */
2011         if (p->p_dtrace_probes) {
2012                 ASSERT(dtrace_fasttrap_exec_ptr != NULL);
2013                 dtrace_fasttrap_exec_ptr(p);
2014         }
2015         mutex_exit(&p->p_lock);
2016
2017         /*
2018          * discard the lwpchan cache.
2019          */
2020         if (p->p_lcp != NULL)
2021                 lwpchan_destroy_cache(1);
2022
2023         /*
2024          * Delete the POSIX timers.
2025          */
2026         if (p->p_itimer != NULL)
2027                 timer_exit();
2028
2029         /*
2030          * Delete the ITIMER_REALPROF interval timer.
2031          * The other ITIMER_* interval timers are specified
2032          * to be inherited across exec().
2033          */
2034         delete_itimer_realprof();
2035
2036         if (AU_AUDITING())
2037                 audit_exec(args->stk_base, args->stk_base + args->arglen,
2038                     args->na - args->ne, args->ne, args->pfcred);
2039
2040         /*
2041          * Ensure that we don't change resource associations while we
2042          * change address spaces.
2043          */
2044         mutex_enter(&p->p_lock);
2045         pool_barrier_enter();
2046         mutex_exit(&p->p_lock);
2047
2048         /*
2049          * Destroy the old address space and create a new one.
2050          * From here on, any errors are fatal to the exec()ing process.
2051          * On error we return -1, which means the caller must SIGKILL
2052          * the process.
2053          */
2054         relvm();
2055
2056         mutex_enter(&p->p_lock);
2057         pool_barrier_exit();
2058         mutex_exit(&p->p_lock);
2059
2060         up->u_execsw = args->execswp;
2061
2062         p->p_brkbase = NULL;
2063         p->p_brksize = 0;
2064         p->p_brkpageszc = 0;
2065         p->p_stksize = 0;
2066         p->p_stkpageszc = 0;
2067         p->p_stkg_start = 0;
2068         p->p_stkg_end = 0;
2069         p->p_model = args->to_model;
2070         p->p_usrstack = usrstack;
2071         p->p_stkprot = args->stk_prot;
2072         p->p_datprot = args->dat_prot;
2073
2074         /*
2075          * Reset resource controls such that all controls are again active as
2076          * well as appropriate to the potentially new address model for the
2077          * process.
2078          */
2079         e.rcep_p.proc = p;
2080         e.rcep_t = RCENTITY_PROCESS;
2081         rctl_set_reset(p->p_rctls, p, &e);
2082
2083         /* Too early to call map_pgsz for the heap */
2084         if (use_stk_lpg) {
2085                 p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
2086         }
2087
2088         mutex_enter(&p->p_lock);
2089         p->p_flag |= SAUTOLPG;  /* kernel controls page sizes */
2090         mutex_exit(&p->p_lock);
2091
2092         sp_slew = exec_get_spslew();
2093         ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
2094         /* Be certain we don't underflow */
2095         VERIFY((curproc->p_usrstack - (size + sp_slew)) < curproc->p_usrstack);
2096         exec_set_sp(size + sp_slew);
2097
2098         as = as_alloc();
2099         p->p_as = as;
2100         as->a_proc = p;
2101         if (p->p_model == DATAMODEL_ILP32 || args->addr32)
2102                 as->a_userlimit = (caddr_t)USERLIMIT32;
2103         (void) hat_setup(as->a_hat, HAT_ALLOC);
2104         hat_join_srd(as->a_hat, args->ex_vp);
2105
2106         /* Write out the contents of the new stack. */
2107         error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
2108         kmem_free(args->stk_base, args->stk_size);
2109
2110 #if defined(_LP64)
2111         /* Add stack guard segment (if needed) after successful copyout */
2112         if (error == 0 && p->p_model == DATAMODEL_LP64 && sg_sz != 0) {
2113                 seghole_crargs_t sca;
2114                 caddr_t addr_end = (caddr_t)(((uintptr_t)usrstack -
2115                     p->p_stk_ctl) & PAGEMASK);
2116                 caddr_t addr_start = addr_end - sg_sz;
2117
2118                 DTRACE_PROBE4(stack__guard__chk, proc_t *, p,
2119                     caddr_t, addr_start, caddr_t, addr_end, size_t, sg_sz);
2120
2121                 if (addr_end >= usrstack || addr_start >= addr_end ||
2122                     valid_usr_range(addr_start, sg_sz, PROT_NONE, as,
2123                     as->a_userlimit) != RANGE_OKAY) {
2124                         return (E2BIG);
2125                 }
2126
2127                 /* Create un-mappable area in AS with seg_hole */
2128                 sca.name = "stack_guard";
2129                 error = as_map(as, addr_start, sg_sz, seghole_create, &sca);
2130                 if (error == 0) {
2131                         p->p_stkg_start = (uintptr_t)addr_start;
2132                         p->p_stkg_end = (uintptr_t)addr_start + sg_sz;
2133                 }
2134         }
2135 #endif /* defined(_LP64) */
2136
2137         return (error);
2138 }