14236 signed math leads getelfshdr astray
[illumos-gate.git] / usr / src / uts / common / exec / elf / elf.c
blob73d302aaa52e69f714eb58b97c7a0331ec41f1c1
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
29 * Copyright 2019, Joyent, Inc.
30 * Copyright 2022 Oxide Computer Company
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/thread.h>
36 #include <sys/sysmacros.h>
37 #include <sys/signal.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/vnode.h>
42 #include <sys/mman.h>
43 #include <sys/kmem.h>
44 #include <sys/proc.h>
45 #include <sys/pathname.h>
46 #include <sys/policy.h>
47 #include <sys/cmn_err.h>
48 #include <sys/systm.h>
49 #include <sys/elf.h>
50 #include <sys/vmsystm.h>
51 #include <sys/debug.h>
52 #include <sys/auxv.h>
53 #include <sys/exec.h>
54 #include <sys/prsystm.h>
55 #include <vm/as.h>
56 #include <vm/rm.h>
57 #include <vm/seg.h>
58 #include <vm/seg_vn.h>
59 #include <sys/modctl.h>
60 #include <sys/systeminfo.h>
61 #include <sys/vmparam.h>
62 #include <sys/machelf.h>
63 #include <sys/shm_impl.h>
64 #include <sys/archsystm.h>
65 #include <sys/fasttrap.h>
66 #include <sys/brand.h>
67 #include "elf_impl.h"
68 #include <sys/sdt.h>
69 #include <sys/siginfo.h>
70 #include <sys/random.h>
72 #include <core_shstrtab.h>
74 #if defined(__x86)
75 #include <sys/comm_page_util.h>
76 #include <sys/fp.h>
77 #endif /* defined(__x86) */
extern int at_flags;
extern volatile size_t aslr_max_brk_skew;

/* "$ORIGIN" token expanded in PT_INTERP paths (length excludes the '$'). */
#define	ORIGIN_STR	"ORIGIN"
#define	ORIGIN_STR_SIZE	6

static int getelfhead(vnode_t *, cred_t *, Ehdr *, uint_t *, uint_t *,
    uint_t *);
static int getelfphdr(vnode_t *, cred_t *, const Ehdr *, uint_t, caddr_t *,
    size_t *);
static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, uint_t, uint_t,
    caddr_t *, size_t *, caddr_t *, size_t *);
static size_t elfsize(const Ehdr *, uint_t, const caddr_t, uintptr_t *);
static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **,
    Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *,
    size_t, size_t *, size_t *);

#ifdef _ELF32_COMPAT
/* Link against the non-compat instances when compiling the 32-bit version. */
extern size_t elf_datasz_max;
extern size_t elf_zeropg_sz;
extern void elf_ctx_resize_scratch(elf_core_ctx_t *, size_t);
extern uint_t elf_nphdr_max;
extern uint_t elf_nshdr_max;
extern size_t elf_shstrtab_max;
#else
/*
 * Tunable limits: maximum per-exec scratch buffer size, zero-page size,
 * and caps on the number of program/section headers and shstrtab size
 * we are willing to process from an ELF file.
 */
size_t elf_datasz_max = 1 * 1024 * 1024;
size_t elf_zeropg_sz = 4 * 1024;
uint_t elf_nphdr_max = 1000;
uint_t elf_nshdr_max = 10000;
size_t elf_shstrtab_max = 100 * 1024;
#endif
/*
 * Validate a PT_SUNWDTRACE program header and record the user address of
 * the per-LWP DTrace scratch area (p_vaddr relocated by 'base') into
 * args->thrptr.  Returns 0 on success, -1 if the header is unusable
 * (too small, or not simultaneously readable, writable and executable).
 */
static int
dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base)
{
	ASSERT(phdrp->p_type == PT_SUNWDTRACE);

	/*
	 * See the comment in fasttrap.h for information on how to safely
	 * update this program header.
	 */
	if (phdrp->p_memsz < PT_SUNWDTRACE_SIZE ||
	    (phdrp->p_flags & (PF_R | PF_W | PF_X)) != (PF_R | PF_W | PF_X))
		return (-1);

	args->thrptr = phdrp->p_vaddr + base;

	return (0);
}
/*
 * Apply a security-flag request carried in the executable's .dynamic
 * section (currently only DT_SUNW_ASLR) to process 'p'.
 *
 * val == 0 clears the flag from the effective set; a nonzero val sets it.
 * Either direction is refused (EPERM) when it would violate the process's
 * lower/upper secflag bounds, or when the caller lacks privilege
 * (secpolicy_psecflags) and the request disagrees with the inherit set.
 * Returns EINVAL for an unrecognized dynamic tag.
 */
static int
handle_secflag_dt(proc_t *p, uint_t dt, uint_t val)
{
	uint_t flag;

	switch (dt) {
	case DT_SUNW_ASLR:
		flag = PROC_SEC_ASLR;
		break;
	default:
		return (EINVAL);
	}

	if (val == 0) {
		/* Clearing: forbidden if the flag is pinned on by the lower set. */
		if (secflag_isset(p->p_secflags.psf_lower, flag))
			return (EPERM);
		if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
		    secflag_isset(p->p_secflags.psf_inherit, flag))
			return (EPERM);

		secflag_clear(&p->p_secflags.psf_effective, flag);
	} else {
		/* Setting: forbidden if the upper set does not allow it. */
		if (!secflag_isset(p->p_secflags.psf_upper, flag))
			return (EPERM);

		if ((secpolicy_psecflags(CRED(), p, p) != 0) &&
		    !secflag_isset(p->p_secflags.psf_inherit, flag))
			return (EPERM);

		secflag_set(&p->p_secflags.psf_effective, flag);
	}

	return (0);
}
167 #ifndef _ELF32_COMPAT
168 void
169 elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz)
171 size_t target = MIN(sz, elf_datasz_max);
173 if (target > ctx->ecc_bufsz) {
174 if (ctx->ecc_buf != NULL) {
175 kmem_free(ctx->ecc_buf, ctx->ecc_bufsz);
177 ctx->ecc_buf = kmem_alloc(target, KM_SLEEP);
178 ctx->ecc_bufsz = target;
181 #endif /* _ELF32_COMPAT */
/*
 * Map in the executable pointed to by vp. Returns 0 on success.
 *
 * Used by brand modules to load an ELF image on behalf of a branded
 * process.  On success:
 *   *interp	  - 1 if the binary carries a PT_INTERP header, else 0
 *   *uphdr_vaddr - p_vaddr of the PT_PHDR header, or (Addr)-1 if none
 *   *voffset	  - load bias; for a static ET_EXEC this is rewritten to
 *		    the lowest mapped address (minaddr)
 *   *lddatap	  - address of the first writable PT_LOAD segment (if the
 *		    caller passed a non-NULL pointer)
 * bssbase/brkbase/brksize are filled in by mapelfexec().
 */
int
mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
    intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
    caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
{
	size_t len, phdrsize;
	struct vattr vat;
	caddr_t phdrbase = NULL;
	uint_t nshdrs, shstrndx, nphdrs;
	int error = 0;
	Phdr *uphdr = NULL;
	Phdr *junk = NULL;
	Phdr *dynphdr = NULL;
	Phdr *dtrphdr = NULL;
	uintptr_t lddata, minaddr;
	size_t execsz;

	if (lddatap != NULL)
		*lddatap = 0;

	if (error = execpermissions(vp, &vat, args)) {
		uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
		return (error);
	}

	if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx,
	    &nphdrs)) != 0 ||
	    (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase,
	    &phdrsize)) != 0) {
		uprintf("%s: Cannot read %s\n", exec_file, args->pathname);
		return (error);
	}

	if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) {
		uprintf("%s: Nothing to load in %s", exec_file, args->pathname);
		kmem_free(phdrbase, phdrsize);
		return (ENOEXEC);
	}
	if (lddatap != NULL)
		*lddatap = lddata;

	if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
	    &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
	    len, &execsz, brksize)) {
		uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
		/*
		 * mapelfexec() allocates uphdr itself iff its p_flags are
		 * clear; in that case we own it and must free it here.
		 */
		if (uphdr != NULL && uphdr->p_flags == 0)
			kmem_free(uphdr, sizeof (Phdr));
		kmem_free(phdrbase, phdrsize);
		return (error);
	}

	/*
	 * Inform our caller if the executable needs an interpreter.
	 */
	*interp = (dynphdr == NULL) ? 0 : 1;

	/*
	 * If this is a statically linked executable, voffset should indicate
	 * the address of the executable itself (it normally holds the address
	 * of the interpreter).
	 */
	if (ehdr->e_type == ET_EXEC && *interp == 0)
		*voffset = minaddr;

	if (uphdr != NULL) {
		*uphdr_vaddr = uphdr->p_vaddr;

		/* Same ownership rule as above: p_flags == 0 means we free. */
		if (uphdr->p_flags == 0)
			kmem_free(uphdr, sizeof (Phdr));
	} else {
		*uphdr_vaddr = (Addr)-1;
	}

	kmem_free(phdrbase, phdrsize);
	return (error);
}
264 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
265 int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred,
266 int brand_action)
268 caddr_t phdrbase = NULL;
269 caddr_t bssbase = 0;
270 caddr_t brkbase = 0;
271 size_t brksize = 0;
272 size_t dlnsize;
273 aux_entry_t *aux;
274 int error;
275 ssize_t resid;
276 int fd = -1;
277 intptr_t voffset;
278 Phdr *intphdr = NULL;
279 Phdr *dynamicphdr = NULL;
280 Phdr *stphdr = NULL;
281 Phdr *uphdr = NULL;
282 Phdr *junk = NULL;
283 size_t len;
284 size_t postfixsize = 0;
285 size_t i;
286 Phdr *phdrp;
287 Phdr *dataphdrp = NULL;
288 Phdr *dtrphdr;
289 Phdr *capphdr = NULL;
290 Cap *cap = NULL;
291 size_t capsize;
292 int hasu = 0;
293 int hasauxv = 0;
294 int hasintp = 0;
295 int branded = 0;
296 boolean_t dynuphdr = B_FALSE;
298 struct proc *p = ttoproc(curthread);
299 struct user *up = PTOU(p);
300 struct bigwad {
301 Ehdr ehdr;
302 aux_entry_t elfargs[__KERN_NAUXV_IMPL];
303 char dl_name[MAXPATHLEN];
304 char pathbuf[MAXPATHLEN];
305 struct vattr vattr;
306 struct execenv exenv;
307 } *bigwad; /* kmem_alloc this behemoth so we don't blow stack */
308 Ehdr *ehdrp;
309 uint_t nshdrs, shstrndx, nphdrs;
310 size_t phdrsize;
311 char *dlnp;
312 char *pathbufp;
313 rlim64_t limit;
314 rlim64_t roundlimit;
316 ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64);
318 bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP);
319 ehdrp = &bigwad->ehdr;
320 dlnp = bigwad->dl_name;
321 pathbufp = bigwad->pathbuf;
324 * Obtain ELF and program header information.
326 if ((error = getelfhead(vp, CRED(), ehdrp, &nshdrs, &shstrndx,
327 &nphdrs)) != 0 ||
328 (error = getelfphdr(vp, CRED(), ehdrp, nphdrs, &phdrbase,
329 &phdrsize)) != 0)
330 goto out;
333 * Prevent executing an ELF file that has no entry point.
335 if (ehdrp->e_entry == 0) {
336 uprintf("%s: Bad entry point\n", exec_file);
337 goto bad;
341 * Put data model that we're exec-ing to into the args passed to
342 * exec_args(), so it will know what it is copying to on new stack.
343 * Now that we know whether we are exec-ing a 32-bit or 64-bit
344 * executable, we can set execsz with the appropriate NCARGS.
346 #ifdef _LP64
347 if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
348 args->to_model = DATAMODEL_ILP32;
349 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
350 } else {
351 args->to_model = DATAMODEL_LP64;
352 args->stk_prot &= ~PROT_EXEC;
353 #if defined(__x86)
354 args->dat_prot &= ~PROT_EXEC;
355 #endif
356 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
358 #else /* _LP64 */
359 args->to_model = DATAMODEL_ILP32;
360 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
361 #endif /* _LP64 */
364 * We delay invoking the brand callback until we've figured out
365 * what kind of elf binary we're trying to run, 32-bit or 64-bit.
366 * We do this because now the brand library can just check
367 * args->to_model to see if the target is 32-bit or 64-bit without
368 * having do duplicate all the code above.
370 * The level checks associated with brand handling below are used to
371 * prevent a loop since the brand elfexec function typically comes back
372 * through this function. We must check <= here since the nested
373 * handling in the #! interpreter code will increment the level before
374 * calling gexec to run the final elfexec interpreter.
376 if ((level <= INTP_MAXDEPTH) &&
377 (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
378 error = BROP(p)->b_elfexec(vp, uap, args,
379 idatap, level + 1, execsz, setid, exec_file, cred,
380 brand_action);
381 goto out;
385 * Determine aux size now so that stack can be built
386 * in one shot (except actual copyout of aux image),
387 * determine any non-default stack protections,
388 * and still have this code be machine independent.
390 const uint_t hsize = ehdrp->e_phentsize;
391 phdrp = (Phdr *)phdrbase;
392 for (i = nphdrs; i > 0; i--) {
393 switch (phdrp->p_type) {
394 case PT_INTERP:
395 hasauxv = hasintp = 1;
396 break;
397 case PT_PHDR:
398 hasu = 1;
399 break;
400 case PT_SUNWSTACK:
401 args->stk_prot = PROT_USER;
402 if (phdrp->p_flags & PF_R)
403 args->stk_prot |= PROT_READ;
404 if (phdrp->p_flags & PF_W)
405 args->stk_prot |= PROT_WRITE;
406 if (phdrp->p_flags & PF_X)
407 args->stk_prot |= PROT_EXEC;
408 break;
409 case PT_LOAD:
410 dataphdrp = phdrp;
411 break;
412 case PT_SUNWCAP:
413 capphdr = phdrp;
414 break;
415 case PT_DYNAMIC:
416 dynamicphdr = phdrp;
417 break;
419 phdrp = (Phdr *)((caddr_t)phdrp + hsize);
422 if (ehdrp->e_type != ET_EXEC) {
423 dataphdrp = NULL;
424 hasauxv = 1;
427 /* Copy BSS permissions to args->dat_prot */
428 if (dataphdrp != NULL) {
429 args->dat_prot = PROT_USER;
430 if (dataphdrp->p_flags & PF_R)
431 args->dat_prot |= PROT_READ;
432 if (dataphdrp->p_flags & PF_W)
433 args->dat_prot |= PROT_WRITE;
434 if (dataphdrp->p_flags & PF_X)
435 args->dat_prot |= PROT_EXEC;
439 * If a auxvector will be required - reserve the space for
440 * it now. This may be increased by exec_args if there are
441 * ISA-specific types (included in __KERN_NAUXV_IMPL).
443 if (hasauxv) {
445 * If a AUX vector is being built - the base AUX
446 * entries are:
448 * AT_BASE
449 * AT_FLAGS
450 * AT_PAGESZ
451 * AT_SUN_AUXFLAGS
452 * AT_SUN_HWCAP
453 * AT_SUN_HWCAP2
454 * AT_SUN_HWCAP3
455 * AT_SUN_PLATFORM (added in stk_copyout)
456 * AT_SUN_EXECNAME (added in stk_copyout)
457 * AT_NULL
459 * total == 10
461 if (hasintp && hasu) {
463 * Has PT_INTERP & PT_PHDR - the auxvectors that
464 * will be built are:
466 * AT_PHDR
467 * AT_PHENT
468 * AT_PHNUM
469 * AT_ENTRY
470 * AT_LDDATA
472 * total = 5
474 args->auxsize = (10 + 5) * sizeof (aux_entry_t);
475 } else if (hasintp) {
477 * Has PT_INTERP but no PT_PHDR
479 * AT_EXECFD
480 * AT_LDDATA
482 * total = 2
484 args->auxsize = (10 + 2) * sizeof (aux_entry_t);
485 } else {
486 args->auxsize = 10 * sizeof (aux_entry_t);
488 } else {
489 args->auxsize = 0;
493 * If this binary is using an emulator, we need to add an
494 * AT_SUN_EMULATOR aux entry.
496 if (args->emulator != NULL)
497 args->auxsize += sizeof (aux_entry_t);
500 * On supported kernels (x86_64) make room in the auxv for the
501 * AT_SUN_COMMPAGE entry. This will go unpopulated on i86xpv systems
502 * which do not provide such functionality.
504 * Additionally cover the floating point information AT_SUN_FPSIZE and
505 * AT_SUN_FPTYPE.
507 #if defined(__amd64)
508 args->auxsize += 3 * sizeof (aux_entry_t);
509 #endif /* defined(__amd64) */
511 if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
512 branded = 1;
514 * We will be adding 4 entries to the aux vectors. One for
515 * the the brandname and 3 for the brand specific aux vectors.
517 args->auxsize += 4 * sizeof (aux_entry_t);
520 /* If the binary has an explicit ASLR flag, it must be honoured */
521 if ((dynamicphdr != NULL) && (dynamicphdr->p_filesz > 0)) {
522 const size_t dynfilesz = dynamicphdr->p_filesz;
523 const size_t dynoffset = dynamicphdr->p_offset;
524 Dyn *dyn, *dp;
526 if (dynoffset > MAXOFFSET_T ||
527 dynfilesz > MAXOFFSET_T ||
528 dynoffset + dynfilesz > MAXOFFSET_T) {
529 uprintf("%s: cannot read full .dynamic section\n",
530 exec_file);
531 error = EINVAL;
532 goto out;
535 #define DYN_STRIDE 100
536 for (i = 0; i < dynfilesz; i += sizeof (*dyn) * DYN_STRIDE) {
537 const size_t remdyns = (dynfilesz - i) / sizeof (*dyn);
538 const size_t ndyns = MIN(DYN_STRIDE, remdyns);
539 const size_t dynsize = ndyns * sizeof (*dyn);
541 dyn = kmem_alloc(dynsize, KM_SLEEP);
543 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)dyn,
544 (ssize_t)dynsize, (offset_t)(dynoffset + i),
545 UIO_SYSSPACE, 0, (rlim64_t)0,
546 CRED(), NULL)) != 0) {
547 uprintf("%s: cannot read .dynamic section\n",
548 exec_file);
549 goto out;
552 for (dp = dyn; dp < (dyn + ndyns); dp++) {
553 if (dp->d_tag == DT_SUNW_ASLR) {
554 if ((error = handle_secflag_dt(p,
555 DT_SUNW_ASLR,
556 dp->d_un.d_val)) != 0) {
557 uprintf("%s: error setting "
558 "security-flag from "
559 "DT_SUNW_ASLR: %d\n",
560 exec_file, error);
561 goto out;
566 kmem_free(dyn, dynsize);
570 /* Hardware/Software capabilities */
571 if (capphdr != NULL &&
572 (capsize = capphdr->p_filesz) > 0 &&
573 capsize <= 16 * sizeof (*cap)) {
574 const uint_t ncaps = capsize / sizeof (*cap);
575 Cap *cp;
577 cap = kmem_alloc(capsize, KM_SLEEP);
578 if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)cap,
579 (ssize_t)capsize, (offset_t)capphdr->p_offset,
580 UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), NULL)) != 0) {
581 uprintf("%s: Cannot read capabilities section\n",
582 exec_file);
583 goto out;
585 for (cp = cap; cp < cap + ncaps; cp++) {
586 if (cp->c_tag == CA_SUNW_SF_1 &&
587 (cp->c_un.c_val & SF1_SUNW_ADDR32)) {
588 if (args->to_model == DATAMODEL_LP64)
589 args->addr32 = 1;
590 break;
595 aux = bigwad->elfargs;
597 * Move args to the user's stack.
598 * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries.
600 if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) {
601 if (error == -1) {
602 error = ENOEXEC;
603 goto bad;
605 goto out;
607 /* we're single threaded after this point */
610 * If this is an ET_DYN executable (shared object),
611 * determine its memory size so that mapelfexec() can load it.
613 if (ehdrp->e_type == ET_DYN)
614 len = elfsize(ehdrp, nphdrs, phdrbase, NULL);
615 else
616 len = 0;
618 dtrphdr = NULL;
620 error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &intphdr,
621 &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
622 len, execsz, &brksize);
625 * Our uphdr has been dynamically allocated if (and only if) its
626 * program header flags are clear. To avoid leaks, this must be
627 * checked regardless of whether mapelfexec() emitted an error.
629 dynuphdr = (uphdr != NULL && uphdr->p_flags == 0);
631 if (error != 0)
632 goto bad;
634 if (uphdr != NULL && intphdr == NULL)
635 goto bad;
637 if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
638 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
639 goto bad;
642 if (intphdr != NULL) {
643 size_t len;
644 uintptr_t lddata;
645 char *p;
646 struct vnode *nvp;
648 dlnsize = intphdr->p_filesz;
651 * Make sure none of the component pieces of dlnsize result in
652 * an oversized or zeroed result.
654 if (intphdr->p_filesz > MAXPATHLEN || dlnsize > MAXPATHLEN ||
655 dlnsize == 0 || dlnsize < intphdr->p_filesz) {
656 goto bad;
660 * Read in "interpreter" pathname.
662 if ((error = vn_rdwr(UIO_READ, vp, dlnp,
663 (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset,
664 UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) {
665 uprintf("%s: Cannot obtain interpreter pathname\n",
666 exec_file);
667 goto bad;
670 if (resid != 0 || dlnp[dlnsize - 1] != '\0')
671 goto bad;
674 * Search for '$ORIGIN' token in interpreter path.
675 * If found, expand it.
677 for (p = dlnp; p = strchr(p, '$'); ) {
678 uint_t len, curlen;
679 char *_ptr;
681 if (strncmp(++p, ORIGIN_STR, ORIGIN_STR_SIZE))
682 continue;
685 * We don't support $ORIGIN on setid programs to close
686 * a potential attack vector.
688 if ((setid & EXECSETID_SETID) != 0) {
689 error = ENOEXEC;
690 goto bad;
693 curlen = 0;
694 len = p - dlnp - 1;
695 if (len) {
696 bcopy(dlnp, pathbufp, len);
697 curlen += len;
699 if (_ptr = strrchr(args->pathname, '/')) {
700 len = _ptr - args->pathname;
701 if ((curlen + len) > MAXPATHLEN)
702 break;
704 bcopy(args->pathname, &pathbufp[curlen], len);
705 curlen += len;
706 } else {
708 * executable is a basename found in the
709 * current directory. So - just substitue
710 * '.' for ORIGIN.
712 pathbufp[curlen] = '.';
713 curlen++;
715 p += ORIGIN_STR_SIZE;
716 len = strlen(p);
718 if ((curlen + len) > MAXPATHLEN)
719 break;
720 bcopy(p, &pathbufp[curlen], len);
721 curlen += len;
722 pathbufp[curlen++] = '\0';
723 bcopy(pathbufp, dlnp, curlen);
727 * /usr/lib/ld.so.1 is known to be a symlink to /lib/ld.so.1
728 * (and /usr/lib/64/ld.so.1 is a symlink to /lib/64/ld.so.1).
729 * Just in case /usr is not mounted, change it now.
731 if (strcmp(dlnp, USR_LIB_RTLD) == 0)
732 dlnp += 4;
733 error = lookupname(dlnp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp);
734 if (error && dlnp != bigwad->dl_name) {
735 /* new kernel, old user-level */
736 error = lookupname(dlnp -= 4, UIO_SYSSPACE, FOLLOW,
737 NULLVPP, &nvp);
739 if (error) {
740 uprintf("%s: Cannot find %s\n", exec_file, dlnp);
741 goto bad;
745 * Setup the "aux" vector.
747 if (uphdr) {
748 if (ehdrp->e_type == ET_DYN) {
749 /* don't use the first page */
750 bigwad->exenv.ex_brkbase = (caddr_t)PAGESIZE;
751 bigwad->exenv.ex_bssbase = (caddr_t)PAGESIZE;
752 } else {
753 bigwad->exenv.ex_bssbase = bssbase;
754 bigwad->exenv.ex_brkbase = brkbase;
756 bigwad->exenv.ex_brksize = brksize;
757 bigwad->exenv.ex_magic = elfmagic;
758 bigwad->exenv.ex_vp = vp;
759 setexecenv(&bigwad->exenv);
761 ADDAUX(aux, AT_PHDR, uphdr->p_vaddr + voffset)
762 ADDAUX(aux, AT_PHENT, ehdrp->e_phentsize)
763 ADDAUX(aux, AT_PHNUM, nphdrs)
764 ADDAUX(aux, AT_ENTRY, ehdrp->e_entry + voffset)
765 } else {
766 if ((error = execopen(&vp, &fd)) != 0) {
767 VN_RELE(nvp);
768 goto bad;
771 ADDAUX(aux, AT_EXECFD, fd)
774 if ((error = execpermissions(nvp, &bigwad->vattr, args)) != 0) {
775 VN_RELE(nvp);
776 uprintf("%s: Cannot execute %s\n", exec_file, dlnp);
777 goto bad;
781 * Now obtain the ELF header along with the entire program
782 * header contained in "nvp".
784 kmem_free(phdrbase, phdrsize);
785 phdrbase = NULL;
786 if ((error = getelfhead(nvp, CRED(), ehdrp, &nshdrs,
787 &shstrndx, &nphdrs)) != 0 ||
788 (error = getelfphdr(nvp, CRED(), ehdrp, nphdrs, &phdrbase,
789 &phdrsize)) != 0) {
790 VN_RELE(nvp);
791 uprintf("%s: Cannot read %s\n", exec_file, dlnp);
792 goto bad;
796 * Determine memory size of the "interpreter's" loadable
797 * sections. This size is then used to obtain the virtual
798 * address of a hole, in the user's address space, large
799 * enough to map the "interpreter".
801 if ((len = elfsize(ehdrp, nphdrs, phdrbase, &lddata)) == 0) {
802 VN_RELE(nvp);
803 uprintf("%s: Nothing to load in %s\n", exec_file, dlnp);
804 goto bad;
807 dtrphdr = NULL;
809 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
810 &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
811 execsz, NULL);
813 if (error || junk != NULL) {
814 VN_RELE(nvp);
815 uprintf("%s: Cannot map %s\n", exec_file, dlnp);
816 goto bad;
820 * We use the DTrace program header to initialize the
821 * architecture-specific user per-LWP location. The dtrace
822 * fasttrap provider requires ready access to per-LWP scratch
823 * space. We assume that there is only one such program header
824 * in the interpreter.
826 if (dtrphdr != NULL &&
827 dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
828 VN_RELE(nvp);
829 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, dlnp);
830 goto bad;
833 VN_RELE(nvp);
834 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
837 if (hasauxv) {
838 int auxf = AF_SUN_HWCAPVERIFY;
839 #if defined(__amd64)
840 size_t fpsize;
841 int fptype;
842 #endif /* defined(__amd64) */
845 * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
846 * exec_args()
848 ADDAUX(aux, AT_BASE, voffset)
849 ADDAUX(aux, AT_FLAGS, at_flags)
850 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
852 * Linker flags. (security)
853 * p_flag not yet set at this time.
854 * We rely on gexec() to provide us with the information.
855 * If the application is set-uid but this is not reflected
856 * in a mismatch between real/effective uids/gids, then
857 * don't treat this as a set-uid exec. So we care about
858 * the EXECSETID_UGIDS flag but not the ...SETID flag.
860 if ((setid &= ~EXECSETID_SETID) != 0)
861 auxf |= AF_SUN_SETUGID;
864 * If we're running a native process from within a branded
865 * zone under pfexec then we clear the AF_SUN_SETUGID flag so
866 * that the native ld.so.1 is able to link with the native
867 * libraries instead of using the brand libraries that are
868 * installed in the zone. We only do this for processes
869 * which we trust because we see they are already running
870 * under pfexec (where uid != euid). This prevents a
871 * malicious user within the zone from crafting a wrapper to
872 * run native suid commands with unsecure libraries interposed.
874 if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
875 (setid &= ~EXECSETID_SETID) != 0))
876 auxf &= ~AF_SUN_SETUGID;
879 * Record the user addr of the auxflags aux vector entry
880 * since brands may optionally want to manipulate this field.
882 args->auxp_auxflags =
883 (char *)((char *)args->stackend +
884 ((char *)&aux->a_type -
885 (char *)bigwad->elfargs));
886 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
889 * Hardware capability flag word (performance hints)
890 * Used for choosing faster library routines.
891 * (Potentially different between 32-bit and 64-bit ABIs)
893 if (args->to_model == DATAMODEL_NATIVE) {
894 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap)
895 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap_2)
896 ADDAUX(aux, AT_SUN_HWCAP3, auxv_hwcap_3)
897 } else {
898 ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap32)
899 ADDAUX(aux, AT_SUN_HWCAP2, auxv_hwcap32_2)
900 ADDAUX(aux, AT_SUN_HWCAP3, auxv_hwcap32_3)
903 if (branded) {
905 * Reserve space for the brand-private aux vectors,
906 * and record the user addr of that space.
908 args->auxp_brand =
909 (char *)((char *)args->stackend +
910 ((char *)&aux->a_type -
911 (char *)bigwad->elfargs));
912 ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
913 ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
914 ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
918 * Add the comm page auxv entry, mapping it in if needed. Also
919 * take care of the FPU entries.
921 #if defined(__amd64)
922 if (args->commpage != (uintptr_t)NULL ||
923 (args->commpage = (uintptr_t)comm_page_mapin()) !=
924 (uintptr_t)NULL) {
925 ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
926 } else {
928 * If the comm page cannot be mapped, pad out the auxv
929 * to satisfy later size checks.
931 ADDAUX(aux, AT_NULL, 0)
934 fptype = AT_386_FPINFO_NONE;
935 fpu_auxv_info(&fptype, &fpsize);
936 if (fptype != AT_386_FPINFO_NONE) {
937 ADDAUX(aux, AT_SUN_FPTYPE, fptype)
938 ADDAUX(aux, AT_SUN_FPSIZE, fpsize)
939 } else {
940 ADDAUX(aux, AT_NULL, 0)
941 ADDAUX(aux, AT_NULL, 0)
943 #endif /* defined(__amd64) */
945 ADDAUX(aux, AT_NULL, 0)
946 postfixsize = (uintptr_t)aux - (uintptr_t)bigwad->elfargs;
949 * We make assumptions above when we determine how many aux
950 * vector entries we will be adding. However, if we have an
951 * invalid elf file, it is possible that mapelfexec might
952 * behave differently (but not return an error), in which case
953 * the number of aux entries we actually add will be different.
954 * We detect that now and error out.
956 if (postfixsize != args->auxsize) {
957 DTRACE_PROBE2(elfexec_badaux, size_t, postfixsize,
958 size_t, args->auxsize);
959 goto bad;
961 ASSERT(postfixsize <= __KERN_NAUXV_IMPL * sizeof (aux_entry_t));
965 * For the 64-bit kernel, the limit is big enough that rounding it up
966 * to a page can overflow the 64-bit limit, so we check for btopr()
967 * overflowing here by comparing it with the unrounded limit in pages.
968 * If it hasn't overflowed, compare the exec size with the rounded up
969 * limit in pages. Otherwise, just compare with the unrounded limit.
971 limit = btop(p->p_vmem_ctl);
972 roundlimit = btopr(p->p_vmem_ctl);
973 if ((roundlimit > limit && *execsz > roundlimit) ||
974 (roundlimit < limit && *execsz > limit)) {
975 mutex_enter(&p->p_lock);
976 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
977 RCA_SAFE);
978 mutex_exit(&p->p_lock);
979 error = ENOMEM;
980 goto bad;
983 bzero(up->u_auxv, sizeof (up->u_auxv));
984 up->u_commpagep = args->commpage;
985 if (postfixsize) {
986 size_t num_auxv;
989 * Copy the aux vector to the user stack.
991 error = execpoststack(args, bigwad->elfargs, postfixsize);
992 if (error)
993 goto bad;
996 * Copy auxv to the process's user structure for use by /proc.
997 * If this is a branded process, the brand's exec routine will
998 * copy it's private entries to the user structure later. It
999 * relies on the fact that the blank entries are at the end.
1001 num_auxv = postfixsize / sizeof (aux_entry_t);
1002 ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t));
1003 aux = bigwad->elfargs;
1004 for (i = 0; i < num_auxv; i++) {
1005 up->u_auxv[i].a_type = aux[i].a_type;
1006 up->u_auxv[i].a_un.a_val = (aux_val_t)aux[i].a_un.a_val;
1011 * Pass back the starting address so we can set the program counter.
1013 args->entry = (uintptr_t)(ehdrp->e_entry + voffset);
1015 if (!uphdr) {
1016 if (ehdrp->e_type == ET_DYN) {
1018 * If we are executing a shared library which doesn't
1019 * have a interpreter (probably ld.so.1) then
1020 * we don't set the brkbase now. Instead we
1021 * delay it's setting until the first call
1022 * via grow.c::brk(). This permits ld.so.1 to
1023 * initialize brkbase to the tail of the executable it
1024 * loads (which is where it needs to be).
1026 bigwad->exenv.ex_brkbase = (caddr_t)0;
1027 bigwad->exenv.ex_bssbase = (caddr_t)0;
1028 bigwad->exenv.ex_brksize = 0;
1029 } else {
1030 bigwad->exenv.ex_brkbase = brkbase;
1031 bigwad->exenv.ex_bssbase = bssbase;
1032 bigwad->exenv.ex_brksize = brksize;
1034 bigwad->exenv.ex_magic = elfmagic;
1035 bigwad->exenv.ex_vp = vp;
1036 setexecenv(&bigwad->exenv);
1039 ASSERT(error == 0);
1040 goto out;
1042 bad:
1043 if (fd != -1) /* did we open the a.out yet */
1044 (void) execclose(fd);
1046 psignal(p, SIGKILL);
1048 if (error == 0)
1049 error = ENOEXEC;
1050 out:
1051 if (dynuphdr)
1052 kmem_free(uphdr, sizeof (Phdr));
1053 if (phdrbase != NULL)
1054 kmem_free(phdrbase, phdrsize);
1055 if (cap != NULL)
1056 kmem_free(cap, capsize);
1057 kmem_free(bigwad, sizeof (struct bigwad));
1058 return (error);
/*
 * Compute the memory size requirement for the ELF file.
 *
 * Walks the program headers (stride e_phentsize, so oversized entries are
 * handled) and returns the page-rounded span from the lowest PT_LOAD
 * p_vaddr (truncated to its page base) to the highest PT_LOAD end address.
 * Returns 0 if no PT_LOAD segment contributes a non-empty range.  If
 * 'lddata' is non-NULL, the p_vaddr of the first writable PT_LOAD segment
 * is stored there (used for the AT_SUNW_LDDATA aux entry).
 */
static size_t
elfsize(const Ehdr *ehdrp, uint_t nphdrs, const caddr_t phdrbase,
    uintptr_t *lddata)
{
	const Phdr *phdrp = (Phdr *)phdrbase;
	const uint_t hsize = ehdrp->e_phentsize;
	boolean_t dfirst = B_TRUE;
	uintptr_t loaddr = UINTPTR_MAX;
	uintptr_t hiaddr = 0;
	uint_t i;

	for (i = nphdrs; i > 0; i--) {
		if (phdrp->p_type == PT_LOAD) {
			const uintptr_t lo = phdrp->p_vaddr;
			const uintptr_t hi = lo + phdrp->p_memsz;

			loaddr = MIN(lo, loaddr);
			hiaddr = MAX(hi, hiaddr);

			/*
			 * save the address of the first data segment
			 * of a object - used for the AT_SUNW_LDDATA
			 * aux entry.
			 */
			if ((lddata != NULL) && dfirst &&
			    (phdrp->p_flags & PF_W)) {
				*lddata = lo;
				dfirst = B_FALSE;
			}
		}
		phdrp = (Phdr *)((caddr_t)phdrp + hsize);
	}

	if (hiaddr <= loaddr) {
		/* No non-zero PT_LOAD segment found */
		return (0);
	}

	/* Round the low end down to a page, the overall span up to a page. */
	return (roundup(hiaddr - (loaddr & PAGEMASK), PAGESIZE));
}
/*
 * Read in the ELF header and program header table.
 * SUSV3 requires:
 *	ENOEXEC	File format is not recognized
 *	EINVAL	Format recognized but execution not supported
 *
 * On success, returns the (possibly extended) section header count,
 * section-name string table index, and program header count via the
 * nshdrs/shstrndx/nphdrs out-parameters.
 */
static int
getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs,
    uint_t *shstrndx, uint_t *nphdrs)
{
	int error;
	ssize_t resid;

	/*
	 * We got here by the first two bytes in ident,
	 * now read the entire ELF header.
	 */
	if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr,
	    sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0,
	    (rlim64_t)0, credp, &resid)) != 0)
		return (error);

	/*
	 * Since a separate version is compiled for handling 32-bit and
	 * 64-bit ELF executables on a 64-bit kernel, the 64-bit version
	 * doesn't need to be able to deal with 32-bit ELF files.
	 */
	if (resid != 0 ||
	    ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
	    ehdr->e_ident[EI_MAG3] != ELFMAG3)
		return (ENOEXEC);

	if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) ||
#if defined(_ILP32) || defined(_ELF32_COMPAT)
	    ehdr->e_ident[EI_CLASS] != ELFCLASS32 ||
#else
	    ehdr->e_ident[EI_CLASS] != ELFCLASS64 ||
#endif
	    !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine,
	    ehdr->e_flags))
		return (EINVAL);

	*nshdrs = ehdr->e_shnum;
	*shstrndx = ehdr->e_shstrndx;
	*nphdrs = ehdr->e_phnum;

	/*
	 * If e_shnum, e_shstrndx, or e_phnum is its sentinel value, we need
	 * to read in the section header at index zero to access the true
	 * values for those fields.
	 */
	if ((*nshdrs == 0 && ehdr->e_shoff != 0) ||
	    *shstrndx == SHN_XINDEX || *nphdrs == PN_XNUM) {
		Shdr shdr;

		/* A sentinel without a section header table is malformed. */
		if (ehdr->e_shoff == 0)
			return (EINVAL);

		if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr,
		    sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0,
		    (rlim64_t)0, credp, NULL)) != 0) {
			return (error);
		}

		if (*nshdrs == 0)
			*nshdrs = shdr.sh_size;
		if (*shstrndx == SHN_XINDEX)
			*shstrndx = shdr.sh_link;
		if (*nphdrs == PN_XNUM && shdr.sh_info != 0)
			*nphdrs = shdr.sh_info;
	}

	return (0);
}
1181 * We use members through p_flags on 32-bit files and p_memsz on 64-bit files,
1182 * so e_phentsize must be at least large enough to include those members.
1184 #if !defined(_LP64) || defined(_ELF32_COMPAT)
1185 #define MINPHENTSZ (offsetof(Phdr, p_flags) + \
1186 sizeof (((Phdr *)NULL)->p_flags))
1187 #else
1188 #define MINPHENTSZ (offsetof(Phdr, p_memsz) + \
1189 sizeof (((Phdr *)NULL)->p_memsz))
1190 #endif
1192 static int
1193 getelfphdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nphdrs,
1194 caddr_t *phbasep, size_t *phsizep)
1196 int err;
1199 * Ensure that e_phentsize is large enough for required fields to be
1200 * accessible and will maintain 8-byte alignment.
1202 if (ehdr->e_phentsize < MINPHENTSZ || (ehdr->e_phentsize & 3))
1203 return (EINVAL);
1205 *phsizep = nphdrs * ehdr->e_phentsize;
1207 if (*phsizep > sizeof (Phdr) * elf_nphdr_max) {
1208 if ((*phbasep = kmem_alloc(*phsizep, KM_NOSLEEP)) == NULL)
1209 return (ENOMEM);
1210 } else {
1211 *phbasep = kmem_alloc(*phsizep, KM_SLEEP);
1214 if ((err = vn_rdwr(UIO_READ, vp, *phbasep, (ssize_t)*phsizep,
1215 (offset_t)ehdr->e_phoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1216 credp, NULL)) != 0) {
1217 kmem_free(*phbasep, *phsizep);
1218 *phbasep = NULL;
1219 return (err);
1222 return (0);
1225 #define MINSHDRSZ (offsetof(Shdr, sh_entsize) + \
1226 sizeof (((Shdr *)NULL)->sh_entsize))
1228 static int
1229 getelfshdr(vnode_t *vp, cred_t *credp, const Ehdr *ehdr, uint_t nshdrs,
1230 uint_t shstrndx, caddr_t *shbasep, size_t *shsizep, char **shstrbasep,
1231 size_t *shstrsizep)
1233 int err;
1234 Shdr *shdr;
1237 * Since we're going to be using e_shentsize to iterate down the
1238 * array of section headers, it must be 8-byte aligned or else
1239 * a we might cause a misaligned access. We use all members through
1240 * sh_entsize (on both 32- and 64-bit ELF files) so e_shentsize
1241 * must be at least large enough to include that member. The index
1242 * of the string table section must also be valid.
1244 if (ehdr->e_shentsize < MINSHDRSZ || (ehdr->e_shentsize & 3) ||
1245 nshdrs == 0 || shstrndx >= nshdrs) {
1246 return (EINVAL);
1249 *shsizep = nshdrs * ehdr->e_shentsize;
1251 if (*shsizep > sizeof (Shdr) * elf_nshdr_max) {
1252 if ((*shbasep = kmem_alloc(*shsizep, KM_NOSLEEP)) == NULL)
1253 return (ENOMEM);
1254 } else {
1255 *shbasep = kmem_alloc(*shsizep, KM_SLEEP);
1258 if ((err = vn_rdwr(UIO_READ, vp, *shbasep, (ssize_t)*shsizep,
1259 (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, (rlim64_t)0,
1260 credp, NULL)) != 0) {
1261 kmem_free(*shbasep, *shsizep);
1262 return (err);
1266 * Grab the section string table. Walking through the shdrs is
1267 * pointless if their names cannot be interrogated.
1269 shdr = (Shdr *)(*shbasep + shstrndx * ehdr->e_shentsize);
1270 if ((*shstrsizep = shdr->sh_size) == 0) {
1271 kmem_free(*shbasep, *shsizep);
1272 return (EINVAL);
1275 if (*shstrsizep > elf_shstrtab_max) {
1276 if ((*shstrbasep = kmem_alloc(*shstrsizep,
1277 KM_NOSLEEP)) == NULL) {
1278 kmem_free(*shbasep, *shsizep);
1279 return (ENOMEM);
1281 } else {
1282 *shstrbasep = kmem_alloc(*shstrsizep, KM_SLEEP);
1285 if ((err = vn_rdwr(UIO_READ, vp, *shstrbasep, (ssize_t)*shstrsizep,
1286 (offset_t)shdr->sh_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
1287 credp, NULL)) != 0) {
1288 kmem_free(*shbasep, *shsizep);
1289 kmem_free(*shstrbasep, *shstrsizep);
1290 return (err);
1294 * Make sure the strtab is null-terminated to make sure we
1295 * don't run off the end of the table.
1297 (*shstrbasep)[*shstrsizep - 1] = '\0';
1299 return (0);
1303 elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, uint_t *nphdrs,
1304 caddr_t *phbasep, size_t *phsizep)
1306 int error;
1307 uint_t nshdrs, shstrndx;
1309 if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
1310 nphdrs)) != 0 ||
1311 (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
1312 phsizep)) != 0) {
1313 return (error);
1315 return (0);
1318 static int
1319 mapelfexec(
1320 vnode_t *vp,
1321 Ehdr *ehdr,
1322 uint_t nphdrs,
1323 caddr_t phdrbase,
1324 Phdr **uphdr,
1325 Phdr **intphdr,
1326 Phdr **stphdr,
1327 Phdr **dtphdr,
1328 Phdr *dataphdrp,
1329 caddr_t *bssbase,
1330 caddr_t *brkbase,
1331 intptr_t *voffset,
1332 uintptr_t *minaddrp,
1333 size_t len,
1334 size_t *execsz,
1335 size_t *brksize)
1337 Phdr *phdr;
1338 int error, page, prot;
1339 caddr_t addr = NULL;
1340 caddr_t minaddr = (caddr_t)UINTPTR_MAX;
1341 uint_t i;
1342 size_t zfodsz, memsz;
1343 boolean_t ptload = B_FALSE;
1344 off_t offset;
1345 const uint_t hsize = ehdr->e_phentsize;
1346 extern int use_brk_lpg;
1348 if (ehdr->e_type == ET_DYN) {
1349 secflagset_t flags = 0;
1351 * Obtain the virtual address of a hole in the
1352 * address space to map the "interpreter".
1354 if (secflag_enabled(curproc, PROC_SEC_ASLR))
1355 flags |= _MAP_RANDOMIZE;
1357 map_addr(&addr, len, (offset_t)0, 1, flags);
1358 if (addr == NULL)
1359 return (ENOMEM);
1360 *voffset = (intptr_t)addr;
1363 * Calculate the minimum vaddr so it can be subtracted out.
1364 * According to the ELF specification, since PT_LOAD sections
1365 * must be sorted by increasing p_vaddr values, this is
1366 * guaranteed to be the first PT_LOAD section.
1368 phdr = (Phdr *)phdrbase;
1369 for (i = nphdrs; i > 0; i--) {
1370 if (phdr->p_type == PT_LOAD) {
1371 *voffset -= (uintptr_t)phdr->p_vaddr;
1372 break;
1374 phdr = (Phdr *)((caddr_t)phdr + hsize);
1377 } else {
1378 *voffset = 0;
1381 phdr = (Phdr *)phdrbase;
1382 for (i = nphdrs; i > 0; i--) {
1383 switch (phdr->p_type) {
1384 case PT_LOAD:
1385 ptload = B_TRUE;
1386 prot = PROT_USER;
1387 if (phdr->p_flags & PF_R)
1388 prot |= PROT_READ;
1389 if (phdr->p_flags & PF_W)
1390 prot |= PROT_WRITE;
1391 if (phdr->p_flags & PF_X)
1392 prot |= PROT_EXEC;
1394 addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
1396 if (*intphdr != NULL && uphdr != NULL &&
1397 *uphdr == NULL) {
1399 * The PT_PHDR program header is, strictly
1400 * speaking, optional. If we find that this
1401 * is missing, we will determine the location
1402 * of the program headers based on the address
1403 * of the lowest PT_LOAD segment (namely, this
1404 * one): we subtract the p_offset to get to
1405 * the ELF header and then add back the program
1406 * header offset to get to the program headers.
1407 * We then cons up a Phdr that corresponds to
1408 * the (missing) PT_PHDR, setting the flags
1409 * to 0 to denote that this is artificial and
1410 * should (must) be freed by the caller.
1412 Phdr *cons;
1414 cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
1416 cons->p_flags = 0;
1417 cons->p_type = PT_PHDR;
1418 cons->p_vaddr = ((uintptr_t)addr -
1419 phdr->p_offset) + ehdr->e_phoff;
1421 *uphdr = cons;
1425 * The ELF spec dictates that p_filesz may not be
1426 * larger than p_memsz in PT_LOAD segments.
1428 if (phdr->p_filesz > phdr->p_memsz) {
1429 error = EINVAL;
1430 goto bad;
1434 * Keep track of the segment with the lowest starting
1435 * address.
1437 if (addr < minaddr)
1438 minaddr = addr;
1440 zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
1442 offset = phdr->p_offset;
1443 if (((uintptr_t)offset & PAGEOFFSET) ==
1444 ((uintptr_t)addr & PAGEOFFSET) &&
1445 (!(vp->v_flag & VNOMAP))) {
1446 page = 1;
1447 } else {
1448 page = 0;
1452 * Set the heap pagesize for OOB when the bss size
1453 * is known and use_brk_lpg is not 0.
1455 if (brksize != NULL && use_brk_lpg &&
1456 zfodsz != 0 && phdr == dataphdrp &&
1457 (prot & PROT_WRITE)) {
1458 const size_t tlen = P2NPHASE((uintptr_t)addr +
1459 phdr->p_filesz, PAGESIZE);
1461 if (zfodsz > tlen) {
1462 const caddr_t taddr = addr +
1463 phdr->p_filesz + tlen;
1466 * Since a hole in the AS large enough
1467 * for this object as calculated by
1468 * elfsize() is available, we do not
1469 * need to fear overflow for 'taddr'.
1471 curproc->p_brkpageszc =
1472 page_szc(map_pgsz(MAPPGSZ_HEAP,
1473 curproc, taddr, zfodsz - tlen, 0));
1477 if (curproc->p_brkpageszc != 0 && phdr == dataphdrp &&
1478 (prot & PROT_WRITE)) {
1479 uint_t szc = curproc->p_brkpageszc;
1480 size_t pgsz = page_get_pagesize(szc);
1481 caddr_t ebss = addr + phdr->p_memsz;
1483 * If we need extra space to keep the BSS an
1484 * integral number of pages in size, some of
1485 * that space may fall beyond p_brkbase, so we
1486 * need to set p_brksize to account for it
1487 * being (logically) part of the brk.
1489 size_t extra_zfodsz;
1491 ASSERT(pgsz > PAGESIZE);
1493 extra_zfodsz = P2NPHASE((uintptr_t)ebss, pgsz);
1495 if (error = execmap(vp, addr, phdr->p_filesz,
1496 zfodsz + extra_zfodsz, phdr->p_offset,
1497 prot, page, szc))
1498 goto bad;
1499 if (brksize != NULL)
1500 *brksize = extra_zfodsz;
1501 } else {
1502 if (error = execmap(vp, addr, phdr->p_filesz,
1503 zfodsz, phdr->p_offset, prot, page, 0))
1504 goto bad;
1507 if (bssbase != NULL && addr >= *bssbase &&
1508 phdr == dataphdrp) {
1509 *bssbase = addr + phdr->p_filesz;
1511 if (brkbase != NULL && addr >= *brkbase) {
1512 *brkbase = addr + phdr->p_memsz;
1515 memsz = btopr(phdr->p_memsz);
1516 if ((*execsz + memsz) < *execsz) {
1517 error = ENOMEM;
1518 goto bad;
1520 *execsz += memsz;
1521 break;
1523 case PT_INTERP:
1524 if (ptload)
1525 goto bad;
1526 *intphdr = phdr;
1527 break;
1529 case PT_SHLIB:
1530 *stphdr = phdr;
1531 break;
1533 case PT_PHDR:
1534 if (ptload || phdr->p_flags == 0)
1535 goto bad;
1537 if (uphdr != NULL)
1538 *uphdr = phdr;
1540 break;
1542 case PT_NULL:
1543 case PT_DYNAMIC:
1544 case PT_NOTE:
1545 break;
1547 case PT_SUNWDTRACE:
1548 if (dtphdr != NULL)
1549 *dtphdr = phdr;
1550 break;
1552 default:
1553 break;
1555 phdr = (Phdr *)((caddr_t)phdr + hsize);
1558 if (minaddrp != NULL) {
1559 ASSERT(minaddr != (caddr_t)UINTPTR_MAX);
1560 *minaddrp = (uintptr_t)minaddr;
1563 if (brkbase != NULL && secflag_enabled(curproc, PROC_SEC_ASLR)) {
1564 size_t off;
1565 uintptr_t base = (uintptr_t)*brkbase;
1566 uintptr_t oend = base + *brksize;
1568 ASSERT(ISP2(aslr_max_brk_skew));
1570 (void) random_get_pseudo_bytes((uint8_t *)&off, sizeof (off));
1571 base += P2PHASE(off, aslr_max_brk_skew);
1572 base = P2ROUNDUP(base, PAGESIZE);
1573 *brkbase = (caddr_t)base;
1575 * Above, we set *brksize to account for the possibility we
1576 * had to grow the 'brk' in padding out the BSS to a page
1577 * boundary.
1579 * We now need to adjust that based on where we now are
1580 * actually putting the brk.
1582 if (oend > base)
1583 *brksize = oend - base;
1584 else
1585 *brksize = 0;
1588 return (0);
1589 bad:
1590 if (error == 0)
1591 error = EINVAL;
1592 return (error);
1596 elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc,
1597 rlim64_t rlimit, cred_t *credp)
1599 Note note;
1600 int error;
1602 bzero(&note, sizeof (note));
1603 bcopy("CORE", note.name, 4);
1604 note.nhdr.n_type = type;
1606 * The System V ABI states that n_namesz must be the length of the
1607 * string that follows the Nhdr structure including the terminating
1608 * null. The ABI also specifies that sufficient padding should be
1609 * included so that the description that follows the name string
1610 * begins on a 4- or 8-byte boundary for 32- and 64-bit binaries
1611 * respectively. However, since this change was not made correctly
1612 * at the time of the 64-bit port, both 32- and 64-bit binaries
1613 * descriptions are only guaranteed to begin on a 4-byte boundary.
1615 note.nhdr.n_namesz = 5;
1616 note.nhdr.n_descsz = roundup(descsz, sizeof (Word));
1618 if (error = core_write(vp, UIO_SYSSPACE, *offsetp, &note,
1619 sizeof (note), rlimit, credp))
1620 return (error);
1622 *offsetp += sizeof (note);
1624 if (error = core_write(vp, UIO_SYSSPACE, *offsetp, desc,
1625 note.nhdr.n_descsz, rlimit, credp))
1626 return (error);
1628 *offsetp += note.nhdr.n_descsz;
1629 return (0);
/*
 * Copy the section data from one vnode to the section of another vnode.
 *
 * On any read/write failure or offset overflow, the destination shdr's
 * sh_size and sh_offset are zeroed so the section appears empty in the
 * core file rather than failing the entire dump.
 */
static void
elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst)
{
	size_t n = src->sh_size;
	u_offset_t off = 0;
	const u_offset_t soff = src->sh_offset;
	const u_offset_t doff = ctx->ecc_doffset;
	void *buf = ctx->ecc_buf;
	vnode_t *dst_vp = ctx->ecc_vp;
	cred_t *credp = ctx->ecc_credp;

	/* Protect the copy loop below from overflow on the offsets */
	if (n > OFF_MAX || (n + soff) > OFF_MAX || (n + doff) > OFF_MAX ||
	    (n + soff) < n || (n + doff) < n) {
		dst->sh_size = 0;
		dst->sh_offset = 0;
		return;
	}

	/* Copy through the scratch buffer, ecc_bufsz bytes at a time. */
	while (n != 0) {
		const size_t len = MIN(ctx->ecc_bufsz, n);
		ssize_t resid;

		/*
		 * A negative resid is also caught by 'resid >= len': the
		 * comparison converts resid to size_t, making any negative
		 * value enormous.  The explicit 'resid < 0' check is kept
		 * for clarity.
		 */
		if (vn_rdwr(UIO_READ, src_vp, buf, (ssize_t)len,
		    (offset_t)(soff + off),
		    UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid) != 0 ||
		    resid >= len || resid < 0 ||
		    core_write(dst_vp, UIO_SYSSPACE, (offset_t)(doff + off),
		    buf, len - resid, ctx->ecc_rlimit, credp) != 0) {
			dst->sh_size = 0;
			dst->sh_offset = 0;
			return;
		}

		ASSERT(n >= len - resid);

		n -= len - resid;
		off += len - resid;
	}

	/* Advance the core file output position past the copied section. */
	ctx->ecc_doffset += src->sh_size;
}
1679 * Walk sections for a given ELF object, counting (or copying) those of
1680 * interest (CTF, symtab, strtab, .debug_*).
1682 static int
1683 elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr,
1684 Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab, uint_t *countp)
1686 Ehdr ehdr;
1687 const core_content_t content = ctx->ecc_content;
1688 cred_t *credp = ctx->ecc_credp;
1689 Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL;
1690 uintptr_t off = 0;
1691 uint_t nshdrs, shstrndx, nphdrs, count = 0;
1692 u_offset_t *doffp = &ctx->ecc_doffset;
1693 boolean_t ctf_link = B_FALSE;
1694 caddr_t shbase;
1695 size_t shsize, shstrsize;
1696 char *shstrbase;
1697 int error = 0;
1698 const boolean_t justcounting = v == NULL;
1700 *countp = 0;
1702 if ((content &
1703 (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG)) == 0) {
1704 return (0);
1707 if (getelfhead(mvp, credp, &ehdr, &nshdrs, &shstrndx, &nphdrs) != 0 ||
1708 getelfshdr(mvp, credp, &ehdr, nshdrs, shstrndx, &shbase, &shsize,
1709 &shstrbase, &shstrsize) != 0) {
1710 return (0);
1713 /* Starting at index 1 skips SHT_NULL which is expected at index 0 */
1714 off = ehdr.e_shentsize;
1715 for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) {
1716 Shdr *shdr, *symchk = NULL, *strchk;
1717 const char *name;
1719 shdr = (Shdr *)(shbase + off);
1720 if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL)
1721 continue;
1723 name = shstrbase + shdr->sh_name;
1725 if (ctf == NULL &&
1726 (content & CC_CONTENT_CTF) != 0 &&
1727 strcmp(name, shstrtab_data[STR_CTF]) == 0) {
1728 ctf = shdr;
1729 if (ctf->sh_link != 0 && ctf->sh_link < nshdrs) {
1730 /* check linked symtab below */
1731 symchk = (Shdr *)(shbase +
1732 shdr->sh_link * ehdr.e_shentsize);
1733 ctf_link = B_TRUE;
1734 } else {
1735 continue;
1737 } else if (symtab == NULL &&
1738 (content & CC_CONTENT_SYMTAB) != 0 &&
1739 strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) {
1740 symchk = shdr;
1741 } else if ((content & CC_CONTENT_DEBUG) != 0 &&
1742 strncmp(name, ".debug_", strlen(".debug_")) == 0) {
1744 * The design of the above check is intentional. In
1745 * particular, we want to capture any sections that
1746 * begin with '.debug_' for a few reasons:
1748 * 1) Various revisions to the DWARF spec end up
1749 * changing the set of section headers that exist. This
1750 * ensures that we don't need to change the kernel to
1751 * get a new version.
1753 * 2) Other software uses .debug_ sections for things
1754 * which aren't DWARF. This allows them to be captured
1755 * as well.
1757 count++;
1759 if (!justcounting) {
1760 if (count > remain) {
1761 error = ENOMEM;
1762 goto done;
1765 elf_ctx_resize_scratch(ctx, shdr->sh_size);
1767 if (!shstrtab_ndx(shstrtab,
1768 name, &v[idx].sh_name)) {
1769 error = ENOMEM;
1770 goto done;
1773 v[idx].sh_addr = (Addr)(uintptr_t)saddr;
1774 v[idx].sh_type = shdr->sh_type;
1775 v[idx].sh_addralign = shdr->sh_addralign;
1776 *doffp = roundup(*doffp, v[idx].sh_addralign);
1777 v[idx].sh_offset = *doffp;
1778 v[idx].sh_size = shdr->sh_size;
1779 v[idx].sh_link = 0;
1780 v[idx].sh_entsize = shdr->sh_entsize;
1781 v[idx].sh_info = shdr->sh_info;
1783 elf_copy_scn(ctx, shdr, mvp, &v[idx]);
1784 idx++;
1787 continue;
1788 } else {
1789 continue;
1792 ASSERT(symchk != NULL);
1793 if ((symchk->sh_type != SHT_DYNSYM &&
1794 symchk->sh_type != SHT_SYMTAB) ||
1795 symchk->sh_link == 0 || symchk->sh_link >= nshdrs) {
1796 ctf_link = B_FALSE;
1797 continue;
1799 strchk = (Shdr *)(shbase + symchk->sh_link * ehdr.e_shentsize);
1800 if (strchk->sh_type != SHT_STRTAB) {
1801 ctf_link = B_FALSE;
1802 continue;
1804 symtab = symchk;
1805 strtab = strchk;
1807 if (symtab != NULL && ctf != NULL &&
1808 (content & CC_CONTENT_DEBUG) == 0) {
1809 /* No other shdrs are of interest at this point */
1810 break;
1814 if (ctf != NULL)
1815 count += 1;
1816 if (symtab != NULL)
1817 count += 2;
1819 if (count > remain) {
1820 count = remain;
1821 if (!justcounting)
1822 error = ENOMEM;
1823 goto done;
1826 if (justcounting)
1827 goto done;
1829 /* output CTF section */
1830 if (ctf != NULL) {
1831 elf_ctx_resize_scratch(ctx, ctf->sh_size);
1833 if (!shstrtab_ndx(shstrtab,
1834 shstrtab_data[STR_CTF], &v[idx].sh_name)) {
1835 error = ENOMEM;
1836 goto done;
1838 v[idx].sh_addr = (Addr)(uintptr_t)saddr;
1839 v[idx].sh_type = SHT_PROGBITS;
1840 v[idx].sh_addralign = 4;
1841 *doffp = roundup(*doffp, v[idx].sh_addralign);
1842 v[idx].sh_offset = *doffp;
1843 v[idx].sh_size = ctf->sh_size;
1845 if (ctf_link) {
1847 * The linked symtab (and strtab) will be output
1848 * immediately after this CTF section. Its shdr index
1849 * directly follows this one.
1851 v[idx].sh_link = idx + 1;
1852 ASSERT(symtab != NULL);
1853 } else {
1854 v[idx].sh_link = 0;
1856 elf_copy_scn(ctx, ctf, mvp, &v[idx]);
1857 idx++;
1860 /* output SYMTAB/STRTAB sections */
1861 if (symtab != NULL) {
1862 shstrtype_t symtab_type, strtab_type;
1863 uint_t symtab_name, strtab_name;
1865 elf_ctx_resize_scratch(ctx,
1866 MAX(symtab->sh_size, strtab->sh_size));
1868 if (symtab->sh_type == SHT_DYNSYM) {
1869 symtab_type = STR_DYNSYM;
1870 strtab_type = STR_DYNSTR;
1871 } else {
1872 symtab_type = STR_SYMTAB;
1873 strtab_type = STR_STRTAB;
1876 if (!shstrtab_ndx(shstrtab,
1877 shstrtab_data[symtab_type], &symtab_name)) {
1878 error = ENOMEM;
1879 goto done;
1881 if (!shstrtab_ndx(shstrtab,
1882 shstrtab_data[strtab_type], &strtab_name)) {
1883 error = ENOMEM;
1884 goto done;
1887 v[idx].sh_name = symtab_name;
1888 v[idx].sh_type = symtab->sh_type;
1889 v[idx].sh_addr = symtab->sh_addr;
1890 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
1891 v[idx].sh_addr += (Addr)(uintptr_t)saddr;
1892 v[idx].sh_addralign = symtab->sh_addralign;
1893 *doffp = roundup(*doffp, v[idx].sh_addralign);
1894 v[idx].sh_offset = *doffp;
1895 v[idx].sh_size = symtab->sh_size;
1896 v[idx].sh_link = idx + 1;
1897 v[idx].sh_entsize = symtab->sh_entsize;
1898 v[idx].sh_info = symtab->sh_info;
1900 elf_copy_scn(ctx, symtab, mvp, &v[idx]);
1901 idx++;
1903 v[idx].sh_name = strtab_name;
1904 v[idx].sh_type = SHT_STRTAB;
1905 v[idx].sh_flags = SHF_STRINGS;
1906 v[idx].sh_addr = strtab->sh_addr;
1907 if (ehdr.e_type == ET_DYN || v[idx].sh_addr == 0)
1908 v[idx].sh_addr += (Addr)(uintptr_t)saddr;
1909 v[idx].sh_addralign = strtab->sh_addralign;
1910 *doffp = roundup(*doffp, v[idx].sh_addralign);
1911 v[idx].sh_offset = *doffp;
1912 v[idx].sh_size = strtab->sh_size;
1914 elf_copy_scn(ctx, strtab, mvp, &v[idx]);
1915 idx++;
1918 done:
1919 kmem_free(shstrbase, shstrsize);
1920 kmem_free(shbase, shsize);
1922 if (error == 0)
1923 *countp = count;
1925 return (error);
/*
 * Walk mappings in process address space, examining those which correspond to
 * loaded objects. It is called twice from elfcore: Once to simply count
 * relevant sections, and again later to copy those sections once an adequate
 * buffer has been allocated for the shdr details.
 *
 * Counting pass: v == NULL, nv == 0; the required shdr count (including the
 * trailing shstrtab entry, excluding only the reserved index 0 when nothing
 * was found) is returned via *nshdrsp.  Output pass: v/nv describe the
 * caller-allocated shdr array; the shstrtab section is emitted last and its
 * data written to the core file.
 */
static int
elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp)
{
	vnode_t *lastvp = NULL;
	struct seg *seg;
	uint_t idx = 0, remain;
	shstrtab_t shstrtab;
	struct as *as = ctx->ecc_p->p_as;
	int error = 0;

	ASSERT(AS_WRITE_HELD(as));

	if (v != NULL) {
		ASSERT(nv != 0);

		if (!shstrtab_init(&shstrtab))
			return (ENOMEM);
		remain = nv;
	} else {
		ASSERT(nv == 0);

		/*
		 * The shdrs are being counted, rather than outputting them
		 * into a buffer. Leave room for two entries: the SHT_NULL at
		 * index 0 and the shstrtab at the end.
		 */
		remain = UINT_MAX - 2;
	}

	/* Per the ELF spec, shdr index 0 is reserved. */
	idx = 1;
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		vnode_t *mvp;
		void *tmp = NULL;
		caddr_t saddr = seg->s_base, naddr, eaddr;
		size_t segsize;
		uint_t count, prot;

		/*
		 * Since we're just looking for text segments of load
		 * objects, we only care about the protection bits; we don't
		 * care about the actual size of the segment so we use the
		 * reserved size. If the segment's size is zero, there's
		 * something fishy going on so we ignore this segment.
		 * Comparing against lastvp skips consecutive mappings of
		 * the same object.
		 */
		if (seg->s_ops != &segvn_ops ||
		    SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
		    mvp == lastvp || mvp == NULL || mvp->v_type != VREG ||
		    (segsize = pr_getsegsize(seg, 1)) == 0)
			continue;

		eaddr = saddr + segsize;
		prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
		pr_getprot_done(&tmp);

		/*
		 * Skip this segment unless the protection bits look like
		 * what we'd expect for a text segment.
		 */
		if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC)
			continue;

		error = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain,
		    &shstrtab, &count);
		if (error != 0)
			goto done;

		ASSERT(count <= remain);
		ASSERT(v == NULL || (idx + count) < nv);

		remain -= count;
		idx += count;
		lastvp = mvp;
	}

	if (v == NULL) {
		if (idx == 1) {
			*nshdrsp = 0;
		} else {
			/* Include room for the shrstrtab at the end */
			*nshdrsp = idx + 1;
		}
		return (0);
	}

	/*
	 * In the output pass, exactly one slot (the trailing shstrtab)
	 * must remain; anything else means the address space changed
	 * between the counting and output passes.
	 */
	if (idx != nv - 1) {
		cmn_err(CE_WARN, "elfcore: core dump failed for "
		    "process %d; address space is changing",
		    ctx->ecc_p->p_pid);
		error = EIO;
		goto done;
	}

	if (!shstrtab_ndx(&shstrtab, shstrtab_data[STR_SHSTRTAB],
	    &v[idx].sh_name)) {
		error = ENOMEM;
		goto done;
	}
	v[idx].sh_size = shstrtab_size(&shstrtab);
	v[idx].sh_addralign = 1;
	v[idx].sh_offset = ctx->ecc_doffset;
	v[idx].sh_flags = SHF_STRINGS;
	v[idx].sh_type = SHT_STRTAB;

	/* Serialize the string table through the scratch buffer. */
	elf_ctx_resize_scratch(ctx, v[idx].sh_size);
	VERIFY3U(ctx->ecc_bufsz, >=, v[idx].sh_size);
	shstrtab_dump(&shstrtab, ctx->ecc_buf);

	error = core_write(ctx->ecc_vp, UIO_SYSSPACE, ctx->ecc_doffset,
	    ctx->ecc_buf, v[idx].sh_size, ctx->ecc_rlimit, ctx->ecc_credp);
	if (error == 0) {
		ctx->ecc_doffset += v[idx].sh_size;
	}

done:
	if (v != NULL)
		shstrtab_fini(&shstrtab);

	return (error);
}
2056 elfcore(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig,
2057 core_content_t content)
2059 u_offset_t poffset, soffset, doffset;
2060 int error;
2061 uint_t i, nphdrs, nshdrs;
2062 struct seg *seg;
2063 struct as *as = p->p_as;
2064 void *bigwad, *zeropg = NULL;
2065 size_t bigsize, phdrsz, shdrsz;
2066 Ehdr *ehdr;
2067 Phdr *phdr;
2068 Shdr shdr0;
2069 caddr_t brkbase, stkbase;
2070 size_t brksize, stksize;
2071 boolean_t overflowed = B_FALSE, retried = B_FALSE;
2072 klwp_t *lwp = ttolwp(curthread);
2073 elf_core_ctx_t ctx = {
2074 .ecc_vp = vp,
2075 .ecc_p = p,
2076 .ecc_credp = credp,
2077 .ecc_rlimit = rlimit,
2078 .ecc_content = content,
2079 .ecc_doffset = 0,
2080 .ecc_buf = NULL,
2081 .ecc_bufsz = 0
2084 top:
2086 * Make sure we have everything we need (registers, etc.).
2087 * All other lwps have already stopped and are in an orderly state.
2089 ASSERT(p == ttoproc(curthread));
2090 prstop(0, 0);
2092 AS_LOCK_ENTER(as, RW_WRITER);
2093 nphdrs = prnsegs(as, 0) + 2; /* two CORE note sections */
2096 * Count the number of section headers we're going to need.
2098 nshdrs = 0;
2099 if (content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG)) {
2100 VERIFY0(elf_process_scns(&ctx, NULL, 0, &nshdrs));
2102 AS_LOCK_EXIT(as);
2105 * The core file contents may require zero section headers, but if
2106 * we overflow the 16 bits allotted to the program header count in
2107 * the ELF header, we'll need that program header at index zero.
2109 if (nshdrs == 0 && nphdrs >= PN_XNUM)
2110 nshdrs = 1;
2113 * Allocate a buffer which is sized adequately to hold the ehdr, phdrs
2114 * or shdrs needed to produce the core file. It is used for the three
2115 * tasks sequentially, not simultaneously, so it does not need space
2116 * for all three data at once, only the largest one.
2118 VERIFY(nphdrs >= 2);
2119 phdrsz = nphdrs * sizeof (Phdr);
2120 shdrsz = nshdrs * sizeof (Shdr);
2121 bigsize = MAX(sizeof (Ehdr), MAX(phdrsz, shdrsz));
2122 bigwad = kmem_alloc(bigsize, KM_SLEEP);
2124 ehdr = (Ehdr *)bigwad;
2125 bzero(ehdr, sizeof (*ehdr));
2127 ehdr->e_ident[EI_MAG0] = ELFMAG0;
2128 ehdr->e_ident[EI_MAG1] = ELFMAG1;
2129 ehdr->e_ident[EI_MAG2] = ELFMAG2;
2130 ehdr->e_ident[EI_MAG3] = ELFMAG3;
2131 ehdr->e_ident[EI_CLASS] = ELFCLASS;
2132 ehdr->e_type = ET_CORE;
2134 #if !defined(_LP64) || defined(_ELF32_COMPAT)
2136 #if defined(__sparc)
2137 ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2138 ehdr->e_machine = EM_SPARC;
2139 #elif defined(__i386_COMPAT)
2140 ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2141 ehdr->e_machine = EM_386;
2142 #else
2143 #error "no recognized machine type is defined"
2144 #endif
2146 #else /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2148 #if defined(__sparc)
2149 ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
2150 ehdr->e_machine = EM_SPARCV9;
2151 #elif defined(__amd64)
2152 ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
2153 ehdr->e_machine = EM_AMD64;
2154 #else
2155 #error "no recognized 64-bit machine type is defined"
2156 #endif
2158 #endif /* !defined(_LP64) || defined(_ELF32_COMPAT) */
2160 poffset = sizeof (Ehdr);
2161 soffset = sizeof (Ehdr) + phdrsz;
2162 doffset = sizeof (Ehdr) + phdrsz + shdrsz;
2163 bzero(&shdr0, sizeof (shdr0));
2166 * If the count of program headers or section headers or the index
2167 * of the section string table can't fit in the mere 16 bits
2168 * shortsightedly allotted to them in the ELF header, we use the
2169 * extended formats and put the real values in the section header
2170 * as index 0.
2172 if (nphdrs >= PN_XNUM) {
2173 ehdr->e_phnum = PN_XNUM;
2174 shdr0.sh_info = nphdrs;
2175 } else {
2176 ehdr->e_phnum = (unsigned short)nphdrs;
2179 if (nshdrs > 0) {
2180 if (nshdrs >= SHN_LORESERVE) {
2181 ehdr->e_shnum = 0;
2182 shdr0.sh_size = nshdrs;
2183 } else {
2184 ehdr->e_shnum = (unsigned short)nshdrs;
2187 if (nshdrs - 1 >= SHN_LORESERVE) {
2188 ehdr->e_shstrndx = SHN_XINDEX;
2189 shdr0.sh_link = nshdrs - 1;
2190 } else {
2191 ehdr->e_shstrndx = (unsigned short)(nshdrs - 1);
2194 ehdr->e_shoff = soffset;
2195 ehdr->e_shentsize = sizeof (Shdr);
2198 ehdr->e_ident[EI_VERSION] = EV_CURRENT;
2199 ehdr->e_version = EV_CURRENT;
2200 ehdr->e_ehsize = sizeof (Ehdr);
2201 ehdr->e_phoff = poffset;
2202 ehdr->e_phentsize = sizeof (Phdr);
2204 if (error = core_write(vp, UIO_SYSSPACE, (offset_t)0, ehdr,
2205 sizeof (Ehdr), rlimit, credp)) {
2206 goto done;
2209 phdr = (Phdr *)bigwad;
2210 bzero(phdr, phdrsz);
2212 setup_old_note_header(&phdr[0], p);
2213 phdr[0].p_offset = doffset = roundup(doffset, sizeof (Word));
2214 doffset += phdr[0].p_filesz;
2216 setup_note_header(&phdr[1], p);
2217 phdr[1].p_offset = doffset = roundup(doffset, sizeof (Word));
2218 doffset += phdr[1].p_filesz;
2220 mutex_enter(&p->p_lock);
2222 brkbase = p->p_brkbase;
2223 brksize = p->p_brksize;
2225 stkbase = p->p_usrstack - p->p_stksize;
2226 stksize = p->p_stksize;
2228 mutex_exit(&p->p_lock);
2230 AS_LOCK_ENTER(as, RW_WRITER);
2231 i = 2;
2232 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2233 caddr_t eaddr = seg->s_base + pr_getsegsize(seg, 0);
2234 caddr_t saddr, naddr;
2235 void *tmp = NULL;
2236 extern struct seg_ops segspt_shmops;
2238 if ((seg->s_flags & S_HOLE) != 0) {
2239 continue;
2242 for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
2243 uint_t prot;
2244 size_t size;
2245 int type;
2246 vnode_t *mvp;
2248 prot = pr_getprot(seg, 0, &tmp, &saddr, &naddr, eaddr);
2249 prot &= PROT_READ | PROT_WRITE | PROT_EXEC;
2250 if ((size = (size_t)(naddr - saddr)) == 0) {
2251 ASSERT(tmp == NULL);
2252 continue;
2253 } else if (i == nphdrs) {
2254 pr_getprot_done(&tmp);
2255 overflowed = B_TRUE;
2256 break;
2258 phdr[i].p_type = PT_LOAD;
2259 phdr[i].p_vaddr = (Addr)(uintptr_t)saddr;
2260 phdr[i].p_memsz = size;
2261 if (prot & PROT_READ)
2262 phdr[i].p_flags |= PF_R;
2263 if (prot & PROT_WRITE)
2264 phdr[i].p_flags |= PF_W;
2265 if (prot & PROT_EXEC)
2266 phdr[i].p_flags |= PF_X;
2269 * Figure out which mappings to include in the core.
2271 type = SEGOP_GETTYPE(seg, saddr);
2273 if (saddr == stkbase && size == stksize) {
2274 if (!(content & CC_CONTENT_STACK))
2275 goto exclude;
2277 } else if (saddr == brkbase && size == brksize) {
2278 if (!(content & CC_CONTENT_HEAP))
2279 goto exclude;
2281 } else if (seg->s_ops == &segspt_shmops) {
2282 if (type & MAP_NORESERVE) {
2283 if (!(content & CC_CONTENT_DISM))
2284 goto exclude;
2285 } else {
2286 if (!(content & CC_CONTENT_ISM))
2287 goto exclude;
2290 } else if (seg->s_ops != &segvn_ops) {
2291 goto exclude;
2293 } else if (type & MAP_SHARED) {
2294 if (shmgetid(p, saddr) != SHMID_NONE) {
2295 if (!(content & CC_CONTENT_SHM))
2296 goto exclude;
2298 } else if (SEGOP_GETVP(seg, seg->s_base,
2299 &mvp) != 0 || mvp == NULL ||
2300 mvp->v_type != VREG) {
2301 if (!(content & CC_CONTENT_SHANON))
2302 goto exclude;
2304 } else {
2305 if (!(content & CC_CONTENT_SHFILE))
2306 goto exclude;
2309 } else if (SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 ||
2310 mvp == NULL || mvp->v_type != VREG) {
2311 if (!(content & CC_CONTENT_ANON))
2312 goto exclude;
2314 } else if (prot == (PROT_READ | PROT_EXEC)) {
2315 if (!(content & CC_CONTENT_TEXT))
2316 goto exclude;
2318 } else if (prot == PROT_READ) {
2319 if (!(content & CC_CONTENT_RODATA))
2320 goto exclude;
2322 } else {
2323 if (!(content & CC_CONTENT_DATA))
2324 goto exclude;
2327 doffset = roundup(doffset, sizeof (Word));
2328 phdr[i].p_offset = doffset;
2329 phdr[i].p_filesz = size;
2330 doffset += size;
2331 exclude:
2332 i++;
2334 VERIFY(tmp == NULL);
2335 if (overflowed)
2336 break;
2338 AS_LOCK_EXIT(as);
2340 if (overflowed || i != nphdrs) {
2341 if (!retried) {
2342 retried = B_TRUE;
2343 overflowed = B_FALSE;
2344 kmem_free(bigwad, bigsize);
2345 goto top;
2347 cmn_err(CE_WARN, "elfcore: core dump failed for "
2348 "process %d; address space is changing", p->p_pid);
2349 error = EIO;
2350 goto done;
2353 if ((error = core_write(vp, UIO_SYSSPACE, poffset,
2354 phdr, phdrsz, rlimit, credp)) != 0) {
2355 goto done;
2358 if ((error = write_old_elfnotes(p, sig, vp, phdr[0].p_offset, rlimit,
2359 credp)) != 0) {
2360 goto done;
2362 if ((error = write_elfnotes(p, sig, vp, phdr[1].p_offset, rlimit,
2363 credp, content)) != 0) {
2364 goto done;
2367 for (i = 2; i < nphdrs; i++) {
2368 prkillinfo_t killinfo;
2369 sigqueue_t *sq;
2370 int sig, j;
2372 if (phdr[i].p_filesz == 0)
2373 continue;
2376 * If we hit a region that was mapped PROT_NONE then we cannot
2377 * continue dumping this normally as the kernel would be unable
2378 * to read from the page and that would result in us failing to
2379 * dump the page. As such, any region mapped PROT_NONE, we dump
2380 * as a zero-filled page such that this is still represented in
2381 * the map.
2383 * If dumping out this segment fails, rather than failing
2384 * the core dump entirely, we reset the size of the mapping
2385 * to zero to indicate that the data is absent from the core
2386 * file and or in the PF_SUNW_FAILURE flag to differentiate
2387 * this from mappings that were excluded due to the core file
2388 * content settings.
2390 if ((phdr[i].p_flags & (PF_R | PF_W | PF_X)) == 0) {
2391 size_t towrite = phdr[i].p_filesz;
2392 size_t curoff = 0;
2394 if (zeropg == NULL) {
2395 zeropg = kmem_zalloc(elf_zeropg_sz, KM_SLEEP);
2398 error = 0;
2399 while (towrite != 0) {
2400 size_t len = MIN(towrite, elf_zeropg_sz);
2402 error = core_write(vp, UIO_SYSSPACE,
2403 phdr[i].p_offset + curoff, zeropg, len,
2404 rlimit, credp);
2405 if (error != 0)
2406 break;
2408 towrite -= len;
2409 curoff += len;
2411 } else {
2412 error = core_seg(p, vp, phdr[i].p_offset,
2413 (caddr_t)(uintptr_t)phdr[i].p_vaddr,
2414 phdr[i].p_filesz, rlimit, credp);
2416 if (error == 0)
2417 continue;
2419 if ((sig = lwp->lwp_cursig) == 0) {
2421 * We failed due to something other than a signal.
2422 * Since the space reserved for the segment is now
2423 * unused, we stash the errno in the first four
2424 * bytes. This undocumented interface will let us
2425 * understand the nature of the failure.
2427 (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2428 &error, sizeof (error), rlimit, credp);
2430 phdr[i].p_filesz = 0;
2431 phdr[i].p_flags |= PF_SUNW_FAILURE;
2432 if ((error = core_write(vp, UIO_SYSSPACE,
2433 poffset + sizeof (Phdr) * i, &phdr[i],
2434 sizeof (Phdr), rlimit, credp)) != 0)
2435 goto done;
2437 continue;
2441 * We took a signal. We want to abort the dump entirely, but
2442 * we also want to indicate what failed and why. We therefore
2443 * use the space reserved for the first failing segment to
2444 * write our error (which, for purposes of compatability with
2445 * older core dump readers, we set to EINTR) followed by any
2446 * siginfo associated with the signal.
2448 bzero(&killinfo, sizeof (killinfo));
2449 killinfo.prk_error = EINTR;
2451 sq = sig == SIGKILL ? curproc->p_killsqp : lwp->lwp_curinfo;
2453 if (sq != NULL) {
2454 bcopy(&sq->sq_info, &killinfo.prk_info,
2455 sizeof (sq->sq_info));
2456 } else {
2457 killinfo.prk_info.si_signo = lwp->lwp_cursig;
2458 killinfo.prk_info.si_code = SI_NOINFO;
2461 #if (defined(_SYSCALL32_IMPL) || defined(_LP64))
2463 * If this is a 32-bit process, we need to translate from the
2464 * native siginfo to the 32-bit variant. (Core readers must
2465 * always have the same data model as their target or must
2466 * be aware of -- and compensate for -- data model differences.)
2468 if (curproc->p_model == DATAMODEL_ILP32) {
2469 siginfo32_t si32;
2471 siginfo_kto32((k_siginfo_t *)&killinfo.prk_info, &si32);
2472 bcopy(&si32, &killinfo.prk_info, sizeof (si32));
2474 #endif
2476 (void) core_write(vp, UIO_SYSSPACE, phdr[i].p_offset,
2477 &killinfo, sizeof (killinfo), rlimit, credp);
2480 * For the segment on which we took the signal, indicate that
2481 * its data now refers to a siginfo.
2483 phdr[i].p_filesz = 0;
2484 phdr[i].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED |
2485 PF_SUNW_SIGINFO;
2488 * And for every other segment, indicate that its absence
2489 * is due to a signal.
2491 for (j = i + 1; j < nphdrs; j++) {
2492 phdr[j].p_filesz = 0;
2493 phdr[j].p_flags |= PF_SUNW_FAILURE | PF_SUNW_KILLED;
2497 * Finally, write out our modified program headers.
2499 if ((error = core_write(vp, UIO_SYSSPACE,
2500 poffset + sizeof (Phdr) * i, &phdr[i],
2501 sizeof (Phdr) * (nphdrs - i), rlimit, credp)) != 0) {
2502 goto done;
2505 break;
2508 if (nshdrs > 0) {
2509 Shdr *shdr = (Shdr *)bigwad;
2511 bzero(shdr, shdrsz);
2512 if (nshdrs > 1) {
2513 ctx.ecc_doffset = doffset;
2514 AS_LOCK_ENTER(as, RW_WRITER);
2515 error = elf_process_scns(&ctx, shdr, nshdrs, NULL);
2516 AS_LOCK_EXIT(as);
2517 if (error != 0) {
2518 goto done;
2521 /* Copy any extended format data destined for the first shdr */
2522 bcopy(&shdr0, shdr, sizeof (shdr0));
2524 error = core_write(vp, UIO_SYSSPACE, soffset, shdr, shdrsz,
2525 rlimit, credp);
2528 done:
2529 if (zeropg != NULL)
2530 kmem_free(zeropg, elf_zeropg_sz);
2531 if (ctx.ecc_bufsz != 0)
2532 kmem_free(ctx.ecc_buf, ctx.ecc_bufsz);
2533 kmem_free(bigwad, bigsize);
2534 return (error);
2537 #ifndef _ELF32_COMPAT
/*
 * execsw entry describing this ELF exec module: the magic string that
 * identifies the binary class, plus the exec and core-dump handlers.
 * NOTE(review): this blob view elides some initializer lines (between
 * visible lines 2544 and 2547, and the closing brace), so only the
 * visible members are annotated — confirm the full initializer against
 * the original file.
 */
2539 static struct execsw esw = {
2540 #ifdef _LP64
2541 elf64magicstr, /* magic string matching 64-bit ELF objects */
2542 #else /* _LP64 */
2543 elf32magicstr, /* magic string matching 32-bit ELF objects */
2544 #endif /* _LP64 */
2547 elfexec, /* exec handler for this object class */
2548 elfcore /* core-dump handler for this object class */
/*
 * Module linkage for the exec framework: ties the execsw above to
 * mod_execops so mod_install() can register the native ELF handlers.
 * NOTE(review): closing brace of the initializer is elided in this view.
 */
2551 static struct modlexec modlexec = {
2552 &mod_execops, "exec module for elf", &esw
2555 #ifdef _LP64
/*
 * On a 64-bit kernel we additionally register handlers for 32-bit ELF
 * objects; elf32exec/elf32core are provided by the _ELF32_COMPAT build
 * of this same source file.
 */
2556 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
2557 intpdata_t *idatap, int level, size_t *execsz,
2558 int setid, caddr_t exec_file, cred_t *cred,
2559 int brand_action);
2560 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
2561 rlim64_t rlimit, int sig, core_content_t content);
/*
 * execsw entry for 32-bit ELF objects.
 * NOTE(review): initializer lines between visible 2564 and 2567, and
 * the closing brace, are elided in this view.
 */
2563 static struct execsw esw32 = {
2564 elf32magicstr,
2567 elf32exec,
2568 elf32core
/* Module linkage for the 32-bit entry, installed alongside modlexec. */
2571 static struct modlexec modlexec32 = {
2572 &mod_execops, "32-bit exec module for elf", &esw32
2574 #endif /* _LP64 */
/*
 * Overall module linkage: one modlexec on 32-bit kernels, two entries
 * (native plus 32-bit compat) on 64-bit kernels, NULL-terminated.
 * NOTE(review): closing brace of the initializer is elided in this view.
 */
2576 static struct modlinkage modlinkage = {
2577 MODREV_1,
2578 (void *)&modlexec,
2579 #ifdef _LP64
2580 (void *)&modlexec32,
2581 #endif /* _LP64 */
2582 NULL
/*
 * Standard loadable-module entry point: install the exec module(s)
 * described by modlinkage.  (Return-type line and braces are elided
 * in this blob view.)
 */
2586 _init(void)
2588 return (mod_install(&modlinkage));
/*
 * Standard loadable-module exit point: remove the exec module(s).
 * (Return-type line and braces are elided in this blob view.)
 */
2592 _fini(void)
2594 return (mod_remove(&modlinkage));
/*
 * Report module information via the common mod_info() helper.
 * (Return-type line and braces are elided in this blob view.)
 */
2598 _info(struct modinfo *modinfop)
2600 return (mod_info(&modlinkage, modinfop));
2603 #endif /* !_ELF32_COMPAT */