unleashed.git: kernel/os/grow.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/
#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/var.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/vmparam.h>
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>
int use_brk_lpg = 1;
int use_stk_lpg = 1;

/*
 * If set, we will not randomize mappings where the 'addr' argument is
 * non-NULL and not an alignment.
 */
int aslr_respect_mmap_hint = 1;

static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);
intptr_t
brk(caddr_t nva)
{
        int error;
        proc_t *p = curproc;

        /*
         * Serialize brk operations on an address space.
         * This also serves as the lock protecting p_brksize
         * and p_brkpageszc.
         */
        as_rangelock(p->p_as);

        /*
         * As a special case to aid the implementation of sbrk(3C), if given a
         * new brk of 0, return the current brk.  We'll hide this in brk(3C).
         */
        if (nva == 0) {
                intptr_t base = (intptr_t)(p->p_brkbase + p->p_brksize);
                as_rangeunlock(p->p_as);
                return (base);
        }

        if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                error = brk_lpg(nva);
        } else {
                error = brk_internal(nva, p->p_brkpageszc);
        }
        as_rangeunlock(p->p_as);
        return ((error != 0 ? set_errno(error) : 0));
}
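
/*
 * Illustrative sketch (hypothetical userland code, not part of this file):
 * the nva == 0 special case above is what lets libc build sbrk(3C) on top
 * of the raw system call, roughly as follows, where _brk() stands in for
 * the syscall stub:
 *
 *	void *
 *	my_sbrk(intptr_t incr)
 *	{
 *		char *oldbrk = (char *)_brk(0);	   // query current break
 *
 *		if (incr != 0 && _brk(oldbrk + incr) != 0)
 *			return ((void *)-1);	   // failed; errno already set
 *		return (oldbrk);
 *	}
 */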
/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call brk_internal().
 * Returns 0 on success.
 */
static int
brk_lpg(caddr_t nva)
{
        struct proc *p = curproc;
        size_t pgsz, len;
        caddr_t addr, brkend;
        caddr_t bssbase = p->p_bssbase;
        caddr_t brkbase = p->p_brkbase;
        int oszc, szc;
        int err;

        oszc = p->p_brkpageszc;

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk_internal() will initialize it.
         */
        if (brkbase == 0) {
                return (brk_internal(nva, oszc));
        }

        len = nva - bssbase;
        pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         */
        if (szc <= oszc) {
                err = brk_internal(nva, oszc);
                /* If failed, back off to base page size. */
                if (err != 0 && oszc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        err = brk_internal(nva, szc);
        /* If using szc failed, map with base page size and return. */
        if (err != 0) {
                if (szc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        /*
         * Round up brk base to a large page boundary and remap
         * anything in the segment already faulted in beyond that
         * point.
         */
        addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
        brkend = brkbase + p->p_brksize;
        len = brkend - addr;
        /* Check that len is not negative. Update page size code for heap. */
        if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_brkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);		/* should always be 0 */
}
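
/*
 * Worked example of the rounding above (values are illustrative only):
 * with a 4 MB large page (pgsz == 0x400000) and p_bssbase == 0x08064000,
 * P2ROUNDUP() yields addr == 0x08400000, the first large-page boundary at
 * or above the BSS base. Only the already-faulted range [addr, brkend)
 * whose length is an exact multiple of pgsz is remapped to the larger
 * page size.
 */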
/*
 * Returns 0 on success.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
        caddr_t ova;			/* current break address */
        size_t size;
        int error;
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t pgsz;
        uint_t szc;
        rctl_qty_t as_rctl;

        /*
         * extend heap to brkszc alignment but use current p->p_brkpageszc
         * for the newly created segment. This allows the new extension
         * segment to be concatenated successfully with the existing brk
         * segment.
         */
        if ((szc = brkszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
        } else {
                pgsz = PAGESIZE;
        }

        mutex_enter(&p->p_lock);
        as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
            p->p_rctls, p);
        mutex_exit(&p->p_lock);

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk() will initialize it.
         */
        if (p->p_brkbase == 0)
                p->p_brkbase = nva;

        /*
         * Before multiple page size support existed, p_brksize was the value
         * not rounded to the pagesize (i.e. it stored the exact user request
         * for heap size). If pgsz is greater than PAGESIZE, calculate the
         * heap size as the real new heap size by rounding it up to pgsz.
         * This is useful since we may want to know where the heap ends
         * without knowing the heap pagesize (e.g. some old code), and also,
         * if the heap pagesize changes, we can update p_brkpageszc but delay
         * adding the new mapping yet still know from p_brksize where the heap
         * really ends. The user-requested heap end is stored in a libc
         * variable.
         */
        if (pgsz > PAGESIZE) {
                caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
                size = tnva - p->p_brkbase;
                if (tnva < p->p_brkbase || (size > p->p_brksize &&
                    size > (size_t)as_rctl)) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        size = nva - p->p_brkbase;
                }
        } else {
                size = nva - p->p_brkbase;
        }

        /*
         * use PAGESIZE to roundup ova because we want to know the real value
         * of the current heap end in case p_brkpageszc has changed since the
         * last time p_brksize was computed.
         */
        nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
        ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
            PAGESIZE);

        if ((nva < p->p_brkbase) || (size > p->p_brksize &&
            size > as_rctl)) {
                mutex_enter(&p->p_lock);
                (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
                    RCA_SAFE);
                mutex_exit(&p->p_lock);
                return (ENOMEM);
        }

        if (nva > ova) {
                struct segvn_crargs crargs =
                    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

                if (!(p->p_datprot & PROT_EXEC)) {
                        crargs.prot &= ~PROT_EXEC;
                }

                /*
                 * Add new zfod mapping to extend UNIX data segment
                 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
                 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
                 * page sizes if ova is not aligned to szc's pgsz.
                 */
                if (szc > 0) {
                        caddr_t rbss;

                        rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
                            pgsz);
                        if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
                                crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
                                    AS_MAP_NO_LPOOB;
                        } else if (ova == rbss) {
                                crargs.szc = szc;
                        } else {
                                crargs.szc = AS_MAP_HEAP;
                        }
                } else {
                        crargs.szc = AS_MAP_NO_LPOOB;
                }
                crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
                error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
                    &crargs);
                if (error) {
                        return (error);
                }

        } else if (nva < ova) {
                /*
                 * Release mapping to shrink UNIX data segment.
                 */
                (void) as_unmap(as, nva, (size_t)(ova - nva));
        }
        p->p_brksize = size;
        return (0);
}
/*
 * Grow the stack to include sp. Return 1 if successful, 0 otherwise.
 * This routine assumes that the stack grows downward.
 */
int
grow(caddr_t sp)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t oldsize = p->p_stksize;
        size_t newsize;
        int err;

        /*
         * Serialize grow operations on an address space.
         * This also serves as the lock protecting p_stksize
         * and p_stkpageszc.
         */
        as_rangelock(as);
        if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                err = grow_lpg(sp);
        } else {
                err = grow_internal(sp, p->p_stkpageszc);
        }
        newsize = p->p_stksize;
        as_rangeunlock(as);

        if (err == 0 && newsize > oldsize) {
                ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
                ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
                /*
                 * Set up translations so the process doesn't have to fault in
                 * the stack pages we just gave it.
                 */
                (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
                    newsize - oldsize, F_INVAL, S_WRITE);
        }
        return ((err == 0 ? 1 : 0));
}
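
/*
 * Usage sketch (hypothetical caller, for illustration only): fault-handling
 * code can attempt to extend the stack to cover a faulting address before
 * resorting to a signal, e.g.
 *
 *	if (!grow(fault_sp))
 *		... the stack could not grow to cover fault_sp ...
 *
 * Note the boolean return convention here (1 on success) differs from the
 * 0-on-success convention of grow_lpg() and grow_internal() below.
 */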
/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
        struct proc *p = curproc;
        size_t pgsz;
        size_t len, newsize;
        caddr_t addr, saddr;
        caddr_t growend;
        int oszc, szc;
        int err;

        newsize = p->p_usrstack - sp;

        oszc = p->p_stkpageszc;
        pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         * This shouldn't happen as the stack never shrinks.
         */
        if (szc <= oszc) {
                err = grow_internal(sp, oszc);
                /* failed, fall back to base page size */
                if (err != 0 && oszc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * We've grown sufficiently to switch to a new page size.
         * So we are going to remap the whole segment with the new page size.
         */
        err = grow_internal(sp, szc);
        /* The grow with szc failed, so fall back to base page size. */
        if (err != 0) {
                if (szc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * Round up stack pointer to a large page boundary and remap
         * any pgsz pages in the segment already faulted in beyond that
         * point.
         */
        saddr = p->p_usrstack - p->p_stksize;
        addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
        growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
        len = growend - addr;
        /* Check that len is not negative. Update page size code for stack. */
        if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_stkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);		/* should always be 0 */
}
/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
        struct proc *p = curproc;
        size_t newsize;
        size_t oldsize;
        uintptr_t new_start;
        int error;
        size_t pgsz;
        uint_t szc;
        struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

        ASSERT(sp < p->p_usrstack);
        sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

        /*
         * grow to growszc alignment but use current p->p_stkpageszc for
         * the segvn_crargs szc passed to segvn_create. For memcntl to
         * increase the szc, this allows the new extension segment to be
         * concatenated successfully with the existing stack segment.
         */
        if ((szc = growszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
                newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
                if (newsize > (size_t)p->p_stk_ctl) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        newsize = p->p_usrstack - sp;
                }
        } else {
                pgsz = PAGESIZE;
                newsize = p->p_usrstack - sp;
        }

        if (newsize > (size_t)p->p_stk_ctl) {
                (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
                    RCA_UNSAFE_ALL);

                return (ENOMEM);
        }

        oldsize = p->p_stksize;
        ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

        if (newsize <= oldsize) {	/* prevent the stack from shrinking */
                return (0);
        }

        if (!(p->p_stkprot & PROT_EXEC)) {
                crargs.prot &= ~PROT_EXEC;
        }
        /*
         * extend stack with the proposed new growszc, which is different
         * from p_stkpageszc only on a memcntl to increase the stack pagesize.
         * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
         * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
         * if not aligned to szc's pgsz.
         */
        if (szc > 0) {
                caddr_t oldsp = p->p_usrstack - oldsize;
                caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
                    pgsz);

                if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
                        crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
                            AS_MAP_NO_LPOOB;
                } else if (oldsp == austk) {
                        crargs.szc = szc;
                } else {
                        crargs.szc = AS_MAP_STACK;
                }
        } else {
                crargs.szc = AS_MAP_NO_LPOOB;
        }
        crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

        /*
         * The stack is about to grow into its guard. This can be acceptable
         * if the size restriction on the stack has been expanded since its
         * initialization during exec(). In such cases, the guard segment will
         * be shrunk, provided the new size is reasonable.
         */
        new_start = (uintptr_t)p->p_usrstack - newsize;
        if (p->p_stkg_start != 0 && new_start > p->p_stkg_start &&
            new_start < p->p_stkg_end) {
                const size_t unmap_sz = p->p_stkg_end - new_start;
                const size_t remain_sz = new_start - p->p_stkg_start;
                extern size_t stack_guard_min_sz;

                /* Do not allow the guard to shrink below minimum size */
                if (remain_sz < stack_guard_min_sz) {
                        return (ENOMEM);
                }

                error = as_unmap(p->p_as, (caddr_t)new_start, unmap_sz);
                if (error != 0) {
                        return (error);
                }
                p->p_stkg_end -= unmap_sz;
        }

        if ((error = as_map(p->p_as, (caddr_t)new_start, newsize - oldsize,
            segvn_create, &crargs)) != 0) {
                if (error == EAGAIN) {
                        cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
                            "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
                }
                return (error);
        }
        p->p_stksize = newsize;
        return (0);
}
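
/*
 * Worked example of the guard arithmetic above (illustrative numbers only):
 * the guard occupies [p_stkg_start, p_stkg_end); when a grow computes
 * new_start inside that range, unmap_sz bytes are clipped off the top of
 * the guard and remain_sz bytes are left below. With
 * p_stkg_start == 0x7f000000, p_stkg_end == 0x7f400000 and
 * new_start == 0x7f300000, unmap_sz == 0x100000 and remain_sz == 0x300000;
 * the grow succeeds as long as remain_sz stays at or above
 * stack_guard_min_sz.
 */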
/*
 * Find address for user to map. If MAP_FIXED is not specified, we can pick
 * any address we want, but we will first try the value in *addrp if it is
 * non-NULL and _MAP_RANDOMIZE is not set. In other words, this implements
 * a way to try for a preferred address.
 */
int
choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
    int vacalign, uint_t flags)
{
        caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
        size_t lenp = len;

        ASSERT(AS_ISCLAIMGAP(as));	/* searches should be serialized */
        if (flags & MAP_FIXED) {
                (void) as_unmap(as, *addrp, len);
                return (0);
        } else if (basep != NULL &&
            ((flags & (MAP_ALIGN | _MAP_RANDOMIZE)) == 0) &&
            !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
                /* User supplied address was available */
                *addrp = basep;
        } else {
                /*
                 * No user supplied address or the address supplied was not
                 * available.
                 */
                map_addr(addrp, len, off, vacalign, flags);
        }
        if (*addrp == NULL)
                return (ENOMEM);
        return (0);
}
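
/*
 * Contract sketch (illustrative): *addrp is an in/out parameter. A caller
 * such as zmap() below seeds it with the user's hint (or an alignment when
 * MAP_ALIGN is set) and, on a 0 return, reads the chosen base back out:
 *
 *	caddr_t addr = hint;
 *	if (choose_addr(as, &addr, len, off, ADDR_NOVACALIGN, flags) == 0)
 *		... addr now holds the address to pass to as_map() ...
 */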
/*
 * Used for MAP_ANON - fast way to get anonymous pages
 */
static int
zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    offset_t pos)
{
        struct segvn_crargs vn_a;
        int error;

        if (((PROT_ALL & uprot) != uprot))
                return (EACCES);

        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address. First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }
        /*
         * No need to worry about vac alignment for anonymous
         * pages since this is a "clone" object that doesn't
         * yet exist.
         */
        error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
        if (error != 0) {
                return (error);
        }

        /*
         * Use the seg_vn segment driver; passing in the NULL amp
         * gives the desired "cloning" effect.
         */
        vn_a.vp = NULL;
        vn_a.offset = 0;
        vn_a.type = flags & MAP_TYPE;
        vn_a.prot = uprot;
        vn_a.maxprot = PROT_ALL;
        vn_a.flags = flags & ~MAP_TYPE;
        vn_a.cred = CRED();
        vn_a.amp = NULL;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;

        return (as_map(as, *addrp, len, segvn_create, &vn_a));
}
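
/*
 * Userland view (hypothetical program, for illustration): an anonymous
 * private mapping reaches zmap() above via mmap(2) with no backing file:
 *
 *	char *buf = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE | MAP_ANON, -1, 0);
 *	if (buf == MAP_FAILED)
 *		err(1, "mmap");
 */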
#define	RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \
	!(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint))
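
/*
 * Spelled out, the macro above deems a mapping randomizable when it is not
 * MAP_FIXED and the address argument is not a plain placement hint that we
 * have been asked to respect: MAP_FIXED mappings are never randomized; a
 * non-zero addr without MAP_ALIGN is honored as a hint while
 * aslr_respect_mmap_hint is set; a MAP_ALIGN request or a NULL addr can
 * always be randomized.
 */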
static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
        struct vnode *vp;
        struct as *as = curproc->p_as;
        uint_t uprot, maxprot, type;
        int error;
        int in_crit = 0;

        if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED |
            _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
            MAP_TEXT | MAP_INITDATA)) != 0) {
                /* | MAP_RENAME */	/* not implemented, let user know */
                return (EINVAL);
        }

        if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
                return (EINVAL);
        }

        if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
                return (EINVAL);
        }

        if ((flags & (MAP_FIXED | _MAP_RANDOMIZE)) ==
            (MAP_FIXED | _MAP_RANDOMIZE)) {
                return (EINVAL);
        }

        /*
         * If it's not a fixed allocation and mmap ASLR is enabled, randomize
         * it.
         */
        if (RANDOMIZABLE_MAPPING(*addrp, flags) &&
            secflag_enabled(curproc, PROC_SEC_ASLR))
                flags |= _MAP_RANDOMIZE;

        type = flags & MAP_TYPE;
        if (type != MAP_PRIVATE && type != MAP_SHARED)
                return (EINVAL);

        if (flags & MAP_ALIGN) {
                if (flags & MAP_FIXED)
                        return (EINVAL);

                /* alignment needs to be a power of 2 >= page size */
                if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
                    !ISP2((uintptr_t)*addrp))
                        return (EINVAL);
        }
        /*
         * Check for bad lengths and file position.
         * We let the fop_map routine check for negative lengths
         * since on some vnode types this might be appropriate.
         */
        if (len == 0 || (pos & (uoff_t)PAGEOFFSET) != 0)
                return (EINVAL);

        maxprot = PROT_ALL;		/* start out allowing all accesses */
        uprot = prot | PROT_USER;

        if (fp == NULL) {
                ASSERT(flags & MAP_ANON);
                /* discard lwpchan mappings, like munmap() */
                if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                        lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
                as_rangelock(as);
                error = zmap(as, addrp, len, uprot, flags, pos);
                as_rangeunlock(as);
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (error == 0 && (flags & MAP_SHARED)) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                return (error);
        } else if ((flags & MAP_ANON) != 0)
                return (EINVAL);

        vp = fp->f_vnode;

        /* Can't execute code from "noexec" mounted filesystem. */
        if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
                maxprot &= ~PROT_EXEC;

        /*
         * These checks were added as part of large file support.
         *
         * Return ENXIO if the initial position is negative; return EOVERFLOW
         * if (offset + len) would overflow the maximum allowed offset for the
         * type of file descriptor being used.
         */
        if (vp->v_type == VREG) {
                if (pos < 0)
                        return (ENXIO);
                if ((offset_t)len > (OFFSET_MAX(fp) - pos))
                        return (EOVERFLOW);
        }

        if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
                /* no write access allowed */
                maxprot &= ~PROT_WRITE;
        }

        /*
         * XXX - Do we also adjust maxprot based on protections
         * of the vnode?  E.g. if no execute permission is given
         * on the vnode for the current user, maxprot probably
         * should disallow PROT_EXEC also?  This is different
         * from the write access as this would be a per vnode
         * test as opposed to a per fd test for writability.
         */

        /*
         * Verify that the specified protections are not greater than
         * the maximum allowable protections. Also test to make sure
         * that the file descriptor allows read access, since "write
         * only" mappings are hard to do: normally we do the read from
         * the file before the page can be written.
         */
        if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
                return (EACCES);

        /*
         * If the user specified an address, do some simple checks here
         */
        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address. First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }

        if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
            nbl_need_check(vp)) {
                int svmand;
                nbl_op_t nop;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto done;
                if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
                        if (prot & (PROT_READ | PROT_EXEC)) {
                                nop = NBL_READWRITE;
                        } else {
                                nop = NBL_WRITE;
                        }
                } else {
                        nop = NBL_READ;
                }
                if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
                        error = EACCES;
                        goto done;
                }
        }

        /* discard lwpchan mappings, like munmap() */
        if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                lwpchan_delete_mapping(curproc, *addrp, *addrp + len);

        /*
         * Ok, now let the vnode map routine do its thing to set things up.
         */
        error = fop_map(vp, pos, as,
            addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

        if (error == 0) {
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (flags & MAP_SHARED) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                if (vp->v_type == VREG &&
                    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
                        /*
                         * Mark this as an executable vnode
                         */
                        mutex_enter(&vp->v_lock);
                        vp->v_flag |= VVMEXEC;
                        mutex_exit(&vp->v_lock);
                }
        }

done:
        if (in_crit)
                nbl_end_crit(vp);
        return (error);
}
#ifdef _LP64
/*
 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 *
 * The "large file" mmap routine mmap64(2) is also mapped to this routine
 * by the 64-bit version of libc.
 *
 * Eventually, this should be the only version, and have smmap_common()
 * folded back into it again. Some day.
 */
caddr_t
smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
{
        struct file *fp;
        int error;

        if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&addr, len, prot, flags,
                    NULL, (offset_t)pos);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&addr, len, prot, flags,
                    fp, (offset_t)pos);
                releasef(fd);
        } else
                error = EBADF;

        return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
}
#endif	/* _LP64 */
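
/*
 * Userland view (hypothetical program, for illustration): a read-only file
 * mapping that arrives via the fd path of smmap64() above:
 *
 *	int fd = open("/etc/passwd", O_RDONLY);
 *	void *m = mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0);
 *	if (m == MAP_FAILED)
 *		err(1, "mmap");
 *	... use m, then munmap(m, length) and close(fd) ...
 */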
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)

/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 */

struct mmaplf32a {
        caddr_t addr;
        size_t len;
#ifdef _LP64
        /*
         * 32-bit contents, 64-bit cells
         */
        uint64_t prot;
        uint64_t flags;
        uint64_t fd;
        uint64_t offhi;
        uint64_t offlo;
#else
        /*
         * 32-bit contents, 32-bit cells
         */
        uint32_t prot;
        uint32_t flags;
        uint32_t fd;
        uint32_t offhi;
        uint32_t offlo;
#endif
};

int
smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
{
        struct file *fp;
        int error;
        caddr_t a = uap->addr;
        int flags = (int)uap->flags;
        int fd = (int)uap->fd;
#ifdef _BIG_ENDIAN
        offset_t off = ((uoff_t)uap->offhi << 32) | (uoff_t)uap->offlo;
#else
        offset_t off = ((uoff_t)uap->offlo << 32) | (uoff_t)uap->offhi;
#endif

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, NULL, off);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, fp, off);
                releasef(fd);
        } else
                error = EBADF;

        if (error == 0)
                rvp->r_val1 = (uintptr_t)a;
        return (error);
}

#endif	/* _SYSCALL32_IMPL || _ILP32 */
int
munmap(caddr_t addr, size_t len)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(EINVAL));

        /*
         * Discard lwpchan mappings.
         */
        if (p->p_lcp != NULL)
                lwpchan_delete_mapping(p, addr, addr + len);
        if (as_unmap(as, addr, len) != 0)
                return (set_errno(EINVAL));

        return (0);
}
int
mprotect(caddr_t addr, size_t len, int prot)
{
        struct as *as = curproc->p_as;
        uint_t uprot = prot | PROT_USER;
        int error;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
        case RANGE_OKAY:
                break;
        case RANGE_BADPROT:
                return (set_errno(ENOTSUP));
        case RANGE_BADADDR:
        default:
                return (set_errno(ENOMEM));
        }

        error = as_setprot(as, addr, len, uprot);
        if (error)
                return (set_errno(error));
        return (0);
}
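
/*
 * Userland view (hypothetical program, for illustration): revoking write
 * access on a page-aligned region lands in mprotect() above:
 *
 *	if (mprotect(buf, 4096, PROT_READ) != 0)
 *		err(1, "mprotect");
 *	buf[0] = 'x';	// would now fault with SIGSEGV
 */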
#define	MC_CACHE	128			/* internal result buffer */
#define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */

int
mincore(caddr_t addr, size_t len, char *vecp)
{
        struct as *as = curproc->p_as;
        caddr_t ea;			/* end address of loop */
        size_t rl;			/* inner result length */
        char vec[MC_CACHE];		/* local vector cache */
        int error;
        model_t model;
        long llen;

        model = get_udatamodel();
        /*
         * Validate form of address parameters.
         */
        if (model == DATAMODEL_NATIVE) {
                llen = (long)len;
        } else {
                llen = (int32_t)(size32_t)len;
        }
        if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(ENOMEM));

        /*
         * Loop over subranges of interval [addr : addr + len), recovering
         * results internally and then copying them out to caller. Subrange
         * is based on the size of MC_CACHE, defined above.
         */
        for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
                error = as_incore(as, addr,
                    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
                if (rl != 0) {
                        rl = (rl + PAGESIZE - 1) / PAGESIZE;
                        if (copyout(vec, vecp, rl) != 0)
                                return (set_errno(EFAULT));
                        vecp += rl;
                }
                if (error != 0)
                        return (set_errno(ENOMEM));
        }
        return (0);
}
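
/*
 * Userland view (hypothetical program, for illustration): mincore(2)
 * produces one result byte per page of the queried range, with the low
 * bit indicating residency:
 *
 *	char vec[16];	// covers 16 pages
 *	if (mincore(addr, 16 * pagesize, vec) == 0 && (vec[0] & 1))
 *		... first page is resident in memory ...
 */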