usr/src/uts/common/os/grow.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/
#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/var.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/vmparam.h>
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>
int use_brk_lpg = 1;
int use_stk_lpg = 1;

static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);
int
brk(caddr_t nva)
{
        int error;
        proc_t *p = curproc;

        /*
         * Serialize brk operations on an address space.
         * This also serves as the lock protecting p_brksize
         * and p_brkpageszc.
         */
        as_rangelock(p->p_as);
        if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                error = brk_lpg(nva);
        } else {
                error = brk_internal(nva, p->p_brkpageszc);
        }
        as_rangeunlock(p->p_as);
        return ((error != 0 ? set_errno(error) : 0));
}
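
/*
 * For context: brk() here is the kernel side of the brk(2) system call.
 * A userland heap allocator typically reaches it via sbrk(3C)/brk(3C),
 * roughly along the lines of:
 *
 *      void *old = sbrk(0);                    // current break
 *      if (brk((caddr_t)old + 8192) == 0) {
 *              // 8K more heap is now addressable
 *      }
 *
 * The exact libc bookkeeping varies between releases; the snippet above is
 * only an illustrative sketch.
 */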
/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call brk_internal().
 * Returns 0 on success.
 */
static int
brk_lpg(caddr_t nva)
{
        struct proc *p = curproc;
        size_t pgsz, len;
        caddr_t addr, brkend;
        caddr_t bssbase = p->p_bssbase;
        caddr_t brkbase = p->p_brkbase;
        int oszc, szc;
        int err;

        oszc = p->p_brkpageszc;

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk_internal() will initialize it.
         */
        if (brkbase == 0) {
                return (brk_internal(nva, oszc));
        }

        len = nva - bssbase;

        pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         */
        if (szc <= oszc) {
                err = brk_internal(nva, oszc);
                /* If failed, back off to base page size. */
                if (err != 0 && oszc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        err = brk_internal(nva, szc);
        /* If using szc failed, map with base page size and return. */
        if (err != 0) {
                if (szc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        /*
         * Round up brk base to a large page boundary and remap
         * anything in the segment already faulted in beyond that
         * point.
         */
        addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
        brkend = brkbase + p->p_brksize;
        len = brkend - addr;
        /* Check that len is not negative. Update page size code for heap. */
        if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_brkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);           /* should always be 0 */
}
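
/*
 * Note on the alignment macros used above: P2ROUNDUP()/P2ALIGN() round an
 * address up or down to a power-of-two boundary.  With a 4MB large page
 * (pgsz == 0x400000), for example, P2ROUNDUP(0x1234567, 0x400000) yields
 * 0x1400000 and P2ALIGN(0x1234567, 0x400000) yields 0x1000000.
 */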
/*
 * Returns 0 on success.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
        caddr_t ova;                    /* current break address */
        size_t size;
        int error;
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t pgsz;
        uint_t szc;
        rctl_qty_t as_rctl;

        /*
         * extend heap to brkszc alignment but use current p->p_brkpageszc
         * for the newly created segment. This allows the new extension
         * segment to be concatenated successfully with the existing brk
         * segment.
         */
        if ((szc = brkszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
        } else {
                pgsz = PAGESIZE;
        }

        mutex_enter(&p->p_lock);
        as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
            p->p_rctls, p);
        mutex_exit(&p->p_lock);

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk() will initialize it.
         */
        if (p->p_brkbase == 0)
                p->p_brkbase = nva;

        /*
         * Before multiple page size support existed p_brksize was the value
         * not rounded to the pagesize (i.e. it stored the exact user request
         * for heap size). If pgsz is greater than PAGESIZE calculate the
         * heap size as the real new heap size by rounding it up to pgsz.
         * This is useful since we may want to know where the heap ends
         * without knowing heap pagesize (e.g. some old code) and also if
         * heap pagesize changes we can update p_brkpageszc but delay adding
         * new mapping yet still know from p_brksize where the heap really
         * ends. The user requested heap end is stored in libc variable.
         */
        if (pgsz > PAGESIZE) {
                caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
                size = tnva - p->p_brkbase;
                if (tnva < p->p_brkbase || (size > p->p_brksize &&
                    size > (size_t)as_rctl)) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        size = nva - p->p_brkbase;
                }
        } else {
                size = nva - p->p_brkbase;
        }

        /*
         * use PAGESIZE to roundup ova because we want to know the real value
         * of the current heap end in case p_brkpageszc changes since the last
         * p_brksize was computed.
         */
        nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
        ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
            PAGESIZE);

        if ((nva < p->p_brkbase) || (size > p->p_brksize &&
            size > as_rctl)) {
                mutex_enter(&p->p_lock);
                (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
                    RCA_SAFE);
                mutex_exit(&p->p_lock);
                return (ENOMEM);
        }

        if (nva > ova) {
                struct segvn_crargs crargs =
                    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

                if (!(p->p_datprot & PROT_EXEC)) {
                        crargs.prot &= ~PROT_EXEC;
                }

                /*
                 * Add new zfod mapping to extend UNIX data segment
                 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
                 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
                 * page sizes if ova is not aligned to szc's pgsz.
                 */
                if (szc > 0) {
                        caddr_t rbss;

                        rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
                            pgsz);
                        if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
                                crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
                                    AS_MAP_NO_LPOOB;
                        } else if (ova == rbss) {
                                crargs.szc = szc;
                        } else {
                                crargs.szc = AS_MAP_HEAP;
                        }
                } else {
                        crargs.szc = AS_MAP_NO_LPOOB;
                }
                crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
                error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
                    &crargs);
                if (error) {
                        return (error);
                }

        } else if (nva < ova) {
                /*
                 * Release mapping to shrink UNIX data segment.
                 */
                (void) as_unmap(as, nva, (size_t)(ova - nva));
        }
        p->p_brksize = size;
        return (0);
}
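
/*
 * The size check in brk_internal() compares the requested heap size against
 * the enforced value of the legacy RLIMIT_DATA resource control; a request
 * that both grows the heap and exceeds that value fails with ENOMEM after
 * the configured rctl action is taken.  For example, with a data limit of
 * 16MB a brk() that would push the heap to 20MB is rejected here before any
 * mapping is changed.
 */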
/*
 * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
 * This routine assumes that the stack grows downward.
 */
int
grow(caddr_t sp)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t oldsize = p->p_stksize;
        size_t newsize;
        int err;

        /*
         * Serialize grow operations on an address space.
         * This also serves as the lock protecting p_stksize
         * and p_stkpageszc.
         */
        as_rangelock(as);
        if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                err = grow_lpg(sp);
        } else {
                err = grow_internal(sp, p->p_stkpageszc);
        }
        as_rangeunlock(as);

        if (err == 0 && (newsize = p->p_stksize) > oldsize) {
                ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
                ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
                /*
                 * Set up translations so the process doesn't have to fault in
                 * the stack pages we just gave it.
                 */
                (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
                    newsize - oldsize, F_INVAL, S_WRITE);
        }
        return ((err == 0 ? 1 : 0));
}
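
/*
 * grow() is normally reached from the pagefault path: when a fault address
 * falls below the current stack segment but within the stack limits, the
 * trap code calls grow() to extend the stack before retrying the access.
 * (How such faults are classified is architecture-specific; this note is
 * only a summary, not part of the original source commentary.)
 */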
/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
        struct proc *p = curproc;
        size_t pgsz;
        size_t len, newsize;
        caddr_t addr, saddr;
        caddr_t growend;
        int oszc, szc;
        int err;

        newsize = p->p_usrstack - sp;

        oszc = p->p_stkpageszc;
        pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         * This shouldn't happen as the stack never shrinks.
         */
        if (szc <= oszc) {
                err = grow_internal(sp, oszc);
                /* failed, fall back to base page size */
                if (err != 0 && oszc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * We've grown sufficiently to switch to a new page size.
         * So we are going to remap the whole segment with the new page size.
         */
        err = grow_internal(sp, szc);
        /* The grow with szc failed, so fall back to base page size. */
        if (err != 0) {
                if (szc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * Round up stack pointer to a large page boundary and remap
         * any pgsz pages in the segment already faulted in beyond that
         * point.
         */
        saddr = p->p_usrstack - p->p_stksize;
        addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
        growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
        len = growend - addr;
        /* Check that len is not negative. Update page size code for stack. */
        if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_stkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);           /* should always be 0 */
}
/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
        struct proc *p = curproc;
        size_t newsize;
        size_t oldsize;
        int error;
        size_t pgsz;
        uint_t szc;
        struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

        ASSERT(sp < p->p_usrstack);
        sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

        /*
         * grow to growszc alignment but use current p->p_stkpageszc for
         * the segvn_crargs szc passed to segvn_create. For memcntl to
         * increase the szc, this allows the new extension segment to be
         * concatenated successfully with the existing stack segment.
         */
        if ((szc = growszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
                newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
                if (newsize > (size_t)p->p_stk_ctl) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        newsize = p->p_usrstack - sp;
                }
        } else {
                pgsz = PAGESIZE;
                newsize = p->p_usrstack - sp;
        }

        if (newsize > (size_t)p->p_stk_ctl) {
                (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
                    RCA_UNSAFE_ALL);

                return (ENOMEM);
        }

        oldsize = p->p_stksize;
        ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

        if (newsize <= oldsize) {       /* prevent the stack from shrinking */
                return (0);
        }

        if (!(p->p_stkprot & PROT_EXEC)) {
                crargs.prot &= ~PROT_EXEC;
        }
        /*
         * extend stack with the proposed new growszc, which is different
         * than p_stkpageszc only on a memcntl to increase the stack pagesize.
         * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
         * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
         * if not aligned to szc's pgsz.
         */
        if (szc > 0) {
                caddr_t oldsp = p->p_usrstack - oldsize;
                caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
                    pgsz);

                if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
                        crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
                            AS_MAP_NO_LPOOB;
                } else if (oldsp == austk) {
                        crargs.szc = szc;
                } else {
                        crargs.szc = AS_MAP_STACK;
                }
        } else {
                crargs.szc = AS_MAP_NO_LPOOB;
        }
        crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

        if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
            segvn_create, &crargs)) != 0) {
                if (error == EAGAIN) {
                        cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
                            "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
                }
                return (error);
        }
        p->p_stksize = newsize;
        return (0);
}
/*
 * Find address for user to map.
 * If MAP_FIXED is not specified, we can pick any address we want, but we will
 * first try the value in *addrp if it is non-NULL.  Thus this is implementing
 * a way to try and get a preferred address.
 */
int
choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
    int vacalign, uint_t flags)
{
        caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
        size_t lenp = len;

        ASSERT(AS_ISCLAIMGAP(as));      /* searches should be serialized */
        if (flags & MAP_FIXED) {
                (void) as_unmap(as, *addrp, len);
                return (0);
        } else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
            !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
                /* User supplied address was available */
                *addrp = basep;
        } else {
                /*
                 * No user supplied address or the address supplied was not
                 * available.
                 */
                map_addr(addrp, len, off, vacalign, flags);
        }
        if (*addrp == NULL)
                return (ENOMEM);
        return (0);
}
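
/*
 * Callers of choose_addr() are expected to hold the address-space range
 * lock across both the gap search and the subsequent mapping (hence the
 * AS_ISCLAIMGAP assertion), following the pattern used by zmap() and
 * smmap_common() below.  Roughly:
 *
 *      as_rangelock(as);
 *      error = choose_addr(as, &addr, len, off, ADDR_NOVACALIGN, flags);
 *      if (error == 0)
 *              error = as_map(as, addr, len, segvn_create, &crargs);
 *      as_rangeunlock(as);
 */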
/*
 * Used for MAP_ANON - fast way to get anonymous pages
 */
static int
zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    offset_t pos)
{
        struct segvn_crargs vn_a;
        int error;

        if (((PROT_ALL & uprot) != uprot))
                return (EACCES);

        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address.  First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }
        /*
         * No need to worry about vac alignment for anonymous
         * pages since this is a "clone" object that doesn't
         * yet exist.
         */
        error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
        if (error != 0) {
                return (error);
        }

        /*
         * Use the seg_vn segment driver; passing in the NULL amp
         * gives the desired "cloning" effect.
         */
        vn_a.vp = NULL;
        vn_a.offset = 0;
        vn_a.type = flags & MAP_TYPE;
        vn_a.prot = uprot;
        vn_a.maxprot = PROT_ALL;
        vn_a.flags = flags & ~MAP_TYPE;
        vn_a.cred = CRED();
        vn_a.amp = NULL;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;

        return (as_map(as, *addrp, len, segvn_create, &vn_a));
}
static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
        struct vnode *vp;
        struct as *as = curproc->p_as;
        uint_t uprot, maxprot, type;
        int error;
        int in_crit = 0;

        if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
            _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
            MAP_TEXT | MAP_INITDATA)) != 0) {
                /* | MAP_RENAME */      /* not implemented, let user know */
                return (EINVAL);
        }

        if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
                return (EINVAL);
        }

        if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
                return (EINVAL);
        }
#if defined(__sparc)
        /*
         * See if this is an "old mmap call".  If so, remember this
         * fact and convert the flags value given to mmap to indicate
         * the specified address in the system call must be used.
         * _MAP_NEW is set by all new uses of mmap.
         */
        if ((flags & _MAP_NEW) == 0)
                flags |= MAP_FIXED;
#endif
        flags &= ~_MAP_NEW;
        type = flags & MAP_TYPE;
        if (type != MAP_PRIVATE && type != MAP_SHARED)
                return (EINVAL);

        if (flags & MAP_ALIGN) {

                if (flags & MAP_FIXED)
                        return (EINVAL);

                /* alignment needs to be a power of 2 >= page size */
                if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
                    !ISP2((uintptr_t)*addrp))
                        return (EINVAL);
        }
        /*
         * Check for bad lengths and file position.
         * We let the VOP_MAP routine check for negative lengths
         * since on some vnode types this might be appropriate.
         */
        if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
                return (EINVAL);

        maxprot = PROT_ALL;             /* start out allowing all accesses */
        uprot = prot | PROT_USER;

        if (fp == NULL) {
                ASSERT(flags & MAP_ANON);
                /* discard lwpchan mappings, like munmap() */
                if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                        lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
                as_rangelock(as);
                error = zmap(as, addrp, len, uprot, flags, pos);
                as_rangeunlock(as);
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (error == 0 && (flags & MAP_SHARED)) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                return (error);
        } else if ((flags & MAP_ANON) != 0)
                return (EINVAL);

        vp = fp->f_vnode;

        /* Can't execute code from "noexec" mounted filesystem. */
        if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
                maxprot &= ~PROT_EXEC;

        /*
         * These checks were added as part of large files.
         *
         * Return ENXIO if the initial position is negative; return EOVERFLOW
         * if (offset + len) would overflow the maximum allowed offset for the
         * type of file descriptor being used.
         */
        if (vp->v_type == VREG) {
                if (pos < 0)
                        return (ENXIO);
                if ((offset_t)len > (OFFSET_MAX(fp) - pos))
                        return (EOVERFLOW);
        }

        if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
                /* no write access allowed */
                maxprot &= ~PROT_WRITE;
        }

        /*
         * XXX - Do we also adjust maxprot based on protections
         * of the vnode?  E.g. if no execute permission is given
         * on the vnode for the current user, maxprot probably
         * should disallow PROT_EXEC also?  This is different
         * from the write access as this would be a per vnode
         * test as opposed to a per fd test for writability.
         */
        /*
         * Verify that the specified protections are not greater than
         * the maximum allowable protections.  Also test to make sure
         * that the file descriptor allows for read access, since
         * "write only" mappings are hard to do: normally we do
         * the read from the file before the page can be written.
         */
        if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
                return (EACCES);

        /*
         * If the user specified an address, do some simple checks here
         */
        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address.  First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }

        if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
            nbl_need_check(vp)) {
                int svmand;
                nbl_op_t nop;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto done;
                if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
                        if (prot & (PROT_READ | PROT_EXEC)) {
                                nop = NBL_READWRITE;
                        } else {
                                nop = NBL_WRITE;
                        }
                } else {
                        nop = NBL_READ;
                }
                if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
                        error = EACCES;
                        goto done;
                }
        }

        /* discard lwpchan mappings, like munmap() */
        if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                lwpchan_delete_mapping(curproc, *addrp, *addrp + len);

        /*
         * Ok, now let the vnode map routine do its thing to set things up.
         */
        error = VOP_MAP(vp, pos, as,
            addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

        if (error == 0) {
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (flags & MAP_SHARED) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                if (vp->v_type == VREG &&
                    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
                        /*
                         * Mark this as an executable vnode
                         */
                        mutex_enter(&vp->v_lock);
                        vp->v_flag |= VVMEXEC;
                        mutex_exit(&vp->v_lock);
                }
        }

done:
        if (in_crit)
                nbl_end_crit(vp);
        return (error);
}
#ifdef _LP64
/*
 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 *
 * The "large file" mmap routine mmap64(2) is also mapped to this routine
 * by the 64-bit version of libc.
 *
 * Eventually, this should be the only version, and have smmap_common()
 * folded back into it again.  Some day.
 */
caddr_t
smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
{
        struct file *fp;
        int error;

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&addr, len, prot, flags,
                    NULL, (offset_t)pos);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&addr, len, prot, flags,
                    fp, (offset_t)pos);
                releasef(fd);
        } else
                error = EBADF;

        return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
}
#endif  /* _LP64 */
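
/*
 * Illustrative sketch: on an LP64 kernel a userland call such as
 *
 *      void *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *          MAP_PRIVATE | MAP_ANON, -1, 0);
 *
 * arrives here as smmap64(NULL, 8192, ...) with fd == -1, so the MAP_ANON
 * branch above passes a NULL file pointer to smmap_common().  The exact
 * libc syscall stub is an assumption made for illustration only.
 */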
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)

/*
 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 */
caddr_t
smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
{
        struct file *fp;
        int error;
        caddr_t a = (caddr_t)(uintptr_t)addr;

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&a, (size_t)len, prot,
                    flags | _MAP_LOW32, NULL, (offset_t)pos);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&a, (size_t)len, prot,
                    flags | _MAP_LOW32, fp, (offset_t)pos);
                releasef(fd);
        } else
                error = EBADF;

        ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);

        return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
}
/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 */

struct mmaplf32a {
        caddr_t addr;
        size_t len;
#ifdef _LP64
        /*
         * 32-bit contents, 64-bit cells
         */
        uint64_t prot;
        uint64_t flags;
        uint64_t fd;
        uint64_t offhi;
        uint64_t offlo;
#else
        /*
         * 32-bit contents, 32-bit cells
         */
        uint32_t prot;
        uint32_t flags;
        uint32_t fd;
        uint32_t offhi;
        uint32_t offlo;
#endif
};

int
smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
{
        struct file *fp;
        int error;
        caddr_t a = uap->addr;
        int flags = (int)uap->flags;
        int fd = (int)uap->fd;
#ifdef _BIG_ENDIAN
        offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
#else
        offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
#endif

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, NULL, off);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, fp, off);
                releasef(fd);
        } else
                error = EBADF;

        if (error == 0)
                rvp->r_val1 = (uintptr_t)a;
        return (error);
}

#endif  /* _SYSCALL32_IMPL || _ILP32 */
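
/*
 * In smmaplf32() above, the 64-bit file offset from a 32-bit caller arrives
 * as two 32-bit cells and is recombined with a shift and an OR.  For
 * example, an offset of 0x280001000 arrives (on a big-endian kernel) as
 * offhi == 0x2 and offlo == 0x80001000, and
 * ((u_offset_t)0x2 << 32) | 0x80001000 reproduces the original value.
 * Which cell carries the high word depends on how the 32-bit argument
 * slots are laid out, hence the endian #ifdef.
 */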
int
munmap(caddr_t addr, size_t len)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(EINVAL));

        /*
         * Discard lwpchan mappings.
         */
        if (p->p_lcp != NULL)
                lwpchan_delete_mapping(p, addr, addr + len);
        if (as_unmap(as, addr, len) != 0)
                return (set_errno(EINVAL));

        return (0);
}
int
mprotect(caddr_t addr, size_t len, int prot)
{
        struct as *as = curproc->p_as;
        uint_t uprot = prot | PROT_USER;
        int error;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
        case RANGE_OKAY:
                break;
        case RANGE_BADPROT:
                return (set_errno(ENOTSUP));
        case RANGE_BADADDR:
        default:
                return (set_errno(ENOMEM));
        }

        error = as_setprot(as, addr, len, uprot);
        if (error)
                return (set_errno(error));
        return (0);
}
#define	MC_CACHE	128			/* internal result buffer */
#define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */

int
mincore(caddr_t addr, size_t len, char *vecp)
{
        struct as *as = curproc->p_as;
        caddr_t ea;                     /* end address of loop */
        size_t rl;                      /* inner result length */
        char vec[MC_CACHE];             /* local vector cache */
        int error;
        model_t model;
        long llen;

        model = get_udatamodel();
        /*
         * Validate form of address parameters.
         */
        if (model == DATAMODEL_NATIVE) {
                llen = (long)len;
        } else {
                llen = (int32_t)(size32_t)len;
        }
        if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(ENOMEM));

        /*
         * Loop over subranges of interval [addr : addr + len), recovering
         * results internally and then copying them out to caller.  Subrange
         * is based on the size of MC_CACHE, defined above.
         */
        for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
                error = as_incore(as, addr,
                    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
                if (rl != 0) {
                        rl = (rl + PAGESIZE - 1) / PAGESIZE;
                        if (copyout(vec, vecp, rl) != 0)
                                return (set_errno(EFAULT));
                        vecp += rl;
                }
                if (error != 0)
                        return (set_errno(ENOMEM));
        }
        return (0);
}
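
/*
 * The mincore() result vector holds one byte per page.  With 4KB pages, a
 * 1MB range therefore produces 256 result bytes, copied out in MC_CACHE
 * (128-byte) chunks, i.e. two iterations of the loop above.  In userland
 * the call looks roughly like:
 *
 *      char vec[256];
 *      if (mincore(addr, 1024 * 1024, vec) == 0) {
 *              // vec[i] nonzero => page i is resident
 *      }
 *
 * Page size and the precise meaning of the vector bits are system
 * dependent; the snippet is a sketch, not the documented interface.
 */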