kernel/os/grow.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/var.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/vmparam.h>
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>
#include <sys/nbmlock.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>

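/*
 * Tunables: when non-zero (and the process has SAUTOLPG set), brk() and
 * stack growth try to use large pages via brk_lpg()/grow_lpg().
 */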
int use_brk_lpg = 1;
int use_stk_lpg = 1;

/*
 * If set, we will not randomize mappings where the 'addr' argument is
 * non-NULL and not an alignment.
 */
int aslr_respect_mmap_hint = 1;

static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);

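/*
 * brk(2) entry point.  When called with nva == 0 it returns the current
 * break (used by sbrk(3C)); otherwise it returns 0 on success or sets
 * errno on failure.
 */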
intptr_t
brk(caddr_t nva)
{
        int error;
        proc_t *p = curproc;

        /*
         * Serialize brk operations on an address space.
         * This also serves as the lock protecting p_brksize
         * and p_brkpageszc.
         */
        as_rangelock(p->p_as);

        /*
         * As a special case to aid the implementation of sbrk(3C), if given a
         * new brk of 0, return the current brk.  We'll hide this in brk(3C).
         */
        if (nva == 0) {
                intptr_t base = (intptr_t)(p->p_brkbase + p->p_brksize);
                as_rangeunlock(p->p_as);
                return (base);
        }

        if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                error = brk_lpg(nva);
        } else {
                error = brk_internal(nva, p->p_brkpageszc);
        }
        as_rangeunlock(p->p_as);
        return ((error != 0 ? set_errno(error) : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call brk_internal().
 * Returns 0 on success.
 */
static int
brk_lpg(caddr_t nva)
{
        struct proc *p = curproc;
        size_t pgsz, len;
        caddr_t addr, brkend;
        caddr_t bssbase = p->p_bssbase;
        caddr_t brkbase = p->p_brkbase;
        int oszc, szc;
        int err;

        oszc = p->p_brkpageszc;

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk_internal() will initialize it.
         */
        if (brkbase == 0) {
                return (brk_internal(nva, oszc));
        }

        len = nva - bssbase;

        pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         */
        if (szc <= oszc) {
                err = brk_internal(nva, oszc);
                /* If failed, back off to base page size. */
                if (err != 0 && oszc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        err = brk_internal(nva, szc);
        /* If using szc failed, map with base page size and return. */
        if (err != 0) {
                if (szc != 0) {
                        err = brk_internal(nva, 0);
                }
                return (err);
        }

        /*
         * Round up brk base to a large page boundary and remap
         * anything in the segment already faulted in beyond that
         * point.
         */
        addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
        brkend = brkbase + p->p_brksize;
        len = brkend - addr;
        /* Check that len is not negative. Update page size code for heap. */
        if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_brkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);   /* should always be 0 */
}

/*
 * Returns 0 on success.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
        caddr_t ova;                    /* current break address */
        size_t size;
        int error;
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t pgsz;
        uint_t szc;
        rctl_qty_t as_rctl;

        /*
         * extend heap to brkszc alignment but use current p->p_brkpageszc
         * for the newly created segment. This allows the new extension
         * segment to be concatenated successfully with the existing brk
         * segment.
         */
        if ((szc = brkszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
        } else {
                pgsz = PAGESIZE;
        }

        mutex_enter(&p->p_lock);
        as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
            p->p_rctls, p);
        mutex_exit(&p->p_lock);

        /*
         * If p_brkbase has not yet been set, the first call
         * to brk() will initialize it.
         */
        if (p->p_brkbase == 0)
                p->p_brkbase = nva;

        /*
         * Before multiple page size support existed p_brksize was the value
         * not rounded to the pagesize (i.e. it stored the exact user request
         * for heap size). If pgsz is greater than PAGESIZE calculate the
         * heap size as the real new heap size by rounding it up to pgsz.
         * This is useful since we may want to know where the heap ends
         * without knowing heap pagesize (e.g. some old code) and also if
         * heap pagesize changes we can update p_brkpageszc but delay adding
         * new mapping yet still know from p_brksize where the heap really
         * ends. The user requested heap end is stored in libc variable.
         */
        if (pgsz > PAGESIZE) {
                caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
                size = tnva - p->p_brkbase;
                if (tnva < p->p_brkbase || (size > p->p_brksize &&
                    size > (size_t)as_rctl)) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        size = nva - p->p_brkbase;
                }
        } else {
                size = nva - p->p_brkbase;
        }

        /*
         * use PAGESIZE to roundup ova because we want to know the real value
         * of the current heap end in case p_brkpageszc changes since the last
         * p_brksize was computed.
         */
        nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
        ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
            PAGESIZE);

        if ((nva < p->p_brkbase) || (size > p->p_brksize &&
            size > as_rctl)) {
                mutex_enter(&p->p_lock);
                (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
                    RCA_SAFE);
                mutex_exit(&p->p_lock);
                return (ENOMEM);
        }

        if (nva > ova) {
                struct segvn_crargs crargs =
                    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

                if (!(p->p_datprot & PROT_EXEC)) {
                        crargs.prot &= ~PROT_EXEC;
                }

                /*
                 * Add new zfod mapping to extend UNIX data segment
                 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
                 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
                 * page sizes if ova is not aligned to szc's pgsz.
                 */
                if (szc > 0) {
                        caddr_t rbss;

                        rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
                            pgsz);
                        if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
                                crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
                                    AS_MAP_NO_LPOOB;
                        } else if (ova == rbss) {
                                crargs.szc = szc;
                        } else {
                                crargs.szc = AS_MAP_HEAP;
                        }
                } else {
                        crargs.szc = AS_MAP_NO_LPOOB;
                }
                crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
                error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
                    &crargs);
                if (error) {
                        return (error);
                }
        } else if (nva < ova) {
                /*
                 * Release mapping to shrink UNIX data segment.
                 */
                (void) as_unmap(as, nva, (size_t)(ova - nva));
        }
        p->p_brksize = size;
        return (0);
}

/*
 * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
 * This routine assumes that the stack grows downward.
 */
int
grow(caddr_t sp)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;
        size_t oldsize = p->p_stksize;
        size_t newsize;
        int err;

        /*
         * Serialize grow operations on an address space.
         * This also serves as the lock protecting p_stksize
         * and p_stkpageszc.
         */
        as_rangelock(as);
        if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
                err = grow_lpg(sp);
        } else {
                err = grow_internal(sp, p->p_stkpageszc);
        }
        as_rangeunlock(as);

        if (err == 0 && (newsize = p->p_stksize) > oldsize) {
                ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
                ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
                /*
                 * Set up translations so the process doesn't have to fault in
                 * the stack pages we just gave it.
                 */
                (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
                    newsize - oldsize, F_INVAL, S_WRITE);
        }
        return ((err == 0 ? 1 : 0));
}

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
        struct proc *p = curproc;
        size_t pgsz;
        size_t len, newsize;
        caddr_t addr, saddr;
        caddr_t growend;
        int oszc, szc;
        int err;

        newsize = p->p_usrstack - sp;

        oszc = p->p_stkpageszc;
        pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
        szc = page_szc(pgsz);

        /*
         * Covers two cases:
         * 1. page_szc() returns -1 for invalid page size, so we want to
         * ignore it in that case.
         * 2. By design we never decrease page size, as it is more stable.
         * This shouldn't happen as the stack never shrinks.
         */
        if (szc <= oszc) {
                err = grow_internal(sp, oszc);
                /* failed, fall back to base page size */
                if (err != 0 && oszc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * We've grown sufficiently to switch to a new page size.
         * So we are going to remap the whole segment with the new page size.
         */
        err = grow_internal(sp, szc);
        /* The grow with szc failed, so fall back to base page size. */
        if (err != 0) {
                if (szc != 0) {
                        err = grow_internal(sp, 0);
                }
                return (err);
        }

        /*
         * Round up stack pointer to a large page boundary and remap
         * any pgsz pages in the segment already faulted in beyond that
         * point.
         */
        saddr = p->p_usrstack - p->p_stksize;
        addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
        growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
        len = growend - addr;
        /* Check that len is not negative. Update page size code for stack. */
        if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
                (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
                p->p_stkpageszc = szc;
        }

        ASSERT(err == 0);
        return (err);   /* should always be 0 */
}

/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
        struct proc *p = curproc;
        size_t newsize;
        size_t oldsize;
        int error;
        size_t pgsz;
        uint_t szc;
        struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

        ASSERT(sp < p->p_usrstack);
        sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

        /*
         * grow to growszc alignment but use current p->p_stkpageszc for
         * the segvn_crargs szc passed to segvn_create. For memcntl to
         * increase the szc, this allows the new extension segment to be
         * concatenated successfully with the existing stack segment.
         */
        if ((szc = growszc) != 0) {
                pgsz = page_get_pagesize(szc);
                ASSERT(pgsz > PAGESIZE);
                newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
                if (newsize > (size_t)p->p_stk_ctl) {
                        szc = 0;
                        pgsz = PAGESIZE;
                        newsize = p->p_usrstack - sp;
                }
        } else {
                pgsz = PAGESIZE;
                newsize = p->p_usrstack - sp;
        }

        if (newsize > (size_t)p->p_stk_ctl) {
                (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
                    RCA_UNSAFE_ALL);

                return (ENOMEM);
        }

        oldsize = p->p_stksize;
        ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

        if (newsize <= oldsize) {       /* prevent the stack from shrinking */
                return (0);
        }

        if (!(p->p_stkprot & PROT_EXEC)) {
                crargs.prot &= ~PROT_EXEC;
        }
        /*
         * extend stack with the proposed new growszc, which is different
         * than p_stkpageszc only on a memcntl to increase the stack pagesize.
         * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
         * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
         * if not aligned to szc's pgsz.
         */
        if (szc > 0) {
                caddr_t oldsp = p->p_usrstack - oldsize;
                caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
                    pgsz);

                if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
                        crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
                            AS_MAP_NO_LPOOB;
                } else if (oldsp == austk) {
                        crargs.szc = szc;
                } else {
                        crargs.szc = AS_MAP_STACK;
                }
        } else {
                crargs.szc = AS_MAP_NO_LPOOB;
        }
        crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

        if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
            segvn_create, &crargs)) != 0) {
                if (error == EAGAIN) {
                        cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
                            "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
                }
                return (error);
        }
        p->p_stksize = newsize;
        return (0);
}

/*
 * Find address for user to map.  If MAP_FIXED is not specified, we can pick
 * any address we want, but we will first try the value in *addrp if it is
 * non-NULL and _MAP_RANDOMIZE is not set.  Thus this is implementing a way to
 * try and get a preferred address.
 */
int
choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
    int vacalign, uint_t flags)
{
        caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
        size_t lenp = len;

        ASSERT(AS_ISCLAIMGAP(as));      /* searches should be serialized */
        if (flags & MAP_FIXED) {
                (void) as_unmap(as, *addrp, len);
                return (0);
        } else if (basep != NULL &&
            ((flags & (MAP_ALIGN | _MAP_RANDOMIZE)) == 0) &&
            !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
                /* User supplied address was available */
                *addrp = basep;
        } else {
                /*
                 * No user supplied address or the address supplied was not
                 * available.
                 */
                map_addr(addrp, len, off, vacalign, flags);
        }
        if (*addrp == NULL)
                return (ENOMEM);
        return (0);
}

/*
 * Used for MAP_ANON - fast way to get anonymous pages
 */
static int
zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    offset_t pos)
{
        struct segvn_crargs vn_a;
        int error;

        if (((PROT_ALL & uprot) != uprot))
                return (EACCES);

        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address.  First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }
        /*
         * No need to worry about vac alignment for anonymous
         * pages since this is a "clone" object that doesn't
         * yet exist.
         */
        error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
        if (error != 0) {
                return (error);
        }

        /*
         * Use the seg_vn segment driver; passing in the NULL amp
         * gives the desired "cloning" effect.
         */
        vn_a.vp = NULL;
        vn_a.offset = 0;
        vn_a.type = flags & MAP_TYPE;
        vn_a.prot = uprot;
        vn_a.maxprot = PROT_ALL;
        vn_a.flags = flags & ~MAP_TYPE;
        vn_a.cred = CRED();
        vn_a.amp = NULL;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;

        return (as_map(as, *addrp, len, segvn_create, &vn_a));
}

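/*
 * A mapping request may be randomized under ASLR unless it is MAP_FIXED, or
 * unless it passes a plain (non-MAP_ALIGN) address hint while
 * aslr_respect_mmap_hint is set.
 */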
#define RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \
        !(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint))

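/*
 * Common worker for the mmap(2) family (smmap64, smmap32, smmaplf32):
 * validates flags and protections, applies ASLR randomization when enabled,
 * then hands off to zmap() for anonymous mappings or fop_map() for
 * vnode-backed mappings.
 */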
static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
        struct vnode *vp;
        struct as *as = curproc->p_as;
        uint_t uprot, maxprot, type;
        int error;
        int in_crit = 0;

        if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED |
            _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
            MAP_TEXT | MAP_INITDATA)) != 0) {
                /* | MAP_RENAME */      /* not implemented, let user know */
                return (EINVAL);
        }

        if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
                return (EINVAL);
        }

        if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
                return (EINVAL);
        }

        if ((flags & (MAP_FIXED | _MAP_RANDOMIZE)) ==
            (MAP_FIXED | _MAP_RANDOMIZE)) {
                return (EINVAL);
        }

        /*
         * If it's not a fixed allocation and mmap ASLR is enabled, randomize
         * it.
         */
        if (RANDOMIZABLE_MAPPING(*addrp, flags) &&
            secflag_enabled(curproc, PROC_SEC_ASLR))
                flags |= _MAP_RANDOMIZE;

        type = flags & MAP_TYPE;
        if (type != MAP_PRIVATE && type != MAP_SHARED)
                return (EINVAL);

        if (flags & MAP_ALIGN) {
                if (flags & MAP_FIXED)
                        return (EINVAL);

                /* alignment needs to be a power of 2 >= page size */
                if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
                    !ISP2((uintptr_t)*addrp))
                        return (EINVAL);
        }
        /*
         * Check for bad lengths and file position.
         * We let the fop_map routine check for negative lengths
         * since on some vnode types this might be appropriate.
         */
        if (len == 0 || (pos & (uoff_t)PAGEOFFSET) != 0)
                return (EINVAL);

        maxprot = PROT_ALL;             /* start out allowing all accesses */
        uprot = prot | PROT_USER;

        if (fp == NULL) {
                ASSERT(flags & MAP_ANON);
                /* discard lwpchan mappings, like munmap() */
                if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                        lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
                as_rangelock(as);
                error = zmap(as, addrp, len, uprot, flags, pos);
                as_rangeunlock(as);
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (error == 0 && (flags & MAP_SHARED)) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                return (error);
        } else if ((flags & MAP_ANON) != 0)
                return (EINVAL);

        vp = fp->f_vnode;

        /* Can't execute code from "noexec" mounted filesystem. */
        if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
                maxprot &= ~PROT_EXEC;

        /*
         * These checks were added as part of large files.
         *
         * Return ENXIO if the initial position is negative; return EOVERFLOW
         * if (offset + len) would overflow the maximum allowed offset for the
         * type of file descriptor being used.
         */
        if (vp->v_type == VREG) {
                if (pos < 0)
                        return (ENXIO);
                if ((offset_t)len > (OFFSET_MAX(fp) - pos))
                        return (EOVERFLOW);
        }

        if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
                /* no write access allowed */
                maxprot &= ~PROT_WRITE;
        }

        /*
         * XXX - Do we also adjust maxprot based on protections
         * of the vnode?  E.g. if no execute permission is given
         * on the vnode for the current user, maxprot probably
         * should disallow PROT_EXEC also?  This is different
         * from the write access as this would be a per vnode
         * test as opposed to a per fd test for writability.
         */

        /*
         * Verify that the specified protections are not greater than
         * the maximum allowable protections.  Also test to make sure
         * that the file descriptor does allows for read access since
         * "write only" mappings are hard to do since normally we do
         * the read from the file before the page can be written.
         */
        if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
                return (EACCES);

        /*
         * If the user specified an address, do some simple checks here
         */
        if ((flags & MAP_FIXED) != 0) {
                caddr_t userlimit;

                /*
                 * Use the user address.  First verify that
                 * the address to be used is page aligned.
                 * Then make some simple bounds checks.
                 */
                if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
                        return (EINVAL);

                userlimit = flags & _MAP_LOW32 ?
                    (caddr_t)USERLIMIT32 : as->a_userlimit;
                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
                case RANGE_OKAY:
                        break;
                case RANGE_BADPROT:
                        return (ENOTSUP);
                case RANGE_BADADDR:
                default:
                        return (ENOMEM);
                }
        }

        if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
            nbl_need_check(vp)) {
                int svmand;
                nbl_op_t nop;

                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                error = nbl_svmand(vp, fp->f_cred, &svmand);
                if (error != 0)
                        goto done;
                if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
                        if (prot & (PROT_READ | PROT_EXEC)) {
                                nop = NBL_READWRITE;
                        } else {
                                nop = NBL_WRITE;
                        }
                } else {
                        nop = NBL_READ;
                }
                if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
                        error = EACCES;
                        goto done;
                }
        }

        /* discard lwpchan mappings, like munmap() */
        if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
                lwpchan_delete_mapping(curproc, *addrp, *addrp + len);

        /*
         * Ok, now let the vnode map routine do its thing to set things up.
         */
        error = fop_map(vp, pos, as,
            addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

        if (error == 0) {
                /*
                 * Tell machine specific code that lwp has mapped shared memory
                 */
                if (flags & MAP_SHARED) {
                        /* EMPTY */
                        LWP_MMODEL_SHARED_AS(*addrp, len);
                }
                if (vp->v_type == VREG &&
                    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
                        /*
                         * Mark this as an executable vnode
                         */
                        mutex_enter(&vp->v_lock);
                        vp->v_flag |= VVMEXEC;
                        mutex_exit(&vp->v_lock);
                }
        }

done:
        if (in_crit)
                nbl_end_crit(vp);
        return (error);
}

#ifdef _LP64
/*
 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 *
 * The "large file" mmap routine mmap64(2) is also mapped to this routine
 * by the 64-bit version of libc.
 *
 * Eventually, this should be the only version, and have smmap_common()
 * folded back into it again.  Some day.
 */
caddr_t
smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
{
        struct file *fp;
        int error;

        if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&addr, len, prot, flags,
                    NULL, (offset_t)pos);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&addr, len, prot, flags,
                    fp, (offset_t)pos);
                releasef(fd);
        } else
                error = EBADF;

        return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
}
#endif  /* _LP64 */

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)

/*
 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 */
caddr_t
smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
{
        struct file *fp;
        int error;
        caddr_t a = (caddr_t)(uintptr_t)addr;

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&a, (size_t)len, prot,
                    flags | _MAP_LOW32, NULL, (offset_t)pos);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&a, (size_t)len, prot,
                    flags | _MAP_LOW32, fp, (offset_t)pos);
                releasef(fd);
        } else
                error = EBADF;

        ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);

        return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
}

/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 */

struct mmaplf32a {
        caddr_t addr;
        size_t len;
#ifdef _LP64
        /*
         * 32-bit contents, 64-bit cells
         */
        uint64_t prot;
        uint64_t flags;
        uint64_t fd;
        uint64_t offhi;
        uint64_t offlo;
#else
        /*
         * 32-bit contents, 32-bit cells
         */
        uint32_t prot;
        uint32_t flags;
        uint32_t fd;
        uint32_t offhi;
        uint32_t offlo;
#endif
};

int
smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
{
        struct file *fp;
        int error;
        caddr_t a = uap->addr;
        int flags = (int)uap->flags;
        int fd = (int)uap->fd;
#ifdef _BIG_ENDIAN
        offset_t off = ((uoff_t)uap->offhi << 32) | (uoff_t)uap->offlo;
#else
        offset_t off = ((uoff_t)uap->offlo << 32) | (uoff_t)uap->offhi;
#endif

        if (flags & _MAP_LOW32)
                error = EINVAL;
        else if (fd == -1 && (flags & MAP_ANON) != 0)
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, NULL, off);
        else if ((fp = getf(fd)) != NULL) {
                error = smmap_common(&a, uap->len, (int)uap->prot,
                    flags | _MAP_LOW32, fp, off);
                releasef(fd);
        } else
                error = EBADF;

        if (error == 0)
                rvp->r_val1 = (uintptr_t)a;
        return (error);
}

#endif  /* _SYSCALL32_IMPL || _ILP32 */

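/*
 * munmap(2) system call: unmap the range [addr, addr + len) from the
 * calling process's address space.
 */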
int
munmap(caddr_t addr, size_t len)
{
        struct proc *p = curproc;
        struct as *as = p->p_as;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(EINVAL));

        /*
         * Discard lwpchan mappings.
         */
        if (p->p_lcp != NULL)
                lwpchan_delete_mapping(p, addr, addr + len);
        if (as_unmap(as, addr, len) != 0)
                return (set_errno(EINVAL));

        return (0);
}

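/*
 * mprotect(2) system call: change the protections of the range
 * [addr, addr + len).
 */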
int
mprotect(caddr_t addr, size_t len, int prot)
{
        struct as *as = curproc->p_as;
        uint_t uprot = prot | PROT_USER;
        int error;

        if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
                return (set_errno(EINVAL));

        switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
        case RANGE_OKAY:
                break;
        case RANGE_BADPROT:
                return (set_errno(ENOTSUP));
        case RANGE_BADADDR:
        default:
                return (set_errno(ENOMEM));
        }

        error = as_setprot(as, addr, len, uprot);
        if (error)
                return (set_errno(error));
        return (0);
}

#define MC_CACHE        128                     /* internal result buffer */
#define MC_QUANTUM      (MC_CACHE * PAGESIZE)   /* addresses covered in loop */

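/*
 * mincore(2) system call: report, one byte per page, whether the pages of
 * [addr, addr + len) are resident in memory.
 */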
int
mincore(caddr_t addr, size_t len, char *vecp)
{
        struct as *as = curproc->p_as;
        caddr_t ea;                     /* end address of loop */
        size_t rl;                      /* inner result length */
        char vec[MC_CACHE];             /* local vector cache */
        int error;
        model_t model;
        long llen;

        model = get_udatamodel();
        /*
         * Validate form of address parameters.
         */
        if (model == DATAMODEL_NATIVE) {
                llen = (long)len;
        } else {
                llen = (int32_t)(size32_t)len;
        }
        if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
                return (set_errno(EINVAL));

        if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
                return (set_errno(ENOMEM));

        /*
         * Loop over subranges of interval [addr : addr + len), recovering
         * results internally and then copying them out to caller.  Subrange
         * is based on the size of MC_CACHE, defined above.
         */
        for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
                error = as_incore(as, addr,
                    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
                if (rl != 0) {
                        rl = (rl + PAGESIZE - 1) / PAGESIZE;
                        if (copyout(vec, vecp, rl) != 0)
                                return (set_errno(EFAULT));
                        vecp += rl;
                }
                if (error != 0)
                        return (set_errno(ENOMEM));
        }
        return (0);
}