kernel/os/grow.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
  24  * Copyright 2017 Joyent, Inc.
  25  */
  26
  27 /*
  28  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  29  * Use is subject to license terms.
  30  */
  31
  32 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  33 /*        All Rights Reserved   */
  34
  35 #include <sys/types.h>
  36 #include <sys/inttypes.h>
  37 #include <sys/param.h>
  38 #include <sys/sysmacros.h>
  39 #include <sys/systm.h>
  40 #include <sys/signal.h>
  41 #include <sys/user.h>
  42 #include <sys/errno.h>
  43 #include <sys/var.h>
  44 #include <sys/proc.h>
  45 #include <sys/tuneable.h>
  46 #include <sys/debug.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/cred.h>
  49 #include <sys/vnode.h>
  50 #include <sys/vfs.h>
  51 #include <sys/vm.h>
  52 #include <sys/file.h>
  53 #include <sys/mman.h>
  54 #include <sys/vmparam.h>
  55 #include <sys/fcntl.h>
  56 #include <sys/lwpchan_impl.h>
  57 #include <sys/nbmlock.h>
  58
  59 #include <vm/hat.h>
  60 #include <vm/as.h>
  61 #include <vm/seg.h>
  62 #include <vm/seg_dev.h>
  63 #include <vm/seg_vn.h>
  64
/*
 * Switches for automatic large page selection.  brk() only goes through
 * brk_lpg(), and grow() only through grow_lpg(), when the corresponding
 * variable is non-zero and the process has SAUTOLPG set in p_flag;
 * otherwise the process's current page size code is handed directly to
 * brk_internal() / grow_internal().
 */
int use_brk_lpg = 1;
int use_stk_lpg = 1;

/*
 * If set, we will not randomize mappings where the 'addr' argument is
 * non-NULL and MAP_ALIGN is not set, i.e. where 'addr' is an address
 * hint rather than an alignment (see RANDOMIZABLE_MAPPING()).
 */
int aslr_respect_mmap_hint = 1;

/* Large page aware helpers, defined later in this file. */
static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);
  76
  77 intptr_t
  78 brk(caddr_t nva)
  79 {
  80         int error;
  81         proc_t *p = curproc;
  82
  83         /*
  84          * Serialize brk operations on an address space.
  85          * This also serves as the lock protecting p_brksize
  86          * and p_brkpageszc.
  87          */
  88         as_rangelock(p->p_as);
  89
  90         /*
  91          * As a special case to aid the implementation of sbrk(3C), if given a
  92          * new brk of 0, return the current brk.  We'll hide this in brk(3C).
  93          */
  94         if (nva == 0) {
  95                 intptr_t base = (intptr_t)(p->p_brkbase + p->p_brksize);
  96                 as_rangeunlock(p->p_as);
  97                 return (base);
  98         }
  99
 100         if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
 101                 error = brk_lpg(nva);
 102         } else {
 103                 error = brk_internal(nva, p->p_brkpageszc);
 104         }
 105         as_rangeunlock(p->p_as);
 106         return ((error != 0 ? set_errno(error) : 0));
 107 }
 108
 109 /*
 110  * Algorithm: call arch-specific map_pgsz to get best page size to use,
 111  * then call brk_internal().
 112  * Returns 0 on success.
 113  */
 114 static int
 115 brk_lpg(caddr_t nva)
 116 {
 117         struct proc *p = curproc;
 118         size_t pgsz, len;
 119         caddr_t addr, brkend;
 120         caddr_t bssbase = p->p_bssbase;
 121         caddr_t brkbase = p->p_brkbase;
 122         int oszc, szc;
 123         int err;
 124
 125         oszc = p->p_brkpageszc;
 126
 127         /*
 128          * If p_brkbase has not yet been set, the first call
 129          * to brk_internal() will initialize it.
 130          */
 131         if (brkbase == 0) {
 132                 return (brk_internal(nva, oszc));
 133         }
 134
 135         len = nva - bssbase;
 136
 137         pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
 138         szc = page_szc(pgsz);
 139
 140         /*
 141          * Covers two cases:
 142          * 1. page_szc() returns -1 for invalid page size, so we want to
 143          * ignore it in that case.
 144          * 2. By design we never decrease page size, as it is more stable.
 145          */
 146         if (szc <= oszc) {
 147                 err = brk_internal(nva, oszc);
 148                 /* If failed, back off to base page size. */
 149                 if (err != 0 && oszc != 0) {
 150                         err = brk_internal(nva, 0);
 151                 }
 152                 return (err);
 153         }
 154
 155         err = brk_internal(nva, szc);
 156         /* If using szc failed, map with base page size and return. */
 157         if (err != 0) {
 158                 if (szc != 0) {
 159                         err = brk_internal(nva, 0);
 160                 }
 161                 return (err);
 162         }
 163
 164         /*
 165          * Round up brk base to a large page boundary and remap
 166          * anything in the segment already faulted in beyond that
 167          * point.
 168          */
 169         addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
 170         brkend = brkbase + p->p_brksize;
 171         len = brkend - addr;
 172         /* Check that len is not negative. Update page size code for heap. */
 173         if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
 174                 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
 175                 p->p_brkpageszc = szc;
 176         }
 177
 178         ASSERT(err == 0);
 179         return (err);           /* should always be 0 */
 180 }
 181
 182 /*
 183  * Returns 0 on success.
 184  */
 185 int
 186 brk_internal(caddr_t nva, uint_t brkszc)
 187 {
 188         caddr_t ova;                    /* current break address */
 189         size_t size;
 190         int     error;
 191         struct proc *p = curproc;
 192         struct as *as = p->p_as;
 193         size_t pgsz;
 194         uint_t szc;
 195         rctl_qty_t as_rctl;
 196
 197         /*
 198          * extend heap to brkszc alignment but use current p->p_brkpageszc
 199          * for the newly created segment. This allows the new extension
 200          * segment to be concatenated successfully with the existing brk
 201          * segment.
 202          */
 203         if ((szc = brkszc) != 0) {
 204                 pgsz = page_get_pagesize(szc);
 205                 ASSERT(pgsz > PAGESIZE);
 206         } else {
 207                 pgsz = PAGESIZE;
 208         }
 209
 210         mutex_enter(&p->p_lock);
 211         as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
 212             p->p_rctls, p);
 213         mutex_exit(&p->p_lock);
 214
 215         /*
 216          * If p_brkbase has not yet been set, the first call
 217          * to brk() will initialize it.
 218          */
 219         if (p->p_brkbase == 0)
 220                 p->p_brkbase = nva;
 221
 222         /*
 223          * Before multiple page size support existed p_brksize was the value
 224          * not rounded to the pagesize (i.e. it stored the exact user request
 225          * for heap size). If pgsz is greater than PAGESIZE calculate the
 226          * heap size as the real new heap size by rounding it up to pgsz.
 227          * This is useful since we may want to know where the heap ends
 228          * without knowing heap pagesize (e.g. some old code) and also if
 229          * heap pagesize changes we can update p_brkpageszc but delay adding
 230          * new mapping yet still know from p_brksize where the heap really
 231          * ends. The user requested heap end is stored in libc variable.
 232          */
 233         if (pgsz > PAGESIZE) {
 234                 caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
 235                 size = tnva - p->p_brkbase;
 236                 if (tnva < p->p_brkbase || (size > p->p_brksize &&
 237                     size > (size_t)as_rctl)) {
 238                         szc = 0;
 239                         pgsz = PAGESIZE;
 240                         size = nva - p->p_brkbase;
 241                 }
 242         } else {
 243                 size = nva - p->p_brkbase;
 244         }
 245
 246         /*
 247          * use PAGESIZE to roundup ova because we want to know the real value
 248          * of the current heap end in case p_brkpageszc changes since the last
 249          * p_brksize was computed.
 250          */
 251         nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
 252         ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
 253             PAGESIZE);
 254
 255         if ((nva < p->p_brkbase) || (size > p->p_brksize &&
 256             size > as_rctl)) {
 257                 mutex_enter(&p->p_lock);
 258                 (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
 259                     RCA_SAFE);
 260                 mutex_exit(&p->p_lock);
 261                 return (ENOMEM);
 262         }
 263
 264         if (nva > ova) {
 265                 struct segvn_crargs crargs =
 266                     SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 267
 268                 if (!(p->p_datprot & PROT_EXEC)) {
 269                         crargs.prot &= ~PROT_EXEC;
 270                 }
 271
 272                 /*
 273                  * Add new zfod mapping to extend UNIX data segment
 274                  * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
 275                  * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
 276                  * page sizes if ova is not aligned to szc's pgsz.
 277                  */
 278                 if (szc > 0) {
 279                         caddr_t rbss;
 280
 281                         rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
 282                             pgsz);
 283                         if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
 284                                 crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
 285                                     AS_MAP_NO_LPOOB;
 286                         } else if (ova == rbss) {
 287                                 crargs.szc = szc;
 288                         } else {
 289                                 crargs.szc = AS_MAP_HEAP;
 290                         }
 291                 } else {
 292                         crargs.szc = AS_MAP_NO_LPOOB;
 293                 }
 294                 crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
 295                 error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
 296                     &crargs);
 297                 if (error) {
 298                         return (error);
 299                 }
 300
 301         } else if (nva < ova) {
 302                 /*
 303                  * Release mapping to shrink UNIX data segment.
 304                  */
 305                 (void) as_unmap(as, nva, (size_t)(ova - nva));
 306         }
 307         p->p_brksize = size;
 308         return (0);
 309 }
 310
 311 /*
 312  * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
 313  * This routine assumes that the stack grows downward.
 314  */
 315 int
 316 grow(caddr_t sp)
 317 {
 318         struct proc *p = curproc;
 319         struct as *as = p->p_as;
 320         size_t oldsize = p->p_stksize;
 321         size_t newsize;
 322         int err;
 323
 324         /*
 325          * Serialize grow operations on an address space.
 326          * This also serves as the lock protecting p_stksize
 327          * and p_stkpageszc.
 328          */
 329         as_rangelock(as);
 330         if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
 331                 err = grow_lpg(sp);
 332         } else {
 333                 err = grow_internal(sp, p->p_stkpageszc);
 334         }
 335         newsize = p->p_stksize;
 336         as_rangeunlock(as);
 337
 338         if (err == 0 && newsize > oldsize) {
 339                 ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
 340                 ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
 341                 /*
 342                  * Set up translations so the process doesn't have to fault in
 343                  * the stack pages we just gave it.
 344                  */
 345                 (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
 346                     newsize - oldsize, F_INVAL, S_WRITE);
 347         }
 348         return ((err == 0 ? 1 : 0));
 349 }
 350
 351 /*
 352  * Algorithm: call arch-specific map_pgsz to get best page size to use,
 353  * then call grow_internal().
 354  * Returns 0 on success.
 355  */
 356 static int
 357 grow_lpg(caddr_t sp)
 358 {
 359         struct proc *p = curproc;
 360         size_t pgsz;
 361         size_t len, newsize;
 362         caddr_t addr, saddr;
 363         caddr_t growend;
 364         int oszc, szc;
 365         int err;
 366
 367         newsize = p->p_usrstack - sp;
 368
 369         oszc = p->p_stkpageszc;
 370         pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
 371         szc = page_szc(pgsz);
 372
 373         /*
 374          * Covers two cases:
 375          * 1. page_szc() returns -1 for invalid page size, so we want to
 376          * ignore it in that case.
 377          * 2. By design we never decrease page size, as it is more stable.
 378          * This shouldn't happen as the stack never shrinks.
 379          */
 380         if (szc <= oszc) {
 381                 err = grow_internal(sp, oszc);
 382                 /* failed, fall back to base page size */
 383                 if (err != 0 && oszc != 0) {
 384                         err = grow_internal(sp, 0);
 385                 }
 386                 return (err);
 387         }
 388
 389         /*
 390          * We've grown sufficiently to switch to a new page size.
 391          * So we are going to remap the whole segment with the new page size.
 392          */
 393         err = grow_internal(sp, szc);
 394         /* The grow with szc failed, so fall back to base page size. */
 395         if (err != 0) {
 396                 if (szc != 0) {
 397                         err = grow_internal(sp, 0);
 398                 }
 399                 return (err);
 400         }
 401
 402         /*
 403          * Round up stack pointer to a large page boundary and remap
 404          * any pgsz pages in the segment already faulted in beyond that
 405          * point.
 406          */
 407         saddr = p->p_usrstack - p->p_stksize;
 408         addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
 409         growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
 410         len = growend - addr;
 411         /* Check that len is not negative. Update page size code for stack. */
 412         if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
 413                 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
 414                 p->p_stkpageszc = szc;
 415         }
 416
 417         ASSERT(err == 0);
 418         return (err);           /* should always be 0 */
 419 }
 420
 421 /*
 422  * This routine assumes that the stack grows downward.
 423  * Returns 0 on success, errno on failure.
 424  */
 425 int
 426 grow_internal(caddr_t sp, uint_t growszc)
 427 {
 428         struct proc *p = curproc;
 429         size_t newsize;
 430         size_t oldsize;
 431         uintptr_t new_start;
 432         int    error;
 433         size_t pgsz;
 434         uint_t szc;
 435         struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 436
 437         ASSERT(sp < p->p_usrstack);
 438         sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
 439
 440         /*
 441          * grow to growszc alignment but use current p->p_stkpageszc for
 442          * the segvn_crargs szc passed to segvn_create. For memcntl to
 443          * increase the szc, this allows the new extension segment to be
 444          * concatenated successfully with the existing stack segment.
 445          */
 446         if ((szc = growszc) != 0) {
 447                 pgsz = page_get_pagesize(szc);
 448                 ASSERT(pgsz > PAGESIZE);
 449                 newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
 450                 if (newsize > (size_t)p->p_stk_ctl) {
 451                         szc = 0;
 452                         pgsz = PAGESIZE;
 453                         newsize = p->p_usrstack - sp;
 454                 }
 455         } else {
 456                 pgsz = PAGESIZE;
 457                 newsize = p->p_usrstack - sp;
 458         }
 459
 460         if (newsize > (size_t)p->p_stk_ctl) {
 461                 (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
 462                     RCA_UNSAFE_ALL);
 463
 464                 return (ENOMEM);
 465         }
 466
 467         oldsize = p->p_stksize;
 468         ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
 469
 470         if (newsize <= oldsize) {       /* prevent the stack from shrinking */
 471                 return (0);
 472         }
 473
 474         if (!(p->p_stkprot & PROT_EXEC)) {
 475                 crargs.prot &= ~PROT_EXEC;
 476         }
 477         /*
 478          * extend stack with the proposed new growszc, which is different
 479          * than p_stkpageszc only on a memcntl to increase the stack pagesize.
 480          * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
 481          * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
 482          * if not aligned to szc's pgsz.
 483          */
 484         if (szc > 0) {
 485                 caddr_t oldsp = p->p_usrstack - oldsize;
 486                 caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
 487                     pgsz);
 488
 489                 if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
 490                         crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
 491                             AS_MAP_NO_LPOOB;
 492                 } else if (oldsp == austk) {
 493                         crargs.szc = szc;
 494                 } else {
 495                         crargs.szc = AS_MAP_STACK;
 496                 }
 497         } else {
 498                 crargs.szc = AS_MAP_NO_LPOOB;
 499         }
 500         crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
 501
 502         /*
 503          * The stack is about to grow into its guard.  This can be acceptable
 504          * if the size restriction on the stack has been expanded since its
 505          * initialization during exec().  In such cases, the guard segment will
 506          * be shrunk, provided the new size is reasonable.
 507          */
 508         new_start = (uintptr_t)p->p_usrstack - newsize;
 509         if (p->p_stkg_start != 0 && new_start > p->p_stkg_start &&
 510             new_start < p->p_stkg_end) {
 511                 const size_t unmap_sz = p->p_stkg_end - new_start;
 512                 const size_t remain_sz = new_start - p->p_stkg_start;
 513                 extern size_t stack_guard_min_sz;
 514
 515                 /* Do not allow the guard to shrink below minimum size */
 516                 if (remain_sz < stack_guard_min_sz) {
 517                         return (ENOMEM);
 518                 }
 519
 520                 error = as_unmap(p->p_as, (caddr_t)new_start, unmap_sz);
 521                 if (error != 0) {
 522                         return (error);
 523                 }
 524                 p->p_stkg_end -= unmap_sz;
 525         }
 526
 527         if ((error = as_map(p->p_as, (caddr_t)new_start, newsize - oldsize,
 528             segvn_create, &crargs)) != 0) {
 529                 if (error == EAGAIN) {
 530                         cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
 531                             "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
 532                 }
 533                 return (error);
 534         }
 535         p->p_stksize = newsize;
 536         return (0);
 537 }
 538
 539 /*
 540  * Find address for user to map.  If MAP_FIXED is not specified, we can pick
 541  * any address we want, but we will first try the value in *addrp if it is
 542  * non-NULL and _MAP_RANDOMIZE is not set.  Thus this is implementing a way to
 543  * try and get a preferred address.
 544  */
 545 int
 546 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
 547     int vacalign, uint_t flags)
 548 {
 549         caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
 550         size_t lenp = len;
 551
 552         ASSERT(AS_ISCLAIMGAP(as));      /* searches should be serialized */
 553         if (flags & MAP_FIXED) {
 554                 (void) as_unmap(as, *addrp, len);
 555                 return (0);
 556         } else if (basep != NULL &&
 557             ((flags & (MAP_ALIGN | _MAP_RANDOMIZE)) == 0) &&
 558             !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
 559                 /* User supplied address was available */
 560                 *addrp = basep;
 561         } else {
 562                 /*
 563                  * No user supplied address or the address supplied was not
 564                  * available.
 565                  */
 566                 map_addr(addrp, len, off, vacalign, flags);
 567         }
 568         if (*addrp == NULL)
 569                 return (ENOMEM);
 570         return (0);
 571 }
 572
 573
 574 /*
 575  * Used for MAP_ANON - fast way to get anonymous pages
 576  */
 577 static int
 578 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
 579     offset_t pos)
 580 {
 581         struct segvn_crargs vn_a;
 582         int error;
 583
 584         if (((PROT_ALL & uprot) != uprot))
 585                 return (EACCES);
 586
 587         if ((flags & MAP_FIXED) != 0) {
 588                 caddr_t userlimit;
 589
 590                 /*
 591                  * Use the user address.  First verify that
 592                  * the address to be used is page aligned.
 593                  * Then make some simple bounds checks.
 594                  */
 595                 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
 596                         return (EINVAL);
 597
 598                 userlimit = flags & _MAP_LOW32 ?
 599                     (caddr_t)USERLIMIT32 : as->a_userlimit;
 600                 switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
 601                 case RANGE_OKAY:
 602                         break;
 603                 case RANGE_BADPROT:
 604                         return (ENOTSUP);
 605                 case RANGE_BADADDR:
 606                 default:
 607                         return (ENOMEM);
 608                 }
 609         }
 610         /*
 611          * No need to worry about vac alignment for anonymous
 612          * pages since this is a "clone" object that doesn't
 613          * yet exist.
 614          */
 615         error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
 616         if (error != 0) {
 617                 return (error);
 618         }
 619
 620         /*
 621          * Use the seg_vn segment driver; passing in the NULL amp
 622          * gives the desired "cloning" effect.
 623          */
 624         vn_a.vp = NULL;
 625         vn_a.offset = 0;
 626         vn_a.type = flags & MAP_TYPE;
 627         vn_a.prot = uprot;
 628         vn_a.maxprot = PROT_ALL;
 629         vn_a.flags = flags & ~MAP_TYPE;
 630         vn_a.cred = CRED();
 631         vn_a.amp = NULL;
 632         vn_a.szc = 0;
 633         vn_a.lgrp_mem_policy_flags = 0;
 634
 635         return (as_map(as, *addrp, len, segvn_create, &vn_a));
 636 }
 637
 638 #define RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \
 639         !(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint))
 640
 641 static int
 642 smmap_common(caddr_t *addrp, size_t len,
 643     int prot, int flags, struct file *fp, offset_t pos)
 644 {
 645         struct vnode *vp;
 646         struct as *as = curproc->p_as;
 647         uint_t uprot, maxprot, type;
 648         int error;
 649         int in_crit = 0;
 650
 651         if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED |
 652             _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
 653             MAP_TEXT | MAP_INITDATA)) != 0) {
 654                 /* | MAP_RENAME */      /* not implemented, let user know */
 655                 return (EINVAL);
 656         }
 657
 658         if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
 659                 return (EINVAL);
 660         }
 661
 662         if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
 663                 return (EINVAL);
 664         }
 665
 666         if ((flags & (MAP_FIXED | _MAP_RANDOMIZE)) ==
 667             (MAP_FIXED | _MAP_RANDOMIZE)) {
 668                 return (EINVAL);
 669         }
 670
 671         /*
 672          * If it's not a fixed allocation and mmap ASLR is enabled, randomize
 673          * it.
 674          */
 675         if (RANDOMIZABLE_MAPPING(*addrp, flags) &&
 676             secflag_enabled(curproc, PROC_SEC_ASLR))
 677                 flags |= _MAP_RANDOMIZE;
 678
 679         type = flags & MAP_TYPE;
 680         if (type != MAP_PRIVATE && type != MAP_SHARED)
 681                 return (EINVAL);
 682
 683
 684         if (flags & MAP_ALIGN) {
 685                 if (flags & MAP_FIXED)
 686                         return (EINVAL);
 687
 688                 /* alignment needs to be a power of 2 >= page size */
 689                 if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
 690                     !ISP2((uintptr_t)*addrp))
 691                         return (EINVAL);
 692         }
 693         /*
 694          * Check for bad lengths and file position.
 695          * We let the fop_map routine check for negative lengths
 696          * since on some vnode types this might be appropriate.
 697          */
 698         if (len == 0 || (pos & (uoff_t)PAGEOFFSET) != 0)
 699                 return (EINVAL);
 700
 701         maxprot = PROT_ALL;             /* start out allowing all accesses */
 702         uprot = prot | PROT_USER;
 703
 704         if (fp == NULL) {
 705                 ASSERT(flags & MAP_ANON);
 706                 /* discard lwpchan mappings, like munmap() */
 707                 if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
 708                         lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
 709                 as_rangelock(as);
 710                 error = zmap(as, addrp, len, uprot, flags, pos);
 711                 as_rangeunlock(as);
 712                 /*
 713                  * Tell machine specific code that lwp has mapped shared memory
 714                  */
 715                 if (error == 0 && (flags & MAP_SHARED)) {
 716                         /* EMPTY */
 717                         LWP_MMODEL_SHARED_AS(*addrp, len);
 718                 }
 719                 return (error);
 720         } else if ((flags & MAP_ANON) != 0)
 721                 return (EINVAL);
 722
 723         vp = fp->f_vnode;
 724
 725         /* Can't execute code from "noexec" mounted filesystem. */
 726         if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
 727                 maxprot &= ~PROT_EXEC;
 728
 729         /*
 730          * These checks were added as part of large files.
 731          *
 732          * Return ENXIO if the initial position is negative; return EOVERFLOW
 733          * if (offset + len) would overflow the maximum allowed offset for the
 734          * type of file descriptor being used.
 735          */
 736         if (vp->v_type == VREG) {
 737                 if (pos < 0)
 738                         return (ENXIO);
 739                 if ((offset_t)len > (OFFSET_MAX(fp) - pos))
 740                         return (EOVERFLOW);
 741         }
 742
 743         if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
 744                 /* no write access allowed */
 745                 maxprot &= ~PROT_WRITE;
 746         }
 747
 748         /*
 749          * XXX - Do we also adjust maxprot based on protections
 750          * of the vnode?  E.g. if no execute permission is given
 751          * on the vnode for the current user, maxprot probably
 752          * should disallow PROT_EXEC also?  This is different
 753          * from the write access as this would be a per vnode
 754          * test as opposed to a per fd test for writability.
 755          */
 756
 757         /*
 758          * Verify that the specified protections are not greater than
 759          * the maximum allowable protections.  Also test to make sure
 760          * that the file descriptor does allows for read access since
 761          * "write only" mappings are hard to do since normally we do
 762          * the read from the file before the page can be written.
 763          */
 764         if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
 765                 return (EACCES);
 766
 767         /*
 768          * If the user specified an address, do some simple checks here
 769          */
 770         if ((flags & MAP_FIXED) != 0) {
 771                 caddr_t userlimit;
 772
 773                 /*
 774                  * Use the user address.  First verify that
 775                  * the address to be used is page aligned.
 776                  * Then make some simple bounds checks.
 777                  */
 778                 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
 779                         return (EINVAL);
 780
 781                 userlimit = flags & _MAP_LOW32 ?
 782                     (caddr_t)USERLIMIT32 : as->a_userlimit;
 783                 switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
 784                 case RANGE_OKAY:
 785                         break;
 786                 case RANGE_BADPROT:
 787                         return (ENOTSUP);
 788                 case RANGE_BADADDR:
 789                 default:
 790                         return (ENOMEM);
 791                 }
 792         }
 793
 794         if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
 795             nbl_need_check(vp)) {
 796                 int svmand;
 797                 nbl_op_t nop;
 798
 799                 nbl_start_crit(vp, RW_READER);
 800                 in_crit = 1;
 801                 error = nbl_svmand(vp, fp->f_cred, &svmand);
 802                 if (error != 0)
 803                         goto done;
 804                 if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
 805                         if (prot & (PROT_READ | PROT_EXEC)) {
 806                                 nop = NBL_READWRITE;
 807                         } else {
 808                                 nop = NBL_WRITE;
 809                         }
 810                 } else {
 811                         nop = NBL_READ;
 812                 }
 813                 if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
 814                         error = EACCES;
 815                         goto done;
 816                 }
 817         }
 818
 819         /* discard lwpchan mappings, like munmap() */
 820         if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
 821                 lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
 822
 823         /*
 824          * Ok, now let the vnode map routine do its thing to set things up.
 825          */
 826         error = fop_map(vp, pos, as,
 827             addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
 828
 829         if (error == 0) {
 830                 /*
 831                  * Tell machine specific code that lwp has mapped shared memory
 832                  */
 833                 if (flags & MAP_SHARED) {
 834                         /* EMPTY */
 835                         LWP_MMODEL_SHARED_AS(*addrp, len);
 836                 }
 837                 if (vp->v_type == VREG &&
 838                     (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
 839                         /*
 840                          * Mark this as an executable vnode
 841                          */
 842                         mutex_enter(&vp->v_lock);
 843                         vp->v_flag |= VVMEXEC;
 844                         mutex_exit(&vp->v_lock);
 845                 }
 846         }
 847
 848 done:
 849         if (in_crit)
 850                 nbl_end_crit(vp);
 851         return (error);
 852 }
 853
 854 #ifdef _LP64
 855 /*
 856  * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 857  *
 858  * The "large file" mmap routine mmap64(2) is also mapped to this routine
 859  * by the 64-bit version of libc.
 860  *
 861  * Eventually, this should be the only version, and have smmap_common()
 862  * folded back into it again.  Some day.
 863  */
 864 caddr_t
 865 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
 866 {
 867         struct file *fp;
 868         int error;
 869
 870         if (fd == -1 && (flags & MAP_ANON) != 0)
 871                 error = smmap_common(&addr, len, prot, flags,
 872                     NULL, (offset_t)pos);
 873         else if ((fp = getf(fd)) != NULL) {
 874                 error = smmap_common(&addr, len, prot, flags,
 875                     fp, (offset_t)pos);
 876                 releasef(fd);
 877         } else
 878                 error = EBADF;
 879
 880         return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
 881 }
 882 #endif  /* _LP64 */
 883
 884 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
 885
 886 /*
 887  * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 888  */
 889 caddr_t
 890 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
 891 {
 892         struct file *fp;
 893         int error;
 894         caddr_t a = (caddr_t)(uintptr_t)addr;
 895
 896         if (flags & _MAP_LOW32)
 897                 error = EINVAL;
 898         else if (fd == -1 && (flags & MAP_ANON) != 0)
 899                 error = smmap_common(&a, (size_t)len, prot,
 900                     flags | _MAP_LOW32, NULL, (offset_t)pos);
 901         else if ((fp = getf(fd)) != NULL) {
 902                 error = smmap_common(&a, (size_t)len, prot,
 903                     flags | _MAP_LOW32, fp, (offset_t)pos);
 904                 releasef(fd);
 905         } else
 906                 error = EBADF;
 907
 908         ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
 909
 910         return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
 911 }
 912
 913 /*
 914  * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 915  *
 916  * Now things really get ugly because we can't use the C-style
 917  * calling convention for more than 6 args, and 64-bit parameter
 918  * passing on 32-bit systems is less than clean.
 919  */
 920
 921 struct mmaplf32a {
 922         caddr_t addr;
 923         size_t len;
 924 #ifdef _LP64
 925         /*
 926          * 32-bit contents, 64-bit cells
 927          */
 928         uint64_t prot;
 929         uint64_t flags;
 930         uint64_t fd;
 931         uint64_t offhi;
 932         uint64_t offlo;
 933 #else
 934         /*
 935          * 32-bit contents, 32-bit cells
 936          */
 937         uint32_t prot;
 938         uint32_t flags;
 939         uint32_t fd;
 940         uint32_t offhi;
 941         uint32_t offlo;
 942 #endif
 943 };
 944
 945 int
 946 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
 947 {
 948         struct file *fp;
 949         int error;
 950         caddr_t a = uap->addr;
 951         int flags = (int)uap->flags;
 952         int fd = (int)uap->fd;
 953 #ifdef _BIG_ENDIAN
 954         offset_t off = ((uoff_t)uap->offhi << 32) | (uoff_t)uap->offlo;
 955 #else
 956         offset_t off = ((uoff_t)uap->offlo << 32) | (uoff_t)uap->offhi;
 957 #endif
 958
 959         if (flags & _MAP_LOW32)
 960                 error = EINVAL;
 961         else if (fd == -1 && (flags & MAP_ANON) != 0)
 962                 error = smmap_common(&a, uap->len, (int)uap->prot,
 963                     flags | _MAP_LOW32, NULL, off);
 964         else if ((fp = getf(fd)) != NULL) {
 965                 error = smmap_common(&a, uap->len, (int)uap->prot,
 966                     flags | _MAP_LOW32, fp, off);
 967                 releasef(fd);
 968         } else
 969                 error = EBADF;
 970
 971         if (error == 0)
 972                 rvp->r_val1 = (uintptr_t)a;
 973         return (error);
 974 }
 975
 976 #endif  /* _SYSCALL32_IMPL || _ILP32 */
 977
 978 int
 979 munmap(caddr_t addr, size_t len)
 980 {
 981         struct proc *p = curproc;
 982         struct as *as = p->p_as;
 983
 984         if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
 985                 return (set_errno(EINVAL));
 986
 987         if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
 988                 return (set_errno(EINVAL));
 989
 990         /*
 991          * Discard lwpchan mappings.
 992          */
 993         if (p->p_lcp != NULL)
 994                 lwpchan_delete_mapping(p, addr, addr + len);
 995         if (as_unmap(as, addr, len) != 0)
 996                 return (set_errno(EINVAL));
 997
 998         return (0);
 999 }
1000
1001 int
1002 mprotect(caddr_t addr, size_t len, int prot)
1003 {
1004         struct as *as = curproc->p_as;
1005         uint_t uprot = prot | PROT_USER;
1006         int error;
1007
1008         if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
1009                 return (set_errno(EINVAL));
1010
1011         switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
1012         case RANGE_OKAY:
1013                 break;
1014         case RANGE_BADPROT:
1015                 return (set_errno(ENOTSUP));
1016         case RANGE_BADADDR:
1017         default:
1018                 return (set_errno(ENOMEM));
1019         }
1020
1021         error = as_setprot(as, addr, len, uprot);
1022         if (error)
1023                 return (set_errno(error));
1024         return (0);
1025 }
1026
1027 #define MC_CACHE        128                     /* internal result buffer */
1028 #define MC_QUANTUM      (MC_CACHE * PAGESIZE)   /* addresses covered in loop */
1029
1030 int
1031 mincore(caddr_t addr, size_t len, char *vecp)
1032 {
1033         struct as *as = curproc->p_as;
1034         caddr_t ea;                     /* end address of loop */
1035         size_t rl;                      /* inner result length */
1036         char vec[MC_CACHE];             /* local vector cache */
1037         int error;
1038         model_t model;
1039         long    llen;
1040
1041         model = get_udatamodel();
1042         /*
1043          * Validate form of address parameters.
1044          */
1045         if (model == DATAMODEL_NATIVE) {
1046                 llen = (long)len;
1047         } else {
1048                 llen = (int32_t)(size32_t)len;
1049         }
1050         if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1051                 return (set_errno(EINVAL));
1052
1053         if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1054                 return (set_errno(ENOMEM));
1055
1056         /*
1057          * Loop over subranges of interval [addr : addr + len), recovering
1058          * results internally and then copying them out to caller.  Subrange
1059          * is based on the size of MC_CACHE, defined above.
1060          */
1061         for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1062                 error = as_incore(as, addr,
1063                     (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1064                 if (rl != 0) {
1065                         rl = (rl + PAGESIZE - 1) / PAGESIZE;
1066                         if (copyout(vec, vecp, rl) != 0)
1067                                 return (set_errno(EFAULT));
1068                         vecp += rl;
1069                 }
1070                 if (error != 0)
1071                         return (set_errno(ENOMEM));
1072         }
1073         return (0);
1074 }