Unleashed v1.4 - kernel/vm/vm_as.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2018 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
34 * All Rights Reserved
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
38 * contributors.
42 * VM - address spaces.
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/errno.h>
49 #include <sys/systm.h>
50 #include <sys/mman.h>
51 #include <sys/sysmacros.h>
52 #include <sys/cpuvar.h>
53 #include <sys/sysinfo.h>
54 #include <sys/kmem.h>
55 #include <sys/vnode.h>
56 #include <sys/vmsystm.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/vtrace.h>
62 #include <vm/hat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_vn.h>
66 #include <vm/seg_dev.h>
67 #include <vm/seg_kmem.h>
68 #include <vm/seg_map.h>
69 #include <vm/seg_spt.h>
70 #include <vm/seg_hole.h>
71 #include <vm/page.h>
73 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
75 static struct kmem_cache *as_cache;
77 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
78 static void as_clearwatchprot(struct as *, caddr_t, size_t);
82 * Verifying the segment lists is very time-consuming; it may not
83 * always be desirable to define VERIFY_SEGLIST when DEBUG is set.
85 #ifdef DEBUG
86 #define VERIFY_SEGLIST
87 int do_as_verify = 0;
88 #endif
91 * Allocate a new callback data structure entry and fill in the events of
92 * interest, the address range of interest, and the callback argument.
93 * Link the entry on the as->a_callbacks list. A callback entry for the
94 * entire address space may be specified with vaddr = 0 and size = -1.
96 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
97 * the specified as, the caller must guarantee persistence of the specified as
98 * for the duration of this function (eg. pages being locked within the as
99 * will guarantee persistence).
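 *
 * Illustrative sketch (hypothetical names, not taken from this file): a
 * driver that holds long-term locked pages in 'as' might register to be
 * notified before those pages are unmapped:
 *
 *	error = as_add_callback(as, mydrv_as_callback, mydrv_state,
 *	    AS_UNMAP_EVENT | AS_UNMAPWAIT_EVENT, uaddr, ulen, KM_SLEEP);
 *
 * mydrv_as_callback, mydrv_state, uaddr and ulen are illustrative only; the
 * sleepflag argument is handed to kmem_zalloc() below.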
102 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103 caddr_t vaddr, size_t size, int sleepflag)
105 struct as_callback *current_head, *cb;
106 caddr_t saddr;
107 size_t rsize;
109 /* callback function and an event are mandatory */
110 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111 return (EINVAL);
113 /* Adding a callback after as_free has been called is not allowed */
114 if (as == &kas)
115 return (ENOMEM);
118 * vaddr = 0 and size = -1 is used to indicate that the callback range
119 * is the entire address space so no rounding is done in that case.
121 if (size != -1) {
122 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124 (size_t)saddr;
125 /* check for wraparound */
126 if (saddr + rsize < saddr)
127 return (ENOMEM);
128 } else {
129 if (vaddr != 0)
130 return (EINVAL);
131 saddr = vaddr;
132 rsize = size;
135 /* Allocate and initialize a callback entry */
136 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137 if (cb == NULL)
138 return (EAGAIN);
140 cb->ascb_func = cb_func;
141 cb->ascb_arg = arg;
142 cb->ascb_events = events;
143 cb->ascb_saddr = saddr;
144 cb->ascb_len = rsize;
146 /* Add the entry to the list */
147 mutex_enter(&as->a_contents);
148 current_head = as->a_callbacks;
149 as->a_callbacks = cb;
150 cb->ascb_next = current_head;
153 * The call to this function may lose in a race with
154 * a pertinent event - eg. a thread does long term memory locking
155 * but before the callback is added another thread executes as_unmap.
156 * A broadcast here resolves that.
158 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159 AS_CLRUNMAPWAIT(as);
160 cv_broadcast(&as->a_cv);
163 mutex_exit(&as->a_contents);
164 return (0);
168 * Search the callback list for an entry which pertains to arg.
170 * This is called from within the client upon completion of the callback.
171 * RETURN VALUES:
172 * AS_CALLBACK_DELETED (callback entry found and deleted)
173 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175 * entry will be made in as_do_callbacks)
177 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 * set, it indicates that as_do_callbacks is processing this entry. The
179 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 * to unblock as_do_callbacks, in case it is blocked.
182 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
183 * the specified as, the caller must guarantee persistence of the specified as
184 * for the duration of this function (eg. pages being locked within the as
185 * will guarantee persistence).
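 *
 * Illustrative sketch (hypothetical, not taken from this file): once the
 * client no longer needs its callback, it might remove the entry keyed by
 * 'arg' and back off while a deferred delete completes:
 *
 *	while (as_delete_callback(as, arg) == AS_CALLBACK_DELETE_DEFERRED)
 *		delay(1);
 *
 * In the deferred case the entry itself is freed later by
 * as_execute_callback() once the callback has finished running.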
187 uint_t
188 as_delete_callback(struct as *as, void *arg)
190 struct as_callback **prevcb = &as->a_callbacks;
191 struct as_callback *cb;
192 uint_t rc = AS_CALLBACK_NOTFOUND;
194 mutex_enter(&as->a_contents);
195 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 if (cb->ascb_arg != arg)
197 continue;
200 * If the events indicate AS_CALLBACK_CALLED, just clear
201 * AS_ALL_EVENT in the events field and wakeup the thread
202 * that may be waiting in as_do_callbacks. as_do_callbacks
203 * will take care of removing this entry from the list. In
204 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
205 * (AS_CALLBACK_CALLED not set), just remove it from the
206 * list, return the memory and return AS_CALLBACK_DELETED.
208 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 /* leave AS_CALLBACK_CALLED */
210 cb->ascb_events &= ~AS_ALL_EVENT;
211 rc = AS_CALLBACK_DELETE_DEFERRED;
212 cv_broadcast(&as->a_cv);
213 } else {
214 *prevcb = cb->ascb_next;
215 kmem_free(cb, sizeof (struct as_callback));
216 rc = AS_CALLBACK_DELETED;
218 break;
220 mutex_exit(&as->a_contents);
221 return (rc);
225 * Searches the as callback list for a matching entry.
226 * Returns a pointer to the first matching callback, or NULL if
227 * nothing is found.
228 * This function never sleeps, so it is ok to call it with locks
229 * held beyond the (required) a_contents mutex.
231 * See also comment on as_do_callbacks below.
233 static struct as_callback *
234 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235 size_t event_len)
237 struct as_callback *cb;
239 ASSERT(MUTEX_HELD(&as->a_contents));
240 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
242 * If the callback has not already been called, then
243 * check if events or address range pertains. An event_len
244 * of zero means do an unconditional callback.
246 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 (event_addr + event_len < cb->ascb_saddr) ||
249 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 continue;
252 break;
254 return (cb);
258 * Executes a given callback and removes it from the callback list for
259 * this address space.
260 * This function may sleep so the caller must drop all locks except
261 * a_contents before calling this func.
263 * See also comments on as_do_callbacks below.
265 static void
266 as_execute_callback(struct as *as, struct as_callback *cb,
267 uint_t events)
269 struct as_callback **prevcb;
270 void *cb_arg;
272 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273 cb->ascb_events |= AS_CALLBACK_CALLED;
274 mutex_exit(&as->a_contents);
275 (*cb->ascb_func)(as, cb->ascb_arg, events);
276 mutex_enter(&as->a_contents);
278 * the callback function is required to delete the callback
279 * when the callback function determines it is OK for
280 * this thread to continue. as_delete_callback will clear
281 * the AS_ALL_EVENT in the events field when it is deleted.
282 * If the callback function called as_delete_callback,
283 * events will already be cleared and there will be no blocking.
285 while ((cb->ascb_events & events) != 0) {
286 cv_wait(&as->a_cv, &as->a_contents);
289 * This entry needs to be taken off the list. Normally, the
290 * callback func itself does that, but unfortunately the list
291 * may have changed while the callback was running because the
292 * a_contents mutex was dropped and someone else other than the
293 * callback func itself could have called as_delete_callback,
294 * so we have to search to find this entry again. The entry
295 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
297 cb_arg = cb->ascb_arg;
298 prevcb = &as->a_callbacks;
299 for (cb = as->a_callbacks; cb != NULL;
300 prevcb = &cb->ascb_next, cb = *prevcb) {
301 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302 (cb_arg != cb->ascb_arg)) {
303 continue;
305 *prevcb = cb->ascb_next;
306 kmem_free(cb, sizeof (struct as_callback));
307 break;
312 * Check the callback list for a matching event and intersection of
313 * address range. If there is a match invoke the callback. Skip an entry if:
314 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315 * - the event is not of interest
316 * - the address range is not of interest
318 * An event_len of zero indicates a request for an unconditional callback
319 * (regardless of event); only AS_CALLBACK_CALLED is checked. The
320 * a_contents lock must be dropped before a callback, so only one callback
321 * can be done before returning. Return -1 (true) if a callback was
322 * executed and removed from the list, else return 0 (false).
324 * The logically separate parts, i.e. finding a matching callback and
325 * executing a given callback have been separated into two functions
326 * so that they can be called with different sets of locks held beyond
327 * the always-required a_contents. as_find_callback does not sleep so
328 * it is ok to call it if more locks than a_contents (i.e. the a_lock
329 * rwlock) are held. as_execute_callback on the other hand may sleep
330 * so all locks beyond a_contents must be dropped by the caller if one
331 * does not want to end comatose.
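 *
 * Condensed sketch of the split described above, as it appears in callers
 * such as as_unmap() and as_setprot() below ('addr' and 'len' here are
 * placeholders for the affected range):
 *
 *	mutex_enter(&as->a_contents);
 *	if (as->a_callbacks != NULL &&
 *	    (cb = as_find_callback(as, AS_UNMAP_EVENT, addr, len)) != NULL) {
 *		AS_LOCK_EXIT(as);
 *		as_execute_callback(as, cb, AS_UNMAP_EVENT);
 *	}
 *	mutex_exit(&as->a_contents);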
333 static int
334 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335 size_t event_len)
337 struct as_callback *cb;
339 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 as_execute_callback(as, cb, events);
341 return (-1);
343 return (0);
347 * Search for the segment containing addr. If a segment containing addr
348 * exists, that segment is returned. If no such segment exists, and
349 * the list spans addresses greater than addr, then the first segment
350 * whose base is greater than addr is returned; otherwise, NULL is
351 * returned unless tail is true, in which case the last element of the
352 * list is returned.
354 * a_seglast is used to cache the last found segment for repeated
355 * searches to the same addr (which happens frequently).
357 struct seg *
358 as_findseg(struct as *as, caddr_t addr, int tail)
360 struct seg *seg = as->a_seglast;
361 avl_index_t where;
363 ASSERT(AS_LOCK_HELD(as));
365 if (seg != NULL &&
366 seg->s_base <= addr &&
367 addr < seg->s_base + seg->s_size)
368 return (seg);
370 seg = avl_find(&as->a_segtree, &addr, &where);
371 if (seg != NULL)
372 return (as->a_seglast = seg);
374 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 if (seg == NULL && tail)
376 seg = avl_last(&as->a_segtree);
377 return (as->a_seglast = seg);
380 #ifdef VERIFY_SEGLIST
382 * verify that the linked list is coherent
384 static void
385 as_verify(struct as *as)
387 struct seg *seg, *seglast, *p, *n;
388 uint_t nsegs = 0;
390 if (do_as_verify == 0)
391 return;
393 seglast = as->a_seglast;
395 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396 ASSERT(seg->s_as == as);
397 p = AS_SEGPREV(as, seg);
398 n = AS_SEGNEXT(as, seg);
399 ASSERT(p == NULL || p->s_as == as);
400 ASSERT(p == NULL || p->s_base < seg->s_base);
401 ASSERT(n == NULL || n->s_base > seg->s_base);
402 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403 if (seg == seglast)
404 seglast = NULL;
405 nsegs++;
407 ASSERT(seglast == NULL);
408 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
410 #endif /* VERIFY_SEGLIST */
413 * Add a new segment to the address space. The avl_find()
414 * may be expensive, so we attempt to use the last segment accessed
415 * in as_gap() as an insertion point.
418 as_addseg(struct as *as, struct seg *newseg)
420 struct seg *seg;
421 caddr_t addr;
422 caddr_t eaddr;
423 avl_index_t where;
425 ASSERT(AS_WRITE_HELD(as));
427 as->a_updatedir = 1; /* inform /proc */
428 gethrestime(&as->a_updatetime);
430 if (as->a_lastgaphl != NULL) {
431 struct seg *hseg = NULL;
432 struct seg *lseg = NULL;
434 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 hseg = as->a_lastgaphl;
436 lseg = AVL_PREV(&as->a_segtree, hseg);
437 } else {
438 lseg = as->a_lastgaphl;
439 hseg = AVL_NEXT(&as->a_segtree, lseg);
442 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 hseg->s_base > newseg->s_base) {
444 avl_insert_here(&as->a_segtree, newseg, lseg,
445 AVL_AFTER);
446 as->a_lastgaphl = NULL;
447 as->a_seglast = newseg;
448 return (0);
450 as->a_lastgaphl = NULL;
453 addr = newseg->s_base;
454 eaddr = addr + newseg->s_size;
455 again:
457 seg = avl_find(&as->a_segtree, &addr, &where);
459 if (seg == NULL)
460 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
462 if (seg == NULL)
463 seg = avl_last(&as->a_segtree);
465 if (seg != NULL) {
466 caddr_t base = seg->s_base;
469 * If top of seg is below the requested address, then
470 * the insertion point is at the end of the linked list,
471 * and seg points to the tail of the list. Otherwise,
472 * the insertion point is immediately before seg.
474 if (base + seg->s_size > addr) {
475 if (addr >= base || eaddr > base) {
476 return (-1); /* overlapping segment */
480 as->a_seglast = newseg;
481 avl_insert(&as->a_segtree, newseg, where);
483 #ifdef VERIFY_SEGLIST
484 as_verify(as);
485 #endif
486 return (0);
489 struct seg *
490 as_removeseg(struct as *as, struct seg *seg)
492 avl_tree_t *t;
494 ASSERT(AS_WRITE_HELD(as));
496 as->a_updatedir = 1; /* inform /proc */
497 gethrestime(&as->a_updatetime);
499 if (seg == NULL)
500 return (NULL);
502 t = &as->a_segtree;
503 if (as->a_seglast == seg)
504 as->a_seglast = NULL;
505 as->a_lastgaphl = NULL;
508 * if this segment is at an address higher than
509 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
511 if (as->a_lastgap &&
512 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
513 as->a_lastgap = AVL_NEXT(t, seg);
516 * remove the segment from the seg tree
518 avl_remove(t, seg);
520 #ifdef VERIFY_SEGLIST
521 as_verify(as);
522 #endif
523 return (seg);
527 * Find a segment containing addr.
529 struct seg *
530 as_segat(struct as *as, caddr_t addr)
532 struct seg *seg = as->a_seglast;
534 ASSERT(AS_LOCK_HELD(as));
536 if (seg != NULL && seg->s_base <= addr &&
537 addr < seg->s_base + seg->s_size)
538 return (seg);
540 seg = avl_find(&as->a_segtree, &addr, NULL);
541 return (seg);
545 * Serialize all searches for holes in an address space to
546 * prevent two or more threads from allocating the same virtual
547 * address range. The address space must not be "read/write"
548 * locked by the caller since we may block.
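 *
 * Illustrative sketch (hypothetical names, loosely following typical mmap
 * and segmap paths): the gap search and the subsequent as_map() are done
 * under the range lock so another thread cannot claim the same hole:
 *
 *	as_rangelock(as);
 *	map_addr(&addr, len, off, vacalign, flags);
 *	if (addr == NULL) {
 *		as_rangeunlock(as);
 *		return (ENOMEM);
 *	}
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);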
550 void
551 as_rangelock(struct as *as)
553 mutex_enter(&as->a_contents);
554 while (AS_ISCLAIMGAP(as))
555 cv_wait(&as->a_cv, &as->a_contents);
556 AS_SETCLAIMGAP(as);
557 mutex_exit(&as->a_contents);
561 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
563 void
564 as_rangeunlock(struct as *as)
566 mutex_enter(&as->a_contents);
567 AS_CLRCLAIMGAP(as);
568 cv_signal(&as->a_cv);
569 mutex_exit(&as->a_contents);
573 * Compare segments (or just an address) by segment address range.
575 static int
576 as_segcompar(const void *x, const void *y)
578 struct seg *a = (struct seg *)x;
579 struct seg *b = (struct seg *)y;
581 if (a->s_base < b->s_base)
582 return (-1);
583 if (a->s_base >= b->s_base + b->s_size)
584 return (1);
585 return (0);
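/*
 * Note (explanatory aside): because s_base is the first member of struct seg
 * and the comparison above never reads the key's s_size, a lookup may pass a
 * pointer to a bare address as the search key, e.g.
 *
 *	seg = avl_find(&as->a_segtree, &addr, NULL);
 *
 * which returns the segment containing 'addr', if any (see as_segat() above).
 */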
589 void
590 as_avlinit(struct as *as)
592 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
593 offsetof(struct seg, s_tree));
594 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
595 offsetof(struct watched_page, wp_link));
598 /*ARGSUSED*/
599 static int
600 as_constructor(void *buf, void *cdrarg, int kmflags)
602 struct as *as = buf;
604 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
605 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
606 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
607 as_avlinit(as);
608 return (0);
611 /*ARGSUSED1*/
612 static void
613 as_destructor(void *buf, void *cdrarg)
615 struct as *as = buf;
617 avl_destroy(&as->a_segtree);
618 mutex_destroy(&as->a_contents);
619 cv_destroy(&as->a_cv);
620 rw_destroy(&as->a_lock);
623 void
624 as_init(void)
626 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
627 as_constructor, as_destructor, NULL, NULL, NULL, 0);
631 * Allocate and initialize an address space data structure.
632 * We call hat_alloc to allow any machine dependent
633 * information in the hat structure to be initialized.
635 struct as *
636 as_alloc(void)
638 struct as *as;
640 as = kmem_cache_alloc(as_cache, KM_SLEEP);
642 as->a_flags = 0;
643 as->a_vbits = 0;
644 as->a_hrm = NULL;
645 as->a_seglast = NULL;
646 as->a_size = 0;
647 as->a_resvsize = 0;
648 as->a_updatedir = 0;
649 gethrestime(&as->a_updatetime);
650 as->a_objectdir = NULL;
651 as->a_sizedir = 0;
652 as->a_userlimit = (caddr_t)USERLIMIT;
653 as->a_lastgap = NULL;
654 as->a_lastgaphl = NULL;
655 as->a_callbacks = NULL;
656 as->a_proc = NULL;
658 AS_LOCK_ENTER(as, RW_WRITER);
659 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
660 AS_LOCK_EXIT(as);
662 return (as);
666 * Free an address space data structure.
667 * Need to free the hat first and then
668 * all the segments on this as and finally
669 * the space for the as struct itself.
671 void
672 as_free(struct as *as)
674 struct hat *hat = as->a_hat;
675 struct seg *seg, *next;
676 boolean_t free_started = B_FALSE;
678 top:
680 * Invoke ALL callbacks. as_do_callbacks will do one callback
681 * per call, and not return (-1) until the callback has completed.
682 * When as_do_callbacks returns zero, all callbacks have completed.
684 mutex_enter(&as->a_contents);
685 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
688 mutex_exit(&as->a_contents);
689 AS_LOCK_ENTER(as, RW_WRITER);
691 if (!free_started) {
692 free_started = B_TRUE;
693 hat_free_start(hat);
695 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
696 int err;
698 next = AS_SEGNEXT(as, seg);
699 retry:
700 err = segop_unmap(seg, seg->s_base, seg->s_size);
701 if (err == EAGAIN) {
702 mutex_enter(&as->a_contents);
703 if (as->a_callbacks) {
704 AS_LOCK_EXIT(as);
705 } else if (!AS_ISNOUNMAPWAIT(as)) {
707 * Memory is currently locked. Wait for a
708 * cv_signal that it has been unlocked, then
709 * try the operation again.
711 if (AS_ISUNMAPWAIT(as) == 0)
712 cv_broadcast(&as->a_cv);
713 AS_SETUNMAPWAIT(as);
714 AS_LOCK_EXIT(as);
715 while (AS_ISUNMAPWAIT(as))
716 cv_wait(&as->a_cv, &as->a_contents);
717 } else {
719 * We may have raced with
720 * segvn_reclaim()/segspt_reclaim(). In this
721 * case, clear the nounmapwait flag and retry, since
722 * softlockcnt in this segment may already be
723 * 0. We don't drop the as writer lock, so our
724 * number of retries without sleeping should
725 * be very small. See segvn_reclaim() for
726 * more comments.
728 AS_CLRNOUNMAPWAIT(as);
729 mutex_exit(&as->a_contents);
730 goto retry;
732 mutex_exit(&as->a_contents);
733 goto top;
734 } else {
736 * We do not expect any other error return at this
737 * time. This is similar to an ASSERT in seg_unmap()
739 ASSERT(err == 0);
742 hat_free_end(hat);
743 AS_LOCK_EXIT(as);
745 /* /proc stuff */
746 ASSERT(avl_numnodes(&as->a_wpage) == 0);
747 if (as->a_objectdir) {
748 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
749 as->a_objectdir = NULL;
750 as->a_sizedir = 0;
754 * Free the struct as back to kmem. Assert it has no segments.
756 ASSERT(avl_numnodes(&as->a_segtree) == 0);
757 kmem_cache_free(as_cache, as);
761 as_dup(struct as *as, struct proc *forkedproc)
763 struct as *newas;
764 struct seg *seg, *newseg;
765 size_t purgesize = 0;
766 int error;
768 AS_LOCK_ENTER(as, RW_WRITER);
769 as_clearwatch(as);
770 newas = as_alloc();
771 newas->a_userlimit = as->a_userlimit;
772 newas->a_proc = forkedproc;
774 AS_LOCK_ENTER(newas, RW_WRITER);
776 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
778 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
780 if (seg->s_flags & S_PURGE) {
781 purgesize += seg->s_size;
782 continue;
785 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
786 if (newseg == NULL) {
787 AS_LOCK_EXIT(newas);
788 as_setwatch(as);
789 AS_LOCK_EXIT(as);
790 as_free(newas);
791 return (-1);
793 if ((error = segop_dup(seg, newseg)) != 0) {
795 * We call seg_free() on the new seg
796 * because the segment is not set up
797 * completely; i.e. it has no ops.
799 as_setwatch(as);
800 AS_LOCK_EXIT(as);
801 seg_free(newseg);
802 AS_LOCK_EXIT(newas);
803 as_free(newas);
804 return (error);
806 if ((newseg->s_flags & S_HOLE) == 0) {
807 newas->a_size += seg->s_size;
810 newas->a_resvsize = as->a_resvsize - purgesize;
812 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
814 AS_LOCK_EXIT(newas);
816 as_setwatch(as);
817 AS_LOCK_EXIT(as);
818 if (error != 0) {
819 as_free(newas);
820 return (error);
822 forkedproc->p_as = newas;
823 return (0);
827 * Handle a ``fault'' at addr for size bytes.
829 faultcode_t
830 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
831 enum fault_type type, enum seg_rw rw)
833 struct seg *seg;
834 caddr_t raddr; /* rounded down addr */
835 size_t rsize; /* rounded up size */
836 size_t ssize;
837 faultcode_t res = 0;
838 caddr_t addrsav;
839 struct seg *segsav;
840 int as_lock_held;
841 klwp_t *lwp = ttolwp(curthread);
845 retry:
847 * Indicate that the lwp is not to be stopped while waiting for a
848 * pagefault. This is to avoid deadlock while debugging a process
849 * via /proc over NFS (in particular).
851 if (lwp != NULL)
852 lwp->lwp_nostop++;
855 * same length must be used when we softlock and softunlock. We
856 * don't support softunlocking lengths less than the original length
857 * when there is largepage support. See seg_dev.c for more
858 * comments.
860 switch (type) {
862 case F_SOFTLOCK:
863 CPU_STATS_ADD_K(vm, softlock, 1);
864 break;
866 case F_SOFTUNLOCK:
867 break;
869 case F_PROT:
870 CPU_STATS_ADD_K(vm, prot_fault, 1);
871 break;
873 case F_INVAL:
874 CPU_STATS_ENTER_K();
875 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
876 if (as == &kas)
877 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
878 CPU_STATS_EXIT_K();
879 break;
882 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
883 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
884 (size_t)raddr;
887 * XXX -- Don't grab the as lock for segkmap. We should grab it for
888 * correctness, but then we could be stuck holding this lock for
889 * a LONG time if the fault needs to be resolved on a slow
890 * filesystem, and then no-one will be able to exec new commands,
891 * as exec'ing requires the write lock on the as.
893 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
894 raddr + size < segkmap->s_base + segkmap->s_size) {
895 seg = segkmap;
896 as_lock_held = 0;
897 } else {
898 AS_LOCK_ENTER(as, RW_READER);
900 seg = as_segat(as, raddr);
901 if (seg == NULL) {
902 AS_LOCK_EXIT(as);
903 if (lwp != NULL)
904 lwp->lwp_nostop--;
905 return (FC_NOMAP);
908 as_lock_held = 1;
911 addrsav = raddr;
912 segsav = seg;
914 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
915 if (raddr >= seg->s_base + seg->s_size) {
916 seg = AS_SEGNEXT(as, seg);
917 if (seg == NULL || raddr != seg->s_base) {
918 res = FC_NOMAP;
919 break;
922 if (raddr + rsize > seg->s_base + seg->s_size)
923 ssize = seg->s_base + seg->s_size - raddr;
924 else
925 ssize = rsize;
927 res = segop_fault(hat, seg, raddr, ssize, type, rw);
928 if (res != 0)
929 break;
933 * If we were SOFTLOCKing and encountered a failure,
934 * we must SOFTUNLOCK the range we already did. (Maybe we
935 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
936 * right here...)
938 if (res != 0 && type == F_SOFTLOCK) {
939 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
940 if (addrsav >= seg->s_base + seg->s_size)
941 seg = AS_SEGNEXT(as, seg);
942 ASSERT(seg != NULL);
944 * Now call the fault routine again to perform the
945 * unlock using S_OTHER instead of the rw variable
946 * since we never got a chance to touch the pages.
948 if (raddr > seg->s_base + seg->s_size)
949 ssize = seg->s_base + seg->s_size - addrsav;
950 else
951 ssize = raddr - addrsav;
952 (void) segop_fault(hat, seg, addrsav, ssize,
953 F_SOFTUNLOCK, S_OTHER);
956 if (as_lock_held)
957 AS_LOCK_EXIT(as);
958 if (lwp != NULL)
959 lwp->lwp_nostop--;
962 * If the lower levels returned EDEADLK for a fault,
963 * it means that we should retry the fault. Let's also wait
964 * a bit to let the deadlock-causing condition clear.
965 * This is part of a gross hack to work around a design flaw
966 * in the ufs/sds logging code and should go away when the
967 * logging code is re-designed to fix the problem. See bug
968 * 4125102 for details of the problem.
970 if (FC_ERRNO(res) == EDEADLK) {
971 delay(deadlk_wait);
972 res = 0;
973 goto retry;
975 return (res);
981 * Asynchronous ``fault'' at addr for size bytes.
983 faultcode_t
984 as_faulta(struct as *as, caddr_t addr, size_t size)
986 struct seg *seg;
987 caddr_t raddr; /* rounded down addr */
988 size_t rsize; /* rounded up size */
989 faultcode_t res = 0;
990 klwp_t *lwp = ttolwp(curthread);
992 retry:
994 * Indicate that the lwp is not to be stopped while waiting
995 * for a pagefault. This is to avoid deadlock while debugging
996 * a process via /proc over NFS (in particular).
998 if (lwp != NULL)
999 lwp->lwp_nostop++;
1001 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1002 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1003 (size_t)raddr;
1005 AS_LOCK_ENTER(as, RW_READER);
1006 seg = as_segat(as, raddr);
1007 if (seg == NULL) {
1008 AS_LOCK_EXIT(as);
1009 if (lwp != NULL)
1010 lwp->lwp_nostop--;
1011 return (FC_NOMAP);
1014 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1015 if (raddr >= seg->s_base + seg->s_size) {
1016 seg = AS_SEGNEXT(as, seg);
1017 if (seg == NULL || raddr != seg->s_base) {
1018 res = FC_NOMAP;
1019 break;
1022 res = segop_faulta(seg, raddr);
1023 if (res != 0)
1024 break;
1026 AS_LOCK_EXIT(as);
1027 if (lwp != NULL)
1028 lwp->lwp_nostop--;
1030 * If the lower levels returned EDEADLK for a fault,
1031 * it means that we should retry the fault. Let's also wait
1032 * a bit to let the deadlock-causing condition clear.
1033 * This is part of a gross hack to work around a design flaw
1034 * in the ufs/sds logging code and should go away when the
1035 * logging code is re-designed to fix the problem. See bug
1036 * 4125102 for details of the problem.
1038 if (FC_ERRNO(res) == EDEADLK) {
1039 delay(deadlk_wait);
1040 res = 0;
1041 goto retry;
1043 return (res);
1047 * Set the virtual mapping for the interval from [addr : addr + size)
1048 * in address space `as' to have the specified protection.
1049 * It is ok for the range to cross over several segments,
1050 * as long as they are contiguous.
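 *
 * Illustrative sketch (hypothetical caller, e.g. an mprotect-style path;
 * uaddr and ulen are placeholders):
 *
 *	error = as_setprot(as, (caddr_t)uaddr, ulen,
 *	    PROT_READ | PROT_WRITE | PROT_USER);
 *
 * The range is rounded to page boundaries below and may span several
 * contiguous segments.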
1053 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1055 struct seg *seg;
1056 struct as_callback *cb;
1057 size_t ssize;
1058 caddr_t raddr; /* rounded down addr */
1059 size_t rsize; /* rounded up size */
1060 int error = 0, writer = 0;
1061 caddr_t saveraddr;
1062 size_t saversize;
1064 setprot_top:
1065 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1066 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1067 (size_t)raddr;
1069 if (raddr + rsize < raddr) /* check for wraparound */
1070 return (ENOMEM);
1072 saveraddr = raddr;
1073 saversize = rsize;
1076 * Normally we only lock the as as a reader. But
1077 * if due to setprot the segment driver needs to split
1078 * a segment it will return IE_RETRY. Therefore we re-acquire
1079 * the as lock as a writer so the segment driver can change
1080 * the seg list. Also the segment driver will return IE_RETRY
1081 * after it has changed the segment list, so we keep
1082 * locking as a writer. Since these operations should be rare, we
1083 * want to only lock as a writer when necessary.
1085 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1086 AS_LOCK_ENTER(as, RW_WRITER);
1087 } else {
1088 AS_LOCK_ENTER(as, RW_READER);
1091 as_clearwatchprot(as, raddr, rsize);
1092 seg = as_segat(as, raddr);
1093 if (seg == NULL) {
1094 as_setwatch(as);
1095 AS_LOCK_EXIT(as);
1096 return (ENOMEM);
1099 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1100 if (raddr >= seg->s_base + seg->s_size) {
1101 seg = AS_SEGNEXT(as, seg);
1102 if (seg == NULL || raddr != seg->s_base) {
1103 error = ENOMEM;
1104 break;
1107 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1108 ssize = seg->s_base + seg->s_size - raddr;
1109 else
1110 ssize = rsize;
1111 retry:
1112 error = segop_setprot(seg, raddr, ssize, prot);
1114 if (error == IE_NOMEM) {
1115 error = EAGAIN;
1116 break;
1119 if (error == IE_RETRY) {
1120 AS_LOCK_EXIT(as);
1121 writer = 1;
1122 goto setprot_top;
1125 if (error == EAGAIN) {
1127 * Make sure we have a_lock as writer.
1129 if (writer == 0) {
1130 AS_LOCK_EXIT(as);
1131 writer = 1;
1132 goto setprot_top;
1136 * Memory is currently locked. It must be unlocked
1137 * before this operation can succeed through a retry.
1138 * The possible reasons for locked memory and
1139 * corresponding strategies for unlocking are:
1140 * (1) Normal I/O
1141 * wait for a signal that the I/O operation
1142 * has completed and the memory is unlocked.
1143 * (2) Asynchronous I/O
1144 * The aio subsystem does not unlock pages when
1145 * the I/O is completed. Those pages are unlocked
1146 * when the application calls aiowait/aioerror.
1147 * So, to prevent blocking forever, cv_broadcast()
1148 * is done to wake up aio_cleanup_thread.
1149 * Subsequently, segvn_reclaim will be called, and
1150 * that will do AS_CLRUNMAPWAIT() and wake us up.
1151 * (3) Long term page locking:
1152 * Drivers intending to have pages locked for a
1153 * period considerably longer than for normal I/O
1154 * (essentially forever) may have registered for a
1155 * callback so they may unlock these pages on
1156 * request. This is needed to allow this operation
1157 * to succeed. Each entry on the callback list is
1158 * examined. If the event or address range pertains
1159 * the callback is invoked (unless it already is in
1160 * progress). The a_contents lock must be dropped
1161 * before the callback, so only one callback can
1162 * be done at a time. Go to the top and do more
1163 * until zero is returned. If zero is returned,
1164 * either there were no callbacks for this event
1165 * or they were already in progress.
1167 mutex_enter(&as->a_contents);
1168 if (as->a_callbacks &&
1169 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1170 seg->s_base, seg->s_size))) {
1171 AS_LOCK_EXIT(as);
1172 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1173 } else if (!AS_ISNOUNMAPWAIT(as)) {
1174 if (AS_ISUNMAPWAIT(as) == 0)
1175 cv_broadcast(&as->a_cv);
1176 AS_SETUNMAPWAIT(as);
1177 AS_LOCK_EXIT(as);
1178 while (AS_ISUNMAPWAIT(as))
1179 cv_wait(&as->a_cv, &as->a_contents);
1180 } else {
1182 * We may have raced with
1183 * segvn_reclaim()/segspt_reclaim(). In this
1184 * case, clear the nounmapwait flag and retry, since
1185 * softlockcnt in this segment may already be
1186 * 0. We don't drop the as writer lock, so our
1187 * number of retries without sleeping should
1188 * be very small. See segvn_reclaim() for
1189 * more comments.
1191 AS_CLRNOUNMAPWAIT(as);
1192 mutex_exit(&as->a_contents);
1193 goto retry;
1195 mutex_exit(&as->a_contents);
1196 goto setprot_top;
1197 } else if (error != 0)
1198 break;
1200 if (error != 0) {
1201 as_setwatch(as);
1202 } else {
1203 as_setwatchprot(as, saveraddr, saversize, prot);
1205 AS_LOCK_EXIT(as);
1206 return (error);
1210 * Check to make sure that the interval [addr, addr + size)
1211 * in address space `as' has at least the specified protection.
1212 * It is ok for the range to cross over several segments, as long
1213 * as they are contiguous.
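 *
 * Illustrative sketch (hypothetical caller; uaddr and ulen are placeholders):
 * verifying that a user range is readable before operating on it might look
 * like
 *
 *	if ((error = as_checkprot(as, (caddr_t)uaddr, ulen,
 *	    PROT_READ | PROT_USER)) != 0)
 *		return (error);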
1216 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1218 struct seg *seg;
1219 size_t ssize;
1220 caddr_t raddr; /* rounded down addr */
1221 size_t rsize; /* rounded up size */
1222 int error = 0;
1224 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1225 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1226 (size_t)raddr;
1228 if (raddr + rsize < raddr) /* check for wraparound */
1229 return (ENOMEM);
1232 * This is ugly as sin...
1233 * Normally, we only acquire the address space readers lock.
1234 * However, if the address space has watchpoints present,
1235 * we must acquire the writer lock on the address space for
1236 * the benefit of as_clearwatchprot() and as_setwatchprot().
1238 if (avl_numnodes(&as->a_wpage) != 0)
1239 AS_LOCK_ENTER(as, RW_WRITER);
1240 else
1241 AS_LOCK_ENTER(as, RW_READER);
1242 as_clearwatchprot(as, raddr, rsize);
1243 seg = as_segat(as, raddr);
1244 if (seg == NULL) {
1245 as_setwatch(as);
1246 AS_LOCK_EXIT(as);
1247 return (ENOMEM);
1250 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1251 if (raddr >= seg->s_base + seg->s_size) {
1252 seg = AS_SEGNEXT(as, seg);
1253 if (seg == NULL || raddr != seg->s_base) {
1254 error = ENOMEM;
1255 break;
1258 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1259 ssize = seg->s_base + seg->s_size - raddr;
1260 else
1261 ssize = rsize;
1263 error = segop_checkprot(seg, raddr, ssize, prot);
1264 if (error != 0)
1265 break;
1267 as_setwatch(as);
1268 AS_LOCK_EXIT(as);
1269 return (error);
1273 as_unmap(struct as *as, caddr_t addr, size_t size)
1275 struct seg *seg, *seg_next;
1276 struct as_callback *cb;
1277 caddr_t raddr, eaddr;
1278 size_t ssize, rsize = 0;
1279 int err;
1281 top:
1282 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1283 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1284 (uintptr_t)PAGEMASK);
1286 AS_LOCK_ENTER(as, RW_WRITER);
1288 as->a_updatedir = 1; /* inform /proc */
1289 gethrestime(&as->a_updatetime);
1292 * Use as_findseg to find the first segment in the range, then
1293 * step through the segments in order, following s_next.
1295 as_clearwatchprot(as, raddr, eaddr - raddr);
1297 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1298 const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0);
1300 if (eaddr <= seg->s_base)
1301 break; /* eaddr was in a gap; all done */
1303 /* this is implied by the test above */
1304 ASSERT(raddr < eaddr);
1306 if (raddr < seg->s_base)
1307 raddr = seg->s_base; /* raddr was in a gap */
1309 if (eaddr > (seg->s_base + seg->s_size))
1310 ssize = seg->s_base + seg->s_size - raddr;
1311 else
1312 ssize = eaddr - raddr;
1315 * Save next segment pointer since seg can be
1316 * destroyed during the segment unmap operation.
1318 seg_next = AS_SEGNEXT(as, seg);
1321 * We didn't count /dev/null mappings, so ignore them here.
1322 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1323 * we have to do this check here while we have seg.)
1325 rsize = 0;
1326 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1327 !SEG_IS_PARTIAL_RESV(seg))
1328 rsize = ssize;
1330 retry:
1331 err = segop_unmap(seg, raddr, ssize);
1332 if (err == EAGAIN) {
1334 * Memory is currently locked. It must be unlocked
1335 * before this operation can succeed through a retry.
1336 * The possible reasons for locked memory and
1337 * corresponding strategies for unlocking are:
1338 * (1) Normal I/O
1339 * wait for a signal that the I/O operation
1340 * has completed and the memory is unlocked.
1341 * (2) Asynchronous I/O
1342 * The aio subsystem does not unlock pages when
1343 * the I/O is completed. Those pages are unlocked
1344 * when the application calls aiowait/aioerror.
1345 * So, to prevent blocking forever, cv_broadcast()
1346 * is done to wake up aio_cleanup_thread.
1347 * Subsequently, segvn_reclaim will be called, and
1348 * that will do AS_CLRUNMAPWAIT() and wake us up.
1349 * (3) Long term page locking:
1350 * Drivers intending to have pages locked for a
1351 * period considerably longer than for normal I/O
1352 * (essentially forever) may have registered for a
1353 * callback so they may unlock these pages on
1354 * request. This is needed to allow this operation
1355 * to succeed. Each entry on the callback list is
1356 * examined. If the event or address range pertains
1357 * the callback is invoked (unless it already is in
1358 * progress). The a_contents lock must be dropped
1359 * before the callback, so only one callback can
1360 * be done at a time. Go to the top and do more
1361 * until zero is returned. If zero is returned,
1362 * either there were no callbacks for this event
1363 * or they were already in progress.
1365 mutex_enter(&as->a_contents);
1366 if (as->a_callbacks &&
1367 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1368 seg->s_base, seg->s_size))) {
1369 AS_LOCK_EXIT(as);
1370 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1371 } else if (!AS_ISNOUNMAPWAIT(as)) {
1372 if (AS_ISUNMAPWAIT(as) == 0)
1373 cv_broadcast(&as->a_cv);
1374 AS_SETUNMAPWAIT(as);
1375 AS_LOCK_EXIT(as);
1376 while (AS_ISUNMAPWAIT(as))
1377 cv_wait(&as->a_cv, &as->a_contents);
1378 } else {
1380 * We may have raced with
1381 * segvn_reclaim()/segspt_reclaim(). In this
1383 * case, clear the nounmapwait flag and retry, since
1384 * softlockcnt in this segment may already be
1385 * 0. We don't drop the as writer lock, so our
1385 * number of retries without sleeping should
1386 * be very small. See segvn_reclaim() for
1387 * more comments.
1389 AS_CLRNOUNMAPWAIT(as);
1390 mutex_exit(&as->a_contents);
1391 goto retry;
1393 mutex_exit(&as->a_contents);
1394 goto top;
1395 } else if (err == IE_RETRY) {
1396 AS_LOCK_EXIT(as);
1397 goto top;
1398 } else if (err) {
1399 as_setwatch(as);
1400 AS_LOCK_EXIT(as);
1401 return (-1);
1404 if (!is_hole) {
1405 as->a_size -= ssize;
1406 if (rsize)
1407 as->a_resvsize -= rsize;
1409 raddr += ssize;
1411 AS_LOCK_EXIT(as);
1412 return (0);
1415 static int
1416 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1417 segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1419 uint_t szc, nszc, save_szcvec;
1420 int error;
1421 caddr_t a, eaddr;
1422 size_t pgsz;
1423 const boolean_t do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1425 ASSERT(AS_WRITE_HELD(as));
1426 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1427 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1428 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1430 if (!do_off) {
1431 vn_a->offset = 0;
1434 if (szcvec <= 1) {
1435 struct seg *seg, *segref;
1437 seg = segref = seg_alloc(as, addr, size);
1438 if (seg == NULL) {
1439 return (ENOMEM);
1441 vn_a->szc = 0;
1442 error = (*crfp)(&seg, vn_a);
1443 if (error != 0) {
1444 VERIFY3P(seg, ==, segref);
1445 seg_free(seg);
1446 } else {
1447 as->a_size += size;
1448 as->a_resvsize += size;
1450 return (error);
1453 eaddr = addr + size;
1454 save_szcvec = szcvec;
1455 szcvec >>= 1;
1456 szc = 0;
1457 nszc = 0;
1458 while (szcvec) {
1459 if ((szcvec & 0x1) == 0) {
1460 nszc++;
1461 szcvec >>= 1;
1462 continue;
1464 nszc++;
1465 pgsz = page_get_pagesize(nszc);
1466 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1467 if (a != addr) {
1468 struct seg *seg, *segref;
1469 size_t segsize;
1471 ASSERT(a < eaddr);
1473 segsize = a - addr;
1474 seg = segref = seg_alloc(as, addr, segsize);
1475 if (seg == NULL) {
1476 return (ENOMEM);
1478 vn_a->szc = szc;
1479 error = (*crfp)(&seg, vn_a);
1480 if (error != 0) {
1481 VERIFY3P(seg, ==, segref);
1482 seg_free(seg);
1483 return (error);
1485 as->a_size += segsize;
1486 as->a_resvsize += segsize;
1487 *segcreated = B_TRUE;
1488 if (do_off) {
1489 vn_a->offset += segsize;
1491 addr = a;
1493 szc = nszc;
1494 szcvec >>= 1;
1497 ASSERT(addr < eaddr);
1498 szcvec = save_szcvec | 1; /* add 8K pages */
1499 while (szcvec) {
1500 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1501 ASSERT(a >= addr);
1502 if (a != addr) {
1503 struct seg *seg, *segref;
1504 size_t segsize;
1506 segsize = a - addr;
1507 seg = segref = seg_alloc(as, addr, segsize);
1508 if (seg == NULL) {
1509 return (ENOMEM);
1511 vn_a->szc = szc;
1512 error = (*crfp)(&seg, vn_a);
1513 if (error != 0) {
1514 VERIFY3P(seg, ==, segref);
1515 seg_free(seg);
1516 return (error);
1518 as->a_size += segsize;
1519 as->a_resvsize += segsize;
1520 *segcreated = B_TRUE;
1521 if (do_off) {
1522 vn_a->offset += segsize;
1524 addr = a;
1526 szcvec &= ~(1 << szc);
1527 if (szcvec) {
1528 szc = highbit(szcvec) - 1;
1529 pgsz = page_get_pagesize(szc);
1532 ASSERT(addr == eaddr);
1534 return (0);
1537 static int
1538 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1539 segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1541 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1542 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1543 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1544 type, 0);
1545 int error;
1546 struct vattr va;
1547 uoff_t eoff;
1548 size_t save_size = 0;
1549 extern size_t textrepl_size_thresh;
1551 ASSERT(AS_WRITE_HELD(as));
1552 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1553 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1554 ASSERT(vn_a->vp != NULL);
1555 ASSERT(vn_a->amp == NULL);
1557 again:
1558 if (szcvec <= 1) {
1559 struct seg *seg, *segref;
1561 seg = segref = seg_alloc(as, addr, size);
1562 if (seg == NULL) {
1563 return (ENOMEM);
1565 vn_a->szc = 0;
1566 error = (*crfp)(&seg, vn_a);
1567 if (error != 0) {
1568 VERIFY3P(seg, ==, segref);
1569 seg_free(seg);
1570 } else {
1571 as->a_size += size;
1572 as->a_resvsize += size;
1574 return (error);
1577 va.va_mask = VATTR_SIZE;
1578 if (fop_getattr(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1579 szcvec = 0;
1580 goto again;
1582 eoff = vn_a->offset & PAGEMASK;
1583 if (eoff >= va.va_size) {
1584 szcvec = 0;
1585 goto again;
1587 eoff += size;
1588 if (btopr(va.va_size) < btopr(eoff)) {
1589 save_size = size;
1590 size = va.va_size - (vn_a->offset & PAGEMASK);
1591 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1592 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1593 type, 0);
1594 if (szcvec <= 1) {
1595 size = save_size;
1596 goto again;
1600 if (size > textrepl_size_thresh) {
1601 vn_a->flags |= _MAP_TEXTREPL;
1603 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1604 segcreated);
1605 if (error != 0) {
1606 return (error);
1608 if (save_size) {
1609 addr += size;
1610 size = save_size - size;
1611 szcvec = 0;
1612 goto again;
1614 return (0);
1618 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1619 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1621 static int
1622 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1623 segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
1625 uint_t szcvec;
1626 uchar_t type;
1628 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1629 if (vn_a->type == MAP_SHARED) {
1630 type = MAPPGSZC_SHM;
1631 } else if (vn_a->type == MAP_PRIVATE) {
1632 if (vn_a->szc == AS_MAP_HEAP) {
1633 type = MAPPGSZC_HEAP;
1634 } else if (vn_a->szc == AS_MAP_STACK) {
1635 type = MAPPGSZC_STACK;
1636 } else {
1637 type = MAPPGSZC_PRIVM;
1640 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1641 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1642 (vn_a->flags & MAP_TEXT), type, 0);
1643 ASSERT(AS_WRITE_HELD(as));
1644 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1645 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1646 ASSERT(vn_a->vp == NULL);
1648 return (as_map_segvn_segs(as, addr, size, szcvec,
1649 crfp, vn_a, segcreated));
1653 as_map(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
1654 void *argsp)
1656 AS_LOCK_ENTER(as, RW_WRITER);
1657 return (as_map_locked(as, addr, size, crfp, argsp));
1661 as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
1662 void *argsp)
1664 caddr_t raddr; /* rounded down addr */
1665 size_t rsize; /* rounded up size */
1666 int error;
1667 boolean_t is_hole = B_FALSE;
1669 * The use of a_proc is preferred to handle the case where curproc is
1670 * a door_call server and is allocating memory in the client's (a_proc)
1671 * address space.
1672 * When creating a shared memory segment a_proc will be NULL so we
1673 * fall back to curproc in that case.
1675 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1676 struct segvn_crargs crargs;
1678 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1679 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1680 (size_t)raddr;
1683 * check for wrap around
1685 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1686 AS_LOCK_EXIT(as);
1687 return (ENOMEM);
1690 as->a_updatedir = 1; /* inform /proc */
1691 gethrestime(&as->a_updatetime);
1693 if (as != &kas) {
1695 * Ensure that the virtual size of the process will not exceed
1696 * the configured limit. Since seg_hole segments will later
1697 * set the S_HOLE flag indicating their status as a hole in the
1698 * AS, they are excluded from this check.
1700 if (as->a_size + rsize > (size_t)p->p_vmem_ctl &&
1701 !AS_MAP_CHECK_SEGHOLE(crfp)) {
1702 AS_LOCK_EXIT(as);
1704 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
1705 p->p_rctls, p, RCA_UNSAFE_ALL);
1706 return (ENOMEM);
1710 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1711 boolean_t do_unmap = B_FALSE;
1713 crargs = *(struct segvn_crargs *)argsp;
1714 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs,
1715 &do_unmap);
1716 if (error != 0) {
1717 AS_LOCK_EXIT(as);
1718 if (do_unmap) {
1719 (void) as_unmap(as, addr, size);
1721 return (error);
1723 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1724 boolean_t do_unmap = B_FALSE;
1726 crargs = *(struct segvn_crargs *)argsp;
1727 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs,
1728 &do_unmap);
1729 if (error != 0) {
1730 AS_LOCK_EXIT(as);
1731 if (do_unmap) {
1732 (void) as_unmap(as, addr, size);
1734 return (error);
1736 } else {
1737 struct seg *seg, *segref;
1739 seg = segref = seg_alloc(as, addr, size);
1740 if (seg == NULL) {
1741 AS_LOCK_EXIT(as);
1742 return (ENOMEM);
1746 * It is possible that the segment creation routine will free
1747 * 'seg' as part of a more advanced operation, such as when
1748 * segvn concatenates adjacent segments together. When this
1749 * occurs, the seg*_create routine must communicate the
1750 * resulting segment out via the 'struct seg **' parameter.
1752 * If segment creation fails, it must not free the passed-in
1753 * segment, nor alter the argument pointer.
1755 error = (*crfp)(&seg, argsp);
1756 if (error != 0) {
1757 VERIFY3P(seg, ==, segref);
1758 seg_free(seg);
1759 AS_LOCK_EXIT(as);
1760 return (error);
1764 * Check if the resulting segment represents a hole in the
1765 * address space, rather than contributing to the AS size.
1767 is_hole = ((seg->s_flags & S_HOLE) != 0);
1769 /* Add size now so as_unmap will work if as_ctl fails. */
1770 if (!is_hole) {
1771 as->a_size += rsize;
1772 as->a_resvsize += rsize;
1776 as_setwatch(as);
1779 * Establish memory locks for the segment if the address space is
1780 * locked, provided it's not an explicit hole in the AS.
1782 mutex_enter(&as->a_contents);
1783 if (AS_ISPGLCK(as) && !is_hole) {
1784 mutex_exit(&as->a_contents);
1785 AS_LOCK_EXIT(as);
1786 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1787 if (error != 0)
1788 (void) as_unmap(as, addr, size);
1789 } else {
1790 mutex_exit(&as->a_contents);
1791 AS_LOCK_EXIT(as);
1793 return (error);
1798 * Delete all segments in the address space marked with S_PURGE.
1799 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1800 * These segments are deleted as a first step before calls to as_gap(), so
1801 * that they don't affect mmap() or shmat().
1803 void
1804 as_purge(struct as *as)
1806 struct seg *seg;
1807 struct seg *next_seg;
1810 * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1811 * there is no need to grab the a_contents mutex for this check
1813 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1814 return;
1816 AS_LOCK_ENTER(as, RW_WRITER);
1817 next_seg = NULL;
1818 seg = AS_SEGFIRST(as);
1819 while (seg != NULL) {
1820 next_seg = AS_SEGNEXT(as, seg);
1821 if (seg->s_flags & S_PURGE)
1822 (void) segop_unmap(seg, seg->s_base, seg->s_size);
1823 seg = next_seg;
1825 AS_LOCK_EXIT(as);
1827 mutex_enter(&as->a_contents);
1828 as->a_flags &= ~AS_NEEDSPURGE;
1829 mutex_exit(&as->a_contents);
1833 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1834 * range of addresses at least "minlen" long, where the base of the range is
1835 * at "off" phase from an "align" boundary and there is space for a
1836 * "redzone"-sized redzone on either side of the range. Thus,
1837 * if align was 4M and off was 16k, the user wants a hole which will start
1838 * 16k into a 4M page.
1840 * If flags specifies AH_HI, the hole will have the highest possible address
1841 * in the range. We use the as->a_lastgap field to figure out where to
1842 * start looking for a gap.
1844 * Otherwise, the gap will have the lowest possible address.
1846 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1848 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1849 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1851 * NOTE: This routine is not correct when base+len overflows caddr_t.
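 *
 * Worked example of the align/off phase (values from the comment above):
 * with align = 4M and off = 16k, any returned base B satisfies
 *
 *	B % align == off		(here, B % 4M == 16k)
 *
 * and the containing hole leaves at least "redzone" bytes free on either
 * side of the returned range.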
1854 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1855 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1857 caddr_t lobound = *basep;
1858 caddr_t hibound = lobound + *lenp;
1859 struct seg *lseg, *hseg;
1860 caddr_t lo, hi;
1861 int forward;
1862 caddr_t save_base;
1863 size_t save_len;
1864 size_t save_minlen;
1865 size_t save_redzone;
1866 int fast_path = 1;
1868 save_base = *basep;
1869 save_len = *lenp;
1870 save_minlen = minlen;
1871 save_redzone = redzone;
1874 * For the first pass/fast_path, just add align and redzone into
1875 * minlen since if we get an allocation, we can guarantee that it
1876 * will fit the alignment and redzone requested.
1877 * This increases the chance that hibound will be adjusted to
1878 * a_lastgap->s_base which will likely allow us to find an
1879 * acceptable hole in the address space quicker.
1880 * If we can't find a hole with this fast_path, then we look for
1881 * smaller holes in which the alignment and offset may allow
1882 * the allocation to fit.
1884 minlen += align;
1885 minlen += 2 * redzone;
1886 redzone = 0;
1888 AS_LOCK_ENTER(as, RW_READER);
1889 if (AS_SEGFIRST(as) == NULL) {
1890 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1891 align, redzone, off)) {
1892 AS_LOCK_EXIT(as);
1893 return (0);
1894 } else {
1895 AS_LOCK_EXIT(as);
1896 *basep = save_base;
1897 *lenp = save_len;
1898 return (-1);
1902 retry:
1904 * Set up to iterate over all the inter-segment holes in the given
1905 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1906 * NULL for the highest-addressed hole. If moving backwards, we reset
1907 * hseg to denote the highest-addressed segment.
1909 forward = (flags & AH_DIR) == AH_LO;
1910 if (forward) {
1911 hseg = as_findseg(as, lobound, 1);
1912 lseg = AS_SEGPREV(as, hseg);
1913 } else {
1916 * If allocating at least as much as the last allocation,
1917 * use a_lastgap's base as a better estimate of hibound.
1919 if (as->a_lastgap &&
1920 minlen >= as->a_lastgap->s_size &&
1921 hibound >= as->a_lastgap->s_base)
1922 hibound = as->a_lastgap->s_base;
1924 hseg = as_findseg(as, hibound, 1);
1925 if (hseg->s_base + hseg->s_size < hibound) {
1926 lseg = hseg;
1927 hseg = NULL;
1928 } else {
1929 lseg = AS_SEGPREV(as, hseg);
1933 for (;;) {
1935 * Set lo and hi to the hole's boundaries. (We should really
1936 * use MAXADDR in place of hibound in the expression below,
1937 * but can't express it easily; using hibound in its place is
1938 * harmless.)
1940 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1941 hi = (hseg == NULL) ? hibound : hseg->s_base;
1943 * If the iteration has moved past the interval from lobound
1944 * to hibound it's pointless to continue.
1946 if ((forward && lo > hibound) || (!forward && hi < lobound))
1947 break;
1948 else if (lo > hibound || hi < lobound)
1949 goto cont;
1951 * Candidate hole lies at least partially within the allowable
1952 * range. Restrict it to fall completely within that range,
1953 * i.e., to [max(lo, lobound), min(hi, hibound)].
1955 if (lo < lobound)
1956 lo = lobound;
1957 if (hi > hibound)
1958 hi = hibound;
1960 * Verify that the candidate hole is big enough and meets
1961 * hardware constraints. If the hole is too small, no need
1962 * to do the further checks since they will fail.
1964 *basep = lo;
1965 *lenp = hi - lo;
1966 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1967 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1968 ((flags & AH_CONTAIN) == 0 ||
1969 (*basep <= addr && *basep + *lenp > addr))) {
1970 if (!forward)
1971 as->a_lastgap = hseg;
1972 if (hseg != NULL)
1973 as->a_lastgaphl = hseg;
1974 else
1975 as->a_lastgaphl = lseg;
1976 AS_LOCK_EXIT(as);
1977 return (0);
1979 cont:
1981 * Move to the next hole.
1983 if (forward) {
1984 lseg = hseg;
1985 if (lseg == NULL)
1986 break;
1987 hseg = AS_SEGNEXT(as, hseg);
1988 } else {
1989 hseg = lseg;
1990 if (hseg == NULL)
1991 break;
1992 lseg = AS_SEGPREV(as, lseg);
1995 if (fast_path && (align != 0 || save_redzone != 0)) {
1996 fast_path = 0;
1997 minlen = save_minlen;
1998 redzone = save_redzone;
1999 goto retry;
2001 *basep = save_base;
2002 *lenp = save_len;
2003 AS_LOCK_EXIT(as);
2004 return (-1);
2008 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2010 * If flags specifies AH_HI, the hole will have the highest possible address
2011 * in the range. We use the as->a_lastgap field to figure out where to
2012 * start looking for a gap.
2014 * Otherwise, the gap will have the lowest possible address.
2016 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2018 * If an adequate hole is found, base and len are set to reflect the part of
2019 * the hole that is within range, and 0 is returned, otherwise,
2020 * -1 is returned.
2022 * NOTE: This routine is not correct when base+len overflows caddr_t.
2025 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2026 caddr_t addr)
2029 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2033 * Return the next range within [base, base + len) that is backed
2034 * with "real memory". Skip holes and non-seg_vn segments.
2035 * We're lazy and only return one segment at a time.
2038 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2040 extern const struct seg_ops segspt_shmops; /* needs a header file */
2041 struct seg *seg;
2042 caddr_t addr, eaddr;
2043 caddr_t segend;
2045 AS_LOCK_ENTER(as, RW_READER);
2047 addr = *basep;
2048 eaddr = addr + *lenp;
2050 seg = as_findseg(as, addr, 0);
2051 if (seg != NULL)
2052 addr = MAX(seg->s_base, addr);
2054 for (;;) {
2055 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2056 AS_LOCK_EXIT(as);
2057 return (EINVAL);
2060 if (seg->s_ops == &segvn_ops) {
2061 segend = seg->s_base + seg->s_size;
2062 break;
2066 * We do ISM by looking into the private data
2067 * to determine the real size of the segment.
2069 if (seg->s_ops == &segspt_shmops) {
2070 segend = seg->s_base + spt_realsize(seg);
2071 if (addr < segend)
2072 break;
2075 seg = AS_SEGNEXT(as, seg);
2077 if (seg != NULL)
2078 addr = seg->s_base;
2081 *basep = addr;
2083 if (segend > eaddr)
2084 *lenp = eaddr - addr;
2085 else
2086 *lenp = segend - addr;
2088 AS_LOCK_EXIT(as);
2089 return (0);
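/*
 * Illustrative sketch (not part of the original source): walk every
 * "real memory" range inside [start, start + len) by calling as_memory()
 * repeatedly, since it only returns one segment's worth per call.  The
 * visit() callback is hypothetical.
 */
static void
example_walk_memory(struct as *as, caddr_t start, size_t len,
    void (*visit)(caddr_t, size_t))
{
	caddr_t base = start;
	caddr_t end = start + len;
	size_t remain = len;

	/* as_memory() returns EINVAL once no backed range remains */
	while (remain != 0 && as_memory(as, &base, &remain) == 0) {
		visit(base, remain);		/* one backed range */
		base += remain;			/* continue past it */
		remain = (base < end) ? (size_t)(end - base) : 0;
	}
}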
2093 * Determine whether data from the mappings in interval [addr, addr + size)
2094 * are in the primary memory (core) cache.
2097 as_incore(struct as *as, caddr_t addr,
2098 size_t size, char *vec, size_t *sizep)
2100 struct seg *seg;
2101 size_t ssize;
2102 caddr_t raddr; /* rounded down addr */
2103 size_t rsize; /* rounded up size */
2104 size_t isize; /* iteration size */
2105 int error = 0; /* result, assume success */
2107 *sizep = 0;
2108 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2109 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2110 (size_t)raddr;
2112 if (raddr + rsize < raddr) /* check for wraparound */
2113 return (ENOMEM);
2115 AS_LOCK_ENTER(as, RW_READER);
2116 seg = as_segat(as, raddr);
2117 if (seg == NULL) {
2118 AS_LOCK_EXIT(as);
2119 return (-1);
2122 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2123 if (raddr >= seg->s_base + seg->s_size) {
2124 seg = AS_SEGNEXT(as, seg);
2125 if (seg == NULL || raddr != seg->s_base) {
2126 error = -1;
2127 break;
2130 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2131 ssize = seg->s_base + seg->s_size - raddr;
2132 else
2133 ssize = rsize;
2134 *sizep += isize = segop_incore(seg, raddr, ssize, vec);
2135 if (isize != ssize) {
2136 error = -1;
2137 break;
2139 vec += btopr(ssize);
2141 AS_LOCK_EXIT(as);
2142 return (error);
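/*
 * Illustrative sketch (not part of the original source): a mincore(2)-style
 * residency check built on as_incore() above.  The helper name is
 * hypothetical, and treating the low bit of each vector byte as "page is
 * in core" is an assumption of this example.
 */
static boolean_t
example_range_resident(struct as *as, caddr_t addr, size_t size)
{
	size_t npages = btopr(size + ((uintptr_t)addr & PAGEOFFSET));
	char *vec = kmem_zalloc(npages, KM_SLEEP);
	size_t incore_bytes = 0;
	size_t i;
	boolean_t resident = B_TRUE;

	if (as_incore(as, addr, size, vec, &incore_bytes) != 0) {
		resident = B_FALSE;	/* hole or partly unmapped range */
	} else {
		for (i = 0; i < npages; i++) {
			if ((vec[i] & 1) == 0) {	/* page not resident */
				resident = B_FALSE;
				break;
			}
		}
	}
	kmem_free(vec, npages);
	return (resident);
}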
2145 static void
2146 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2147 ulong_t *bitmap, size_t position, size_t npages)
2149 caddr_t range_start;
2150 size_t pos1 = position;
2151 size_t pos2;
2152 size_t size;
2153 size_t end_pos = npages + position;
2155 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2156 size = ptob((pos2 - pos1));
2157 range_start = (caddr_t)((uintptr_t)addr +
2158 ptob(pos1 - position));
2160 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
2161 NULL, 0);
2162 pos1 = pos2;
2166 static void
2167 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2168 caddr_t raddr, size_t rsize)
2170 struct seg *seg = as_segat(as, raddr);
2171 size_t ssize;
2173 while (rsize != 0) {
2174 if (raddr >= seg->s_base + seg->s_size)
2175 seg = AS_SEGNEXT(as, seg);
2177 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2178 ssize = seg->s_base + seg->s_size - raddr;
2179 else
2180 ssize = rsize;
2182 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2184 rsize -= ssize;
2185 raddr += ssize;
2190 * Cache control operations over the interval [addr, addr + size) in
2191 * address space "as".
2193 /*ARGSUSED*/
2195 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2196 uintptr_t arg, ulong_t *lock_map, size_t pos)
2198 struct seg *seg; /* working segment */
2199 caddr_t raddr; /* rounded down addr */
2200 caddr_t initraddr; /* saved initial rounded down addr */
2201 size_t rsize; /* rounded up size */
2202 size_t initrsize; /* saved initial rounded up size */
2203 size_t ssize; /* size of seg */
2204 int error = 0; /* result */
2205 size_t mlock_size; /* size of bitmap */
2206 ulong_t *mlock_map; /* pointer to bitmap used */
2207 /* to represent the locked */
2208 /* pages. */
2209 retry:
2210 if (error == IE_RETRY)
2211 AS_LOCK_ENTER(as, RW_WRITER);
2212 else
2213 AS_LOCK_ENTER(as, RW_READER);
2216 * If these are address space lock/unlock operations, loop over
2217 * all segments in the address space, as appropriate.
2219 if (func == MC_LOCKAS) {
2220 size_t npages, idx;
2221 size_t rlen = 0; /* rounded as length */
2223 idx = pos;
2225 if (arg & MCL_FUTURE) {
2226 mutex_enter(&as->a_contents);
2227 AS_SETPGLCK(as);
2228 mutex_exit(&as->a_contents);
2230 if ((arg & MCL_CURRENT) == 0) {
2231 AS_LOCK_EXIT(as);
2232 return (0);
2235 seg = AS_SEGFIRST(as);
2236 if (seg == NULL) {
2237 AS_LOCK_EXIT(as);
2238 return (0);
2241 do {
2242 raddr = (caddr_t)((uintptr_t)seg->s_base &
2243 (uintptr_t)PAGEMASK);
2244 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2245 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2246 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2248 mlock_size = BT_BITOUL(btopr(rlen));
2249 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2250 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2251 AS_LOCK_EXIT(as);
2252 return (EAGAIN);
2255 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2256 if ((seg->s_flags & S_HOLE) != 0)
2257 continue;
2259 error = segop_lockop(seg, seg->s_base,
2260 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2261 if (error != 0)
2262 break;
2263 pos += seg_pages(seg);
2266 if (error) {
2267 for (seg = AS_SEGFIRST(as); seg != NULL;
2268 seg = AS_SEGNEXT(as, seg)) {
2270 raddr = (caddr_t)((uintptr_t)seg->s_base &
2271 (uintptr_t)PAGEMASK);
2272 npages = seg_pages(seg);
2273 as_segunlock(seg, raddr, attr, mlock_map,
2274 idx, npages);
2275 idx += npages;
2279 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2280 AS_LOCK_EXIT(as);
2281 goto lockerr;
2282 } else if (func == MC_UNLOCKAS) {
2283 mutex_enter(&as->a_contents);
2284 AS_CLRPGLCK(as);
2285 mutex_exit(&as->a_contents);
2287 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2288 if ((seg->s_flags & S_HOLE) != 0)
2289 continue;
2291 error = segop_lockop(seg, seg->s_base,
2292 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2293 if (error != 0)
2294 break;
2297 AS_LOCK_EXIT(as);
2298 goto lockerr;
2302 * Normalize addresses and sizes.
2304 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2305 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2306 (size_t)raddr;
2308 if (raddr + rsize < raddr) { /* check for wraparound */
2309 AS_LOCK_EXIT(as);
2310 return (ENOMEM);
2314 * Get initial segment.
2316 if ((seg = as_segat(as, raddr)) == NULL) {
2317 AS_LOCK_EXIT(as);
2318 return (ENOMEM);
2321 if (func == MC_LOCK) {
2322 mlock_size = BT_BITOUL(btopr(rsize));
2323 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2324 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2325 AS_LOCK_EXIT(as);
2326 return (EAGAIN);
2331 * Loop over all segments. If a hole in the address range is
2332 * discovered, then fail. For each segment, perform the appropriate
2333 * control operation.
2335 while (rsize != 0) {
2338  * Make sure there's no hole; then calculate the portion
2339  * of the next segment to be operated over.
2341 if (raddr >= seg->s_base + seg->s_size) {
2342 seg = AS_SEGNEXT(as, seg);
2343 if (seg == NULL || raddr != seg->s_base) {
2344 if (func == MC_LOCK) {
2345 as_unlockerr(as, attr, mlock_map,
2346 initraddr, initrsize - rsize);
2347 kmem_free(mlock_map,
2348 mlock_size * sizeof (ulong_t));
2350 AS_LOCK_EXIT(as);
2351 return (ENOMEM);
2354 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2355 ssize = seg->s_base + seg->s_size - raddr;
2356 else
2357 ssize = rsize;
2360 * Dispatch on specific function.
2362 switch (func) {
2365 * Synchronize cached data from mappings with backing
2366 * objects.
2368 case MC_SYNC:
2369 if (error = segop_sync(seg, raddr, ssize,
2370 attr, (uint_t)arg)) {
2371 AS_LOCK_EXIT(as);
2372 return (error);
2374 break;
2377 * Lock pages in memory.
2379 case MC_LOCK:
2380 if (error = segop_lockop(seg, raddr, ssize,
2381 attr, func, mlock_map, pos)) {
2382 as_unlockerr(as, attr, mlock_map, initraddr,
2383 initrsize - rsize + ssize);
2384 kmem_free(mlock_map, mlock_size *
2385 sizeof (ulong_t));
2386 AS_LOCK_EXIT(as);
2387 goto lockerr;
2389 break;
2392 * Unlock mapped pages.
2394 case MC_UNLOCK:
2395 (void) segop_lockop(seg, raddr, ssize, attr, func,
2396 NULL, 0);
2397 break;
2400  * Store VM advice for mapped pages in the segment layer.
2402 case MC_ADVISE:
2403 error = segop_advise(seg, raddr, ssize, (uint_t)arg);
2406  * Check for regular errors and the special retry error.
2408 if (error) {
2409 if (error == IE_RETRY) {
2411 * Need to acquire writers lock, so
2412 * have to drop readers lock and start
2413 * all over again
2415 AS_LOCK_EXIT(as);
2416 goto retry;
2417 } else if (error == IE_REATTACH) {
2419 * Find segment for current address
2420 * because current segment just got
2421 * split or concatenated
2423 seg = as_segat(as, raddr);
2424 if (seg == NULL) {
2425 AS_LOCK_EXIT(as);
2426 return (ENOMEM);
2428 } else {
2430 * Regular error
2432 AS_LOCK_EXIT(as);
2433 return (error);
2436 break;
2438 case MC_INHERIT_ZERO:
2439 error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
2440 if (error != 0) {
2441 AS_LOCK_EXIT(as);
2442 return (error);
2444 break;
2447 * Can't happen.
2449 default:
2450 panic("as_ctl: bad operation %d", func);
2451 /*NOTREACHED*/
2454 rsize -= ssize;
2455 raddr += ssize;
2458 if (func == MC_LOCK)
2459 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2460 AS_LOCK_EXIT(as);
2461 return (0);
2462 lockerr:
2465 * If the lower levels returned EDEADLK for a segment lockop,
2466  * it means that we should retry the operation.  Let's also wait
2467  * a bit to let the deadlock-causing condition clear.
2468 * This is part of a gross hack to work around a design flaw
2469 * in the ufs/sds logging code and should go away when the
2470 * logging code is re-designed to fix the problem. See bug
2471 * 4125102 for details of the problem.
2473 if (error == EDEADLK) {
2474 delay(deadlk_wait);
2475 error = 0;
2476 goto retry;
2478 return (error);
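/*
 * Illustrative sketch (not part of the original source): lock and later
 * unlock a range with as_ctl(), roughly what a memcntl(MC_LOCK)/
 * (MC_UNLOCK) pair ends up doing.  Passing attr 0, a NULL lock map and a
 * zero position is this example's assumption about the external-caller
 * convention; use_range() is hypothetical.
 */
static int
example_lock_range(struct as *as, caddr_t addr, size_t len,
    int (*use_range)(caddr_t, size_t))
{
	int error;

	error = as_ctl(as, addr, len, MC_LOCK, 0, 0, NULL, 0);
	if (error != 0)
		return (error);		/* e.g. EAGAIN, ENOMEM */

	error = use_range(addr, len);	/* pages stay resident here */

	(void) as_ctl(as, addr, len, MC_UNLOCK, 0, 0, NULL, 0);
	return (error);
}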
2482 fc_decode(faultcode_t fault_err)
2484 int error = 0;
2486 switch (FC_CODE(fault_err)) {
2487 case FC_OBJERR:
2488 error = FC_ERRNO(fault_err);
2489 break;
2490 case FC_PROT:
2491 error = EACCES;
2492 break;
2493 default:
2494 error = EFAULT;
2495 break;
2497 return (error);
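/*
 * Illustrative sketch (not part of the original source): the usual pattern
 * for turning an as_fault() result into an errno with fc_decode() above,
 * shown around a temporary F_SOFTLOCK/F_SOFTUNLOCK pair.  The helper name
 * is hypothetical.
 */
static int
example_softlock_range(struct as *as, caddr_t addr, size_t size,
    enum seg_rw rw)
{
	faultcode_t fc;

	fc = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	if (fc != 0)
		return (fc_decode(fc));	/* EACCES, EFAULT or an objerr errno */

	/* ... the pages are faulted in and held here ... */

	(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
	return (0);
}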
2501 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2502 * lists from each segment and copy them to one contiguous shadow list (plist)
2503 * as expected by the caller. Save pointers to per segment shadow lists at
2504 * the tail of plist so that they can be used during as_pageunlock().
2506 static int
2507 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2508 caddr_t addr, size_t size, enum seg_rw rw)
2510 caddr_t sv_addr = addr;
2511 size_t sv_size = size;
2512 struct seg *sv_seg = seg;
2513 ulong_t segcnt = 1;
2514 ulong_t cnt;
2515 size_t ssize;
2516 pgcnt_t npages = btop(size);
2517 page_t **plist;
2518 page_t **pl;
2519 int error;
2520 caddr_t eaddr;
2521 faultcode_t fault_err = 0;
2522 pgcnt_t pl_off;
2523 extern const struct seg_ops segspt_shmops;
2525 ASSERT(AS_LOCK_HELD(as));
2526 ASSERT(seg != NULL);
2527 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2528 ASSERT(addr + size > seg->s_base + seg->s_size);
2529 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2530 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2533 * Count the number of segments covered by the range we are about to
2534 * lock. The segment count is used to size the shadow list we return
2535 * back to the caller.
2537 for (; size != 0; size -= ssize, addr += ssize) {
2538 if (addr >= seg->s_base + seg->s_size) {
2540 seg = AS_SEGNEXT(as, seg);
2541 if (seg == NULL || addr != seg->s_base) {
2542 AS_LOCK_EXIT(as);
2543 return (EFAULT);
2546 * Do a quick check if subsequent segments
2547 * will most likely support pagelock.
2549 if (seg->s_ops == &segvn_ops) {
2550 vnode_t *vp;
2552 if (segop_getvp(seg, addr, &vp) != 0 ||
2553 vp != NULL) {
2554 AS_LOCK_EXIT(as);
2555 goto slow;
2557 } else if (seg->s_ops != &segspt_shmops) {
2558 AS_LOCK_EXIT(as);
2559 goto slow;
2561 segcnt++;
2563 if (addr + size > seg->s_base + seg->s_size) {
2564 ssize = seg->s_base + seg->s_size - addr;
2565 } else {
2566 ssize = size;
2569 ASSERT(segcnt > 1);
2571 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2573 addr = sv_addr;
2574 size = sv_size;
2575 seg = sv_seg;
2577 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2578 if (addr >= seg->s_base + seg->s_size) {
2579 seg = AS_SEGNEXT(as, seg);
2580 ASSERT(seg != NULL && addr == seg->s_base);
2581 cnt++;
2582 ASSERT(cnt < segcnt);
2584 if (addr + size > seg->s_base + seg->s_size) {
2585 ssize = seg->s_base + seg->s_size - addr;
2586 } else {
2587 ssize = size;
2589 pl = &plist[npages + cnt];
2590 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2591 L_PAGELOCK, rw);
2592 if (error) {
2593 break;
2595 ASSERT(plist[npages + cnt] != NULL);
2596 ASSERT(pl_off + btop(ssize) <= npages);
2597 bcopy(plist[npages + cnt], &plist[pl_off],
2598 btop(ssize) * sizeof (page_t *));
2599 pl_off += btop(ssize);
2602 if (size == 0) {
2603 AS_LOCK_EXIT(as);
2604 ASSERT(cnt == segcnt - 1);
2605 *ppp = plist;
2606 return (0);
2610  * One of the pagelock calls failed.  The error type is in the error
2611  * variable.  Unlock what we've locked so far and retry with F_SOFTLOCK if
2612  * the error type is either EFAULT or ENOTSUP.  Otherwise just return the
2613  * error back to the caller.
2616 eaddr = addr;
2617 seg = sv_seg;
2619 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2620 if (addr >= seg->s_base + seg->s_size) {
2621 seg = AS_SEGNEXT(as, seg);
2622 ASSERT(seg != NULL && addr == seg->s_base);
2623 cnt++;
2624 ASSERT(cnt < segcnt);
2626 if (eaddr > seg->s_base + seg->s_size) {
2627 ssize = seg->s_base + seg->s_size - addr;
2628 } else {
2629 ssize = eaddr - addr;
2631 pl = &plist[npages + cnt];
2632 ASSERT(*pl != NULL);
2633 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2634 L_PAGEUNLOCK, rw);
2637 AS_LOCK_EXIT(as);
2639 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2641 if (error != ENOTSUP && error != EFAULT) {
2642 return (error);
2645 slow:
2647  * If we are here because pagelock failed due to the need to cow-fault
2648  * in the pages we want to lock, F_SOFTLOCK will do this job, and in the
2649  * next as_pagelock() call for this address range pagelock will
2650  * hopefully succeed.
2652 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2653 if (fault_err != 0) {
2654 return (fc_decode(fault_err));
2656 *ppp = NULL;
2658 return (0);
2662 * lock pages in a given address space. Return shadow list. If
2663 * the list is NULL, the MMU mapping is also locked.
2666 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2667 size_t size, enum seg_rw rw)
2669 size_t rsize;
2670 caddr_t raddr;
2671 faultcode_t fault_err;
2672 struct seg *seg;
2673 int err;
2675 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2676 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2677 (size_t)raddr;
2680  * if the request crosses more than one segment, let
2681  * as_pagelock_segs() handle it.
2683 AS_LOCK_ENTER(as, RW_READER);
2685 seg = as_segat(as, raddr);
2686 if (seg == NULL) {
2687 AS_LOCK_EXIT(as);
2688 return (EFAULT);
2690 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2691 if (raddr + rsize > seg->s_base + seg->s_size) {
2692 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2694 if (raddr + rsize <= raddr) {
2695 AS_LOCK_EXIT(as);
2696 return (EFAULT);
2700 * try to lock pages and pass back shadow list
2702 err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2704 AS_LOCK_EXIT(as);
2706 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2707 return (err);
2711  * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2712  * to no pagelock support for this segment or because pages need to be
2713  * cow-faulted in.  If a fault is needed, F_SOFTLOCK will do this job for
2714  * this as_pagelock() call, and in the next as_pagelock() call for the
2715  * same address range the pagelock call will hopefully succeed.
2717 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2718 if (fault_err != 0) {
2719 return (fc_decode(fault_err));
2721 *ppp = NULL;
2723 return (0);
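/*
 * Illustrative sketch (not part of the original source): the
 * as_pagelock()/as_pageunlock() pairing a driver-style caller would use
 * around a short transfer.  process() is hypothetical; note that the
 * shadow list may legitimately come back NULL when as_pagelock() fell
 * back to as_fault(), and as_pageunlock() handles that case.
 */
static int
example_locked_io(struct as *as, caddr_t addr, size_t size,
    int (*process)(caddr_t, size_t))
{
	struct page **pplist;
	int error;

	error = as_pagelock(as, &pplist, addr, size, S_WRITE);
	if (error != 0)
		return (error);

	error = process(addr, size);	/* pages can't be paged out here */

	as_pageunlock(as, pplist, addr, size, S_WRITE);
	return (error);
}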
2727 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2728 * lists from the end of plist and call pageunlock interface for each segment.
2729 * Drop as lock and free plist.
2731 static void
2732 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2733 struct page **plist, enum seg_rw rw)
2735 ulong_t cnt;
2736 caddr_t eaddr = addr + size;
2737 pgcnt_t npages = btop(size);
2738 size_t ssize;
2739 page_t **pl;
2741 ASSERT(AS_LOCK_HELD(as));
2742 ASSERT(seg != NULL);
2743 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2744 ASSERT(addr + size > seg->s_base + seg->s_size);
2745 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2746 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2747 ASSERT(plist != NULL);
2749 for (cnt = 0; addr < eaddr; addr += ssize) {
2750 if (addr >= seg->s_base + seg->s_size) {
2751 seg = AS_SEGNEXT(as, seg);
2752 ASSERT(seg != NULL && addr == seg->s_base);
2753 cnt++;
2755 if (eaddr > seg->s_base + seg->s_size) {
2756 ssize = seg->s_base + seg->s_size - addr;
2757 } else {
2758 ssize = eaddr - addr;
2760 pl = &plist[npages + cnt];
2761 ASSERT(*pl != NULL);
2762 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2763 L_PAGEUNLOCK, rw);
2765 ASSERT(cnt > 0);
2766 AS_LOCK_EXIT(as);
2768 cnt++;
2769 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2773 * unlock pages in a given address range
2775 void
2776 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2777 enum seg_rw rw)
2779 struct seg *seg;
2780 size_t rsize;
2781 caddr_t raddr;
2784  * if the shadow list is NULL, as_pagelock
2785  * fell back to as_fault
2787 if (pp == NULL) {
2788 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2789 return;
2792 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2793 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2794 (size_t)raddr;
2796 AS_LOCK_ENTER(as, RW_READER);
2797 seg = as_segat(as, raddr);
2798 ASSERT(seg != NULL);
2800 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2801 if (raddr + rsize <= seg->s_base + seg->s_size) {
2802 (void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2803 } else {
2804 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2805 return;
2807 AS_LOCK_EXIT(as);
2811 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2812 boolean_t wait)
2814 struct seg *seg;
2815 size_t ssize;
2816 caddr_t raddr; /* rounded down addr */
2817 size_t rsize; /* rounded up size */
2818 int error = 0;
2819 size_t pgsz = page_get_pagesize(szc);
2821 setpgsz_top:
2822 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2823 return (EINVAL);
2826 raddr = addr;
2827 rsize = size;
2829 if (raddr + rsize < raddr) /* check for wraparound */
2830 return (ENOMEM);
2832 AS_LOCK_ENTER(as, RW_WRITER);
2833 as_clearwatchprot(as, raddr, rsize);
2834 seg = as_segat(as, raddr);
2835 if (seg == NULL) {
2836 as_setwatch(as);
2837 AS_LOCK_EXIT(as);
2838 return (ENOMEM);
2841 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2842 if (raddr >= seg->s_base + seg->s_size) {
2843 seg = AS_SEGNEXT(as, seg);
2844 if (seg == NULL || raddr != seg->s_base) {
2845 error = ENOMEM;
2846 break;
2849 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2850 ssize = seg->s_base + seg->s_size - raddr;
2851 } else {
2852 ssize = rsize;
2855 retry:
2856 error = segop_setpagesize(seg, raddr, ssize, szc);
2858 if (error == IE_NOMEM) {
2859 error = EAGAIN;
2860 break;
2863 if (error == IE_RETRY) {
2864 AS_LOCK_EXIT(as);
2865 goto setpgsz_top;
2868 if (error == ENOTSUP) {
2869 error = EINVAL;
2870 break;
2873 if (wait && (error == EAGAIN)) {
2875 * Memory is currently locked. It must be unlocked
2876 * before this operation can succeed through a retry.
2877 * The possible reasons for locked memory and
2878 * corresponding strategies for unlocking are:
2879 * (1) Normal I/O
2880 * wait for a signal that the I/O operation
2881 * has completed and the memory is unlocked.
2882 * (2) Asynchronous I/O
2883 * The aio subsystem does not unlock pages when
2884 * the I/O is completed. Those pages are unlocked
2885 * when the application calls aiowait/aioerror.
2886 * So, to prevent blocking forever, cv_broadcast()
2887 * is done to wake up aio_cleanup_thread.
2888 * Subsequently, segvn_reclaim will be called, and
2889 * that will do AS_CLRUNMAPWAIT() and wake us up.
2890 * (3) Long term page locking:
2891 * This is not relevant for as_setpagesize()
2892 * because we cannot change the page size for
2893 * driver memory. The attempt to do so will
2894 * fail with a different error than EAGAIN so
2895 * there's no need to trigger as callbacks like
2896 * as_unmap, as_setprot or as_free would do.
2898 mutex_enter(&as->a_contents);
2899 if (!AS_ISNOUNMAPWAIT(as)) {
2900 if (AS_ISUNMAPWAIT(as) == 0) {
2901 cv_broadcast(&as->a_cv);
2903 AS_SETUNMAPWAIT(as);
2904 AS_LOCK_EXIT(as);
2905 while (AS_ISUNMAPWAIT(as)) {
2906 cv_wait(&as->a_cv, &as->a_contents);
2908 } else {
2910  * We may have raced with
2911  * segvn_reclaim()/segspt_reclaim(). In this
2912  * case clear the nounmapwait flag and retry,
2913  * since softlockcnt in this segment may
2914  * already be 0. We don't drop the as writer
2915  * lock, so our number of retries without
2916  * sleeping should be very small. See
2917  * segvn_reclaim() for more comments.
2919 AS_CLRNOUNMAPWAIT(as);
2920 mutex_exit(&as->a_contents);
2921 goto retry;
2923 mutex_exit(&as->a_contents);
2924 goto setpgsz_top;
2925 } else if (error != 0) {
2926 break;
2929 as_setwatch(as);
2930 AS_LOCK_EXIT(as);
2931 return (error);
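/*
 * Illustrative sketch (not part of the original source): request a larger
 * page size for an already-aligned range via as_setpagesize() above.  The
 * pre-check simply mirrors the alignment test the routine performs; the
 * choice to wait (B_TRUE) for locked memory to drain is an assumption.
 */
static int
example_use_large_pages(struct as *as, caddr_t addr, size_t size, uint_t szc)
{
	size_t pgsz = page_get_pagesize(szc);

	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz))
		return (EINVAL);	/* as_setpagesize() would reject it */

	return (as_setpagesize(as, addr, size, szc, B_TRUE));
}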
2935 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
2936 * in its chunk where s_szc is less than the szc we want to set.
2938 static int
2939 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2940 int *retry)
2942 struct seg *seg;
2943 size_t ssize;
2944 int error;
2946 ASSERT(AS_WRITE_HELD(as));
2948 seg = as_segat(as, raddr);
2949 if (seg == NULL) {
2950 panic("as_iset3_default_lpsize: no seg");
2953 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2954 if (raddr >= seg->s_base + seg->s_size) {
2955 seg = AS_SEGNEXT(as, seg);
2956 if (seg == NULL || raddr != seg->s_base) {
2957 panic("as_iset3_default_lpsize: as changed");
2960 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2961 ssize = seg->s_base + seg->s_size - raddr;
2962 } else {
2963 ssize = rsize;
2966 if (szc > seg->s_szc) {
2967 error = segop_setpagesize(seg, raddr, ssize, szc);
2968 /* Only retry on EINVAL segments that have no vnode. */
2969 if (error == EINVAL) {
2970 vnode_t *vp = NULL;
2971 if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
2972 (segop_getvp(seg, raddr, &vp) != 0 ||
2973 vp == NULL)) {
2974 *retry = 1;
2975 } else {
2976 *retry = 0;
2979 if (error) {
2980 return (error);
2984 return (0);
2988 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
2989 * pagesize on each segment in its range, but if any fails with EINVAL,
2990  * then it reduces the page size to the next size in the bitmap and
2991  * retries as_iset3_default_lpsize(). The code retries smaller allowed
2992  * sizes on EINVAL because (a) the anon offset may not match the bigger
2993  * sizes, and (b) it's hard to get this offset (to begin with) to pass
2994  * to map_pgszcvec().
2996 static int
2997 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2998 uint_t szcvec)
3000 int error;
3001 int retry;
3003 ASSERT(AS_WRITE_HELD(as));
3005 for (;;) {
3006 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3007 if (error == EINVAL && retry) {
3008 szcvec &= ~(1 << szc);
3009 if (szcvec <= 1) {
3010 return (EINVAL);
3012 szc = highbit(szcvec) - 1;
3013 } else {
3014 return (error);
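/*
 * Illustrative sketch (not part of the original source) of the size-code
 * bitmap walk above: drop the failing size from the vector and fall back
 * to the next largest remaining size, as the EINVAL retry loop does.
 */
static uint_t
example_next_smaller_szc(uint_t szcvec, uint_t failed_szc)
{
	szcvec &= ~(1 << failed_szc);	/* give up on the failing size */
	if (szcvec <= 1)
		return (0);		/* only the base page size is left */
	return (highbit(szcvec) - 1);	/* next largest remaining size code */
}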
3020 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3021 * segments have a smaller szc than we want to set. For each such area,
3022  * it calls as_iset2_default_lpsize().
3024 static int
3025 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3026 uint_t szcvec)
3028 struct seg *seg;
3029 size_t ssize;
3030 caddr_t setaddr = raddr;
3031 size_t setsize = 0;
3032 int set;
3033 int error;
3035 ASSERT(AS_WRITE_HELD(as));
3037 seg = as_segat(as, raddr);
3038 if (seg == NULL) {
3039 panic("as_iset1_default_lpsize: no seg");
3041 if (seg->s_szc < szc) {
3042 set = 1;
3043 } else {
3044 set = 0;
3047 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3048 if (raddr >= seg->s_base + seg->s_size) {
3049 seg = AS_SEGNEXT(as, seg);
3050 if (seg == NULL || raddr != seg->s_base) {
3051 panic("as_iset1_default_lpsize: as changed");
3053 if (seg->s_szc >= szc && set) {
3054 ASSERT(setsize != 0);
3055 error = as_iset2_default_lpsize(as,
3056 setaddr, setsize, szc, szcvec);
3057 if (error) {
3058 return (error);
3060 set = 0;
3061 } else if (seg->s_szc < szc && !set) {
3062 setaddr = raddr;
3063 setsize = 0;
3064 set = 1;
3067 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3068 ssize = seg->s_base + seg->s_size - raddr;
3069 } else {
3070 ssize = rsize;
3073 error = 0;
3074 if (set) {
3075 ASSERT(setsize != 0);
3076 error = as_iset2_default_lpsize(as, setaddr, setsize,
3077 szc, szcvec);
3079 return (error);
3083 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3084 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3085 * chunk to as_iset1_default_lpsize().
3087 static int
3088 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3089 int type)
3091 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3092 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3093 flags, rtype, 1);
3094 uint_t szc;
3095 uint_t nszc;
3096 int error;
3097 caddr_t a;
3098 caddr_t eaddr;
3099 size_t segsize;
3100 size_t pgsz;
3101 uint_t save_szcvec;
3103 ASSERT(AS_WRITE_HELD(as));
3104 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3105 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3107 szcvec &= ~1;
3108 if (szcvec <= 1) { /* skip if base page size */
3109 return (0);
3112 /* Get the pagesize of the first larger page size. */
3113 szc = lowbit(szcvec) - 1;
3114 pgsz = page_get_pagesize(szc);
3115 eaddr = addr + size;
3116 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3117 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3119 save_szcvec = szcvec;
3120 szcvec >>= (szc + 1);
3121 nszc = szc;
3122 while (szcvec) {
3123 if ((szcvec & 0x1) == 0) {
3124 nszc++;
3125 szcvec >>= 1;
3126 continue;
3128 nszc++;
3129 pgsz = page_get_pagesize(nszc);
3130 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3131 if (a != addr) {
3132 ASSERT(szc > 0);
3133 ASSERT(a < eaddr);
3134 segsize = a - addr;
3135 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3136 save_szcvec);
3137 if (error) {
3138 return (error);
3140 addr = a;
3142 szc = nszc;
3143 szcvec >>= 1;
3146 ASSERT(addr < eaddr);
3147 szcvec = save_szcvec;
3148 while (szcvec) {
3149 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3150 ASSERT(a >= addr);
3151 if (a != addr) {
3152 ASSERT(szc > 0);
3153 segsize = a - addr;
3154 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3155 save_szcvec);
3156 if (error) {
3157 return (error);
3159 addr = a;
3161 szcvec &= ~(1 << szc);
3162 if (szcvec) {
3163 szc = highbit(szcvec) - 1;
3164 pgsz = page_get_pagesize(szc);
3167 ASSERT(addr == eaddr);
3169 return (0);
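/*
 * Illustrative sketch (not part of the original source) of the chunking
 * idea used above, simplified to a single large page size: carve
 * [addr, addr + size) into an unaligned head, a large-page-aligned body
 * and an unaligned tail with the same P2ROUNDUP/P2ALIGN arithmetic.  The
 * chunk() callback is hypothetical.
 */
static void
example_carve_by_pgsz(caddr_t addr, size_t size, size_t pgsz,
    void (*chunk)(caddr_t, size_t, size_t))
{
	caddr_t eaddr = addr + size;
	caddr_t lo = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	caddr_t hi = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

	if (lo >= hi) {				/* no room for a large page */
		chunk(addr, size, PAGESIZE);
		return;
	}
	if (addr != lo)
		chunk(addr, lo - addr, PAGESIZE);	/* unaligned head */
	chunk(lo, hi - lo, pgsz);			/* large-page body */
	if (hi != eaddr)
		chunk(hi, eaddr - hi, PAGESIZE);	/* unaligned tail */
}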
3173 * Set the default large page size for the range. Called via memcntl with
3174 * page size set to 0. as_set_default_lpsize breaks the range down into
3175  * chunks with the same type/flags, ignores non-segvn segments, and passes
3176 * each chunk to as_iset_default_lpsize().
3179 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3181 struct seg *seg;
3182 caddr_t raddr;
3183 size_t rsize;
3184 size_t ssize;
3185 int rtype, rflags;
3186 int stype, sflags;
3187 int error;
3188 caddr_t setaddr;
3189 size_t setsize;
3190 int segvn;
3192 if (size == 0)
3193 return (0);
3195 AS_LOCK_ENTER(as, RW_WRITER);
3196 again:
3197 error = 0;
3199 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3200 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3201 (size_t)raddr;
3203 if (raddr + rsize < raddr) { /* check for wraparound */
3204 AS_LOCK_EXIT(as);
3205 return (ENOMEM);
3207 as_clearwatchprot(as, raddr, rsize);
3208 seg = as_segat(as, raddr);
3209 if (seg == NULL) {
3210 as_setwatch(as);
3211 AS_LOCK_EXIT(as);
3212 return (ENOMEM);
3214 if (seg->s_ops == &segvn_ops) {
3215 rtype = segop_gettype(seg, addr);
3216 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3217 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3218 segvn = 1;
3219 } else {
3220 segvn = 0;
3222 setaddr = raddr;
3223 setsize = 0;
3225 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3226 if (raddr >= (seg->s_base + seg->s_size)) {
3227 seg = AS_SEGNEXT(as, seg);
3228 if (seg == NULL || raddr != seg->s_base) {
3229 error = ENOMEM;
3230 break;
3232 if (seg->s_ops == &segvn_ops) {
3233 stype = segop_gettype(seg, raddr);
3234 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3235 stype &= (MAP_SHARED | MAP_PRIVATE);
3236 if (segvn && (rflags != sflags ||
3237 rtype != stype)) {
3239 * The next segment is also segvn but
3240 * has different flags and/or type.
3242 ASSERT(setsize != 0);
3243 error = as_iset_default_lpsize(as,
3244 setaddr, setsize, rflags, rtype);
3245 if (error) {
3246 break;
3248 rflags = sflags;
3249 rtype = stype;
3250 setaddr = raddr;
3251 setsize = 0;
3252 } else if (!segvn) {
3253 rflags = sflags;
3254 rtype = stype;
3255 setaddr = raddr;
3256 setsize = 0;
3257 segvn = 1;
3259 } else if (segvn) {
3260 /* The next segment is not segvn. */
3261 ASSERT(setsize != 0);
3262 error = as_iset_default_lpsize(as,
3263 setaddr, setsize, rflags, rtype);
3264 if (error) {
3265 break;
3267 segvn = 0;
3270 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3271 ssize = seg->s_base + seg->s_size - raddr;
3272 } else {
3273 ssize = rsize;
3276 if (error == 0 && segvn) {
3277 /* The last chunk when rsize == 0. */
3278 ASSERT(setsize != 0);
3279 error = as_iset_default_lpsize(as, setaddr, setsize,
3280 rflags, rtype);
3283 if (error == IE_RETRY) {
3284 goto again;
3285 } else if (error == IE_NOMEM) {
3286 error = EAGAIN;
3287 } else if (error == ENOTSUP) {
3288 error = EINVAL;
3289 } else if (error == EAGAIN) {
3290 mutex_enter(&as->a_contents);
3291 if (!AS_ISNOUNMAPWAIT(as)) {
3292 if (AS_ISUNMAPWAIT(as) == 0) {
3293 cv_broadcast(&as->a_cv);
3295 AS_SETUNMAPWAIT(as);
3296 AS_LOCK_EXIT(as);
3297 while (AS_ISUNMAPWAIT(as)) {
3298 cv_wait(&as->a_cv, &as->a_contents);
3300 mutex_exit(&as->a_contents);
3301 AS_LOCK_ENTER(as, RW_WRITER);
3302 } else {
3304  * We may have raced with
3305  * segvn_reclaim()/segspt_reclaim(). In this case
3306  * clear the nounmapwait flag and retry, since softlockcnt
3307  * in this segment may already be 0. We don't drop the as
3308  * writer lock, so our number of retries without
3309  * sleeping should be very small. See segvn_reclaim()
3310  * for more comments.
3312 AS_CLRNOUNMAPWAIT(as);
3313 mutex_exit(&as->a_contents);
3315 goto again;
3318 as_setwatch(as);
3319 AS_LOCK_EXIT(as);
3320 return (error);
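/*
 * Illustrative sketch (not part of the original source): how a
 * memcntl()-style caller might hand a user range to
 * as_set_default_lpsize() above.  The routine page-aligns the range and
 * takes the address-space lock itself, so the wrapper only screens out
 * the trivial case; the wrapper name is hypothetical.
 */
static int
example_default_lpsize(struct as *as, caddr_t addr, size_t len)
{
	if (len == 0)
		return (0);		/* nothing to do, as above */

	return (as_set_default_lpsize(as, addr, len));
}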
3324  * Set up all of the uninitialized watched pages that we can.
3326 void
3327 as_setwatch(struct as *as)
3329 struct watched_page *pwp;
3330 struct seg *seg;
3331 caddr_t vaddr;
3332 uint_t prot;
3333 int err, retrycnt;
3335 if (avl_numnodes(&as->a_wpage) == 0)
3336 return;
3338 ASSERT(AS_WRITE_HELD(as));
3340 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3341 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3342 retrycnt = 0;
3343 retry:
3344 vaddr = pwp->wp_vaddr;
3345 if (pwp->wp_oprot != 0 || /* already set up */
3346 (seg = as_segat(as, vaddr)) == NULL ||
3347 segop_getprot(seg, vaddr, 0, &prot) != 0)
3348 continue;
3350 pwp->wp_oprot = prot;
3351 if (pwp->wp_read)
3352 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3353 if (pwp->wp_write)
3354 prot &= ~PROT_WRITE;
3355 if (pwp->wp_exec)
3356 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3357 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3358 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3359 if (err == IE_RETRY) {
3360 pwp->wp_oprot = 0;
3361 ASSERT(retrycnt == 0);
3362 retrycnt++;
3363 goto retry;
3366 pwp->wp_prot = prot;
3371 * Clear all of the watched pages in the address space.
3373 void
3374 as_clearwatch(struct as *as)
3376 struct watched_page *pwp;
3377 struct seg *seg;
3378 caddr_t vaddr;
3379 uint_t prot;
3380 int err, retrycnt;
3382 if (avl_numnodes(&as->a_wpage) == 0)
3383 return;
3385 ASSERT(AS_WRITE_HELD(as));
3387 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3388 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3389 retrycnt = 0;
3390 retry:
3391 vaddr = pwp->wp_vaddr;
3392 if (pwp->wp_oprot == 0 || /* not set up */
3393 (seg = as_segat(as, vaddr)) == NULL)
3394 continue;
3396 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3397 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3398 if (err == IE_RETRY) {
3399 ASSERT(retrycnt == 0);
3400 retrycnt++;
3401 goto retry;
3404 pwp->wp_oprot = 0;
3405 pwp->wp_prot = 0;
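/*
 * Illustrative sketch (not part of the original source): the clear/re-arm
 * pattern used around operations that must see the original page
 * protections.  The op() callback is hypothetical; the address-space
 * write lock must be held, as as_clearwatch()/as_setwatch() require.
 */
static void
example_without_watchpoints(struct as *as, void (*op)(struct as *))
{
	ASSERT(AS_WRITE_HELD(as));

	as_clearwatch(as);	/* restore the original protections */
	op(as);			/* operate without watchpoint traps */
	as_setwatch(as);	/* re-arm the watched pages */
}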
3410 * Force a new setup for all the watched pages in the range.
3412 static void
3413 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3415 struct watched_page *pwp;
3416 struct watched_page tpw;
3417 caddr_t eaddr = addr + size;
3418 caddr_t vaddr;
3419 struct seg *seg;
3420 int err, retrycnt;
3421 uint_t wprot;
3422 avl_index_t where;
3424 if (avl_numnodes(&as->a_wpage) == 0)
3425 return;
3427 ASSERT(AS_WRITE_HELD(as));
3429 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3430 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3431 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3433 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3434 retrycnt = 0;
3435 vaddr = pwp->wp_vaddr;
3437 wprot = prot;
3438 if (pwp->wp_read)
3439 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3440 if (pwp->wp_write)
3441 wprot &= ~PROT_WRITE;
3442 if (pwp->wp_exec)
3443 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3444 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3445 retry:
3446 seg = as_segat(as, vaddr);
3447 if (seg == NULL) {
3448 panic("as_setwatchprot: no seg");
3449 /*NOTREACHED*/
3451 err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
3452 if (err == IE_RETRY) {
3453 ASSERT(retrycnt == 0);
3454 retrycnt++;
3455 goto retry;
3458 pwp->wp_oprot = prot;
3459 pwp->wp_prot = wprot;
3461 pwp = AVL_NEXT(&as->a_wpage, pwp);
3466 * Clear all of the watched pages in the range.
3468 static void
3469 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3471 caddr_t eaddr = addr + size;
3472 struct watched_page *pwp;
3473 struct watched_page tpw;
3474 uint_t prot;
3475 struct seg *seg;
3476 int err, retrycnt;
3477 avl_index_t where;
3479 if (avl_numnodes(&as->a_wpage) == 0)
3480 return;
3482 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3483 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3484 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3486 ASSERT(AS_WRITE_HELD(as));
3488 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3490 if ((prot = pwp->wp_oprot) != 0) {
3491 retrycnt = 0;
3493 if (prot != pwp->wp_prot) {
3494 retry:
3495 seg = as_segat(as, pwp->wp_vaddr);
3496 if (seg == NULL)
3497 continue;
3498 err = segop_setprot(seg, pwp->wp_vaddr,
3499 PAGESIZE, prot);
3500 if (err == IE_RETRY) {
3501 ASSERT(retrycnt == 0);
3502 retrycnt++;
3503 goto retry;
3507 pwp->wp_oprot = 0;
3508 pwp->wp_prot = 0;
3511 pwp = AVL_NEXT(&as->a_wpage, pwp);
3515 void
3516 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3518 struct proc *p;
3520 mutex_enter(&pidlock);
3521 for (p = practive; p; p = p->p_next) {
3522 if (p->p_as == as) {
3523 mutex_enter(&p->p_lock);
3524 if (p->p_as == as)
3525 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3526 mutex_exit(&p->p_lock);
3529 mutex_exit(&pidlock);
3533 * return memory object ID
3536 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3538 struct seg *seg;
3539 int sts;
3541 AS_LOCK_ENTER(as, RW_READER);
3542 seg = as_segat(as, addr);
3543 if (seg == NULL) {
3544 AS_LOCK_EXIT(as);
3545 return (EFAULT);
3548 sts = segop_getmemid(seg, addr, memidp);
3550 AS_LOCK_EXIT(as);
3551 return (sts);