 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015, Joyent, Inc.  All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
clock_t deadlk_wait = 1;	/* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);

/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#define	VERIFY_SEGLIST
int do_as_verify = 0;
/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list. A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback *current_head, *cb;
	caddr_t saddr;
	size_t rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)saddr;

	/* check for wraparound */
	if (saddr + rsize < saddr)
		return (ENOMEM);

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}
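
/*
 * Illustrative sketch (not part of the original file): a driver that keeps
 * pages locked for a long time might register for unmap/setprot events on
 * the range it locked, using its own callback function and argument
 * (my_unlock_cb and my_arg below are hypothetical driver-defined names):
 *
 *	error = as_add_callback(as, my_unlock_cb, my_arg,
 *	    AS_UNMAP_EVENT | AS_SETPROT_EVENT, vaddr, size, KM_SLEEP);
 *
 * and later remove it with as_delete_callback(as, my_arg), treating an
 * AS_CALLBACK_DELETE_DEFERRED return as "the callback is still running and
 * as_do_callbacks will remove the entry".
 */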
/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *	AS_CALLBACK_DELETED  (callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *			entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
			continue;

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.  In
		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
		break;
	}
	mutex_exit(&as->a_contents);
	return (rc);
}
/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with more
 * locks held but the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
			continue;
		}
		break;
	}
	return (cb);
}
/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void *cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}

	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}
/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - not event of interest
 *    - not address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
}
/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
	struct seg *seg = as->a_seglast;
	avl_index_t where;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
}
#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
		if (seg == seglast)
			seglast = NULL;
		nsegs++;
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */
/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;
again:

	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
				extern const struct seg_ops segnf_ops;

				/*
				 * no-fault segs must disappear if overlaid.
				 * XXX need new segment type so
				 * we don't have to check s_ops
				 */
				if (seg->s_ops == &segnf_ops) {
					seg_unmap(seg);
					goto again;
				}
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}
int
as_removeseg(struct as *as, struct seg *seg)
{
	avl_tree_t *t;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (seg == NULL)
		return (-1);

	t = &as->a_segtree;
	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */
	avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}
/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}
/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}
/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}
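
/*
 * Illustrative sketch (not part of the original file): callers that pick a
 * virtual address and then map it typically bracket the hole search and the
 * map with the range lock so two threads cannot claim the same hole:
 *
 *	as_rangelock(as);
 *	if (as_gap(as, len, &base, &len, AH_LO, NULL) == 0)
 *		error = as_map(as, base, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 *
 * (base, len and crargs above are hypothetical locals of the caller.)
 */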
/*
 * compare segments (or just an address) by segment address range
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}
void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
	struct as *as;

	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_seglast = NULL;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;

	AS_LOCK_ENTER(as, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
	AS_LOCK_EXIT(as);

	return (as);
}
/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	boolean_t free_started = B_FALSE;

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, RW_WRITER);

	if (!free_started) {
		free_started = B_TRUE;
		hat_free_start(hat);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
retry:
		err = segop_unmap(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				AS_LOCK_EXIT(as);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	AS_LOCK_EXIT(as);

	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}
int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t purgesize = 0;
	int error;

	AS_LOCK_ENTER(as, RW_WRITER);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, RW_WRITER);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas);
			as_free(newas);
			AS_LOCK_EXIT(as);
			return (-1);
		}
		if ((error = segop_dup(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			seg_free(newseg);
			AS_LOCK_EXIT(newas);
			as_free(newas);
			AS_LOCK_EXIT(as);
			return (error);
		}
		newas->a_size += seg->s_size;
	}
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

	AS_LOCK_EXIT(newas);
	AS_LOCK_EXIT(as);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}
/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;
	faultcode_t res = 0;
	caddr_t addrsav;
	struct seg *segsav;
	int as_lock_held;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting for a
	 * pagefault.  This is to avoid deadlock while debugging a process
	 * via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * same length must be used when we softlock and softunlock.  We
	 * don't support softunlocking lengths less than the original length
	 * when there is largepage support.  See seg_dev.c for more
	 * comments.
	 */
	switch (type) {
	case F_SOFTLOCK:
		CPU_STATS_ADD_K(vm, softlock, 1);
		break;
	case F_PROT:
		CPU_STATS_ADD_K(vm, prot_fault, 1);
		break;
	case F_INVAL:
		CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
		if (as == &kas)
			CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
		break;
	default:
		break;
	}

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, RW_READER);

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			if (lwp != NULL)
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		res = segop_fault(hat, seg, raddr, ssize, type, rw);
		if (res != 0)
			break;
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * an address that is outside the seg's range.)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) segop_fault(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
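
/*
 * Illustrative sketch (not part of the original file): kernel code that
 * needs user pages to stay resident across an access typically soft-locks
 * the range and later soft-unlocks the same range with the same length:
 *
 *	if (as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_WRITE) == 0) {
 *		... access the pages ...
 *		(void) as_fault(as->a_hat, as, addr, len, F_SOFTUNLOCK,
 *		    S_WRITE);
 *	}
 *
 * As noted above, the unlock must use the same length that was used for
 * the lock.
 */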
/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	faultcode_t res = 0;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		if (lwp != NULL)
			lwp->lwp_nostop--;
		return (FC_NOMAP);
	}

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		res = segop_faulta(seg, raddr);
		if (res != 0)
			break;
	}
	AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * we only want to lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = segop_setprot(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as);
	return (error);
}
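
/*
 * Illustrative sketch (not part of the original file): mprotect()-style
 * callers simply pass the user range and the desired protections, e.g. to
 * make one page of a user mapping read-only:
 *
 *	error = as_setprot(as, addr, PAGESIZE, PROT_READ | PROT_USER);
 *
 * A non-zero return (for example ENOMEM when the range contains a hole)
 * indicates failure.
 */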
/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = segop_checkprot(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		rsize = 0;
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			rsize = ssize;

retry:
		err = segop_unmap(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			return (-1);
		}

		as->a_size -= ssize;
		if (rsize)
			as->a_resvsize -= rsize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as);
	return (0);
}
static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	struct seg *seg;
	size_t pgsz;
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);

	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL)
			return (ENOMEM);
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(a < eaddr);
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL)
				return (ENOMEM);
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = 1;
			if (do_off)
				vn_a->offset += segsize;
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1; /* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL)
				return (ENOMEM);
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = 1;
			if (do_off)
				vn_a->offset += segsize;
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}
static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct seg *seg;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL)
			return (ENOMEM);
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (fop_getattr(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0)
		return (error);
	if (save_size) {
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}
/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szcvec;
	uchar_t type;

	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}
int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
	AS_LOCK_ENTER(as, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}
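
/*
 * Illustrative sketch (not part of the original file): a typical mmap()-path
 * caller fills in a struct segvn_crargs and lets segvn_create build the
 * segment (crargs below is a hypothetical local of the caller):
 *
 *	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 *
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *
 * as_map takes the address space lock as writer and as_map_locked drops it
 * before returning, so the caller must not already hold a_lock.
 */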
int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
    void *argsp)
{
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	int unmap = 0;
	/*
	 * The use of a_proc is preferred to handle the case where curproc is
	 * a door_call server and is allocating memory in the client's (a_proc)
	 * address space.
	 * When creating a shared memory segment a_proc will be NULL so we
	 * fallback to curproc in that case.
	 */
	struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		AS_LOCK_EXIT(as);

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (unmap)
				(void) as_unmap(as, addr, size);
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (unmap)
				(void) as_unmap(as, addr, size);
			return (error);
		}
	} else {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			return (ENOMEM);
		}

		error = (*crfp)(seg, argsp);
		if (error != 0) {
			seg_free(seg);
			AS_LOCK_EXIT(as);
			return (error);
		}
		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
		as->a_resvsize += rsize;
	}

	as_setwatch(as);

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
	}
	return (error);
}
/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			(void) segop_unmap(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}
/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
 * the hole that is within range, and 0 is returned.  On failure, -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as);
			return (0);
		} else {
			AS_LOCK_EXIT(as);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we reset
	 * sseg to denote the highest-addressed segment.
	 */
	forward = (flags & AH_DIR) == AH_LO;
	if (forward) {
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);
	} else {

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
			lseg = hseg;
			hseg = NULL;
		} else {
			lseg = AS_SEGPREV(as, hseg);
		}
	}

	for (;;) {
		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)
			goto cont;
		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */
		if (lo < lobound)
			lo = lobound;
		if (hi > hibound)
			hi = hibound;
		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.  If the hole is too small, no need
		 * to do the further checks since they will fail.
		 */
		*basep = lo;
		*lenp = hi - lo;
		if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
		    minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
		    ((flags & AH_CONTAIN) == 0 ||
		    (*basep <= addr && *basep + *lenp > addr))) {
			if (!forward)
				as->a_lastgap = hseg;
			if (hseg != NULL)
				as->a_lastgaphl = hseg;
			else
				as->a_lastgaphl = lseg;
			AS_LOCK_EXIT(as);
			return (0);
		}
	cont:
		/*
		 * Move to the next hole.
		 */
		if (forward) {
			lseg = hseg;
			if (lseg == NULL)
				break;
			hseg = AS_SEGNEXT(as, hseg);
		} else {
			hseg = lseg;
			if (hseg == NULL)
				break;
			lseg = AS_SEGPREV(as, lseg);
		}
	}
	if (fast_path && (align != 0 || save_redzone != 0)) {
		fast_path = 0;
		minlen = save_minlen;
		redzone = save_redzone;
		goto retry;
	}
	*basep = save_base;
	*lenp = save_len;
	AS_LOCK_EXIT(as);
	return (-1);
}
/*
 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, base and len are set to reflect the part of
 * the hole that is within range, and 0 is returned, otherwise,
 * -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
    caddr_t addr)
{
	return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
}
/*
 * Return the next range within [base, base + len) that is backed
 * with "real memory".  Skip holes and non-seg_vn segments.
 * We're lazy and only return one segment at a time.
 */
int
as_memory(struct as *as, caddr_t *basep, size_t *lenp)
{
	extern const struct seg_ops segspt_shmops;	/* needs a header file */
	struct seg *seg;
	caddr_t addr, eaddr;
	caddr_t segend;

	AS_LOCK_ENTER(as, RW_READER);

	addr = *basep;
	eaddr = addr + *lenp;

	seg = as_findseg(as, addr, 0);
	if (seg != NULL)
		addr = MAX(seg->s_base, addr);

	for (;;) {
		if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
			AS_LOCK_EXIT(as);
			return (EFAULT);
		}

		if (seg->s_ops == &segvn_ops) {
			segend = seg->s_base + seg->s_size;
			break;
		}

		/*
		 * We do ISM by looking into the private data
		 * to determine the real size of the segment.
		 */
		if (seg->s_ops == &segspt_shmops) {
			segend = seg->s_base + spt_realsize(seg);
			if (addr < segend)
				break;
		}

		seg = AS_SEGNEXT(as, seg);

		if (seg != NULL)
			addr = seg->s_base;
	}

	*basep = addr;

	if (segend > eaddr)
		*lenp = eaddr - addr;
	else
		*lenp = segend - addr;

	AS_LOCK_EXIT(as);
	return (0);
}
/*
 * Determine whether data from the mappings in interval [addr, addr + size)
 * are in the primary memory (core) cache.
 */
int
as_incore(struct as *as, caddr_t addr,
    size_t size, char *vec, size_t *sizep)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;		/* rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t isize;		/* iteration size */
	int error = 0;		/* result, assume success */

	*sizep = 0;
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (-1);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = -1;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
		*sizep += isize = segop_incore(seg, raddr, ssize, vec);
		if (isize != ssize) {
			error = -1;
			break;
		}
		vec += btopr(ssize);
	}
	AS_LOCK_EXIT(as);
	return (error);
}
static void
as_segunlock(struct seg *seg, caddr_t addr, int attr,
    ulong_t *bitmap, size_t position, size_t npages)
{
	caddr_t range_start;
	size_t pos1 = position;
	size_t pos2;
	size_t size;
	size_t end_pos = npages + position;

	while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
		size = ptob((pos2 - pos1));
		range_start = (caddr_t)((uintptr_t)addr +
		    ptob(pos1 - position));

		(void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
		    (ulong_t *)NULL, (size_t)0);
		pos1 = pos2;
	}
}
static void
as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
    caddr_t raddr, size_t rsize)
{
	struct seg *seg = as_segat(as, raddr);
	size_t ssize;

	while (rsize != 0) {
		if (raddr >= seg->s_base + seg->s_size)
			seg = AS_SEGNEXT(as, seg);

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));

		rsize -= ssize;
		raddr += ssize;
	}
}
/*
 * Cache control operations over the interval [addr, addr + size) in
 * address space "as".
 */
int
as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
    uintptr_t arg, ulong_t *lock_map, size_t pos)
{
	struct seg *seg;	/* working segment */
	caddr_t raddr;		/* rounded down addr */
	caddr_t initraddr;	/* saved initial rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t initrsize;	/* saved initial rounded up size */
	size_t ssize;		/* size of seg */
	int error = 0;		/* result */
	size_t mlock_size;	/* size of bitmap */
	ulong_t *mlock_map;	/* pointer to bitmap used */
				/* to represent the locked */
				/* pages. */

retry:
	if (error == IE_RETRY)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);

	/*
	 * If these are address space lock/unlock operations, loop over
	 * all segments in the address space, as appropriate.
	 */
	if (func == MC_LOCKAS) {
		size_t npages = 0;
		size_t rlen = 0;	/* rounded as length */
		ulong_t idx = 0;

		if (arg & MCL_FUTURE) {
			mutex_enter(&as->a_contents);
			AS_SETPGLCK(as);
			mutex_exit(&as->a_contents);
		}
		if ((arg & MCL_CURRENT) == 0) {
			AS_LOCK_EXIT(as);
			return (0);
		}

		seg = AS_SEGFIRST(as);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			return (0);
		}

		do {
			raddr = (caddr_t)((uintptr_t)seg->s_base &
			    (uintptr_t)PAGEMASK);
			rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
			    PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
		} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

		mlock_size = BT_BITOUL(btopr(rlen));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
			AS_LOCK_EXIT(as);
			return (EAGAIN);
		}

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = segop_lockop(seg, seg->s_base,
			    seg->s_size, attr, MC_LOCK, mlock_map, pos);
			if (error != 0)
				break;
			pos += seg_pages(seg);
		}

		if (error) {
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg)) {

				raddr = (caddr_t)((uintptr_t)seg->s_base &
				    (uintptr_t)PAGEMASK);
				npages = seg_pages(seg);
				as_segunlock(seg, raddr, attr, mlock_map,
				    idx, npages);
				idx += npages;
			}
		}

		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
		AS_LOCK_EXIT(as);
		goto lockerr;
	} else if (func == MC_UNLOCKAS) {
		mutex_enter(&as->a_contents);
		AS_CLRPGLCK(as);
		mutex_exit(&as->a_contents);

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = segop_lockop(seg, seg->s_base,
			    seg->s_size, attr, MC_UNLOCK, NULL, 0);
			if (error != 0)
				break;
		}

		AS_LOCK_EXIT(as);
		goto lockerr;
	}

	/*
	 * Normalize addresses and sizes.
	 */
	initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	/*
	 * Get initial segment.
	 */
	if ((seg = as_segat(as, raddr)) == NULL) {
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	if (func == MC_LOCK) {
		mlock_size = BT_BITOUL(btopr(rsize));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
			AS_LOCK_EXIT(as);
			return (EAGAIN);
		}
	}

	/*
	 * Loop over all segments.  If a hole in the address range is
	 * discovered, then fail.  For each segment, perform the appropriate
	 * control operation.
	 */
	while (rsize != 0) {

		/*
		 * Make sure there's no hole, calculate the portion
		 * of the next segment to be operated over.
		 */
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				if (func == MC_LOCK) {
					as_unlockerr(as, attr, mlock_map,
					    initraddr, initrsize - rsize);
					kmem_free(mlock_map,
					    mlock_size * sizeof (ulong_t));
				}
				AS_LOCK_EXIT(as);
				return (ENOMEM);
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		/*
		 * Dispatch on specific function.
		 */
		switch (func) {

		/*
		 * Synchronize cached data from mappings with backing
		 * objects.
		 */
		case MC_SYNC:
			if (error = segop_sync(seg, raddr, ssize,
			    attr, (uint_t)arg)) {
				AS_LOCK_EXIT(as);
				return (error);
			}
			break;

		/*
		 * Lock pages in memory.
		 */
		case MC_LOCK:
			if (error = segop_lockop(seg, raddr, ssize,
			    attr, func, mlock_map, pos)) {
				as_unlockerr(as, attr, mlock_map, initraddr,
				    initrsize - rsize + ssize);
				kmem_free(mlock_map, mlock_size *
				    sizeof (ulong_t));
				AS_LOCK_EXIT(as);
				goto lockerr;
			}
			break;

		/*
		 * Unlock mapped pages.
		 */
		case MC_UNLOCK:
			(void) segop_lockop(seg, raddr, ssize, attr, func,
			    (ulong_t *)NULL, (size_t)0);
			break;

		/*
		 * Store VM advise for mapped pages in segment layer.
		 */
		case MC_ADVISE:
			error = segop_advise(seg, raddr, ssize, (uint_t)arg);

			/*
			 * Check for regular errors and special retry error
			 */
			if (error) {
				if (error == IE_RETRY) {
					/*
					 * Need to acquire writers lock, so
					 * have to drop readers lock and start
					 * all over again
					 */
					AS_LOCK_EXIT(as);
					goto retry;
				} else if (error == IE_REATTACH) {
					/*
					 * Find segment for current address
					 * because current segment just got
					 * split or concatenated
					 */
					seg = as_segat(as, raddr);
					if (seg == NULL) {
						AS_LOCK_EXIT(as);
						return (ENOMEM);
					}
				} else {
					/*
					 * Regular error
					 */
					AS_LOCK_EXIT(as);
					return (error);
				}
			}
			break;

		case MC_INHERIT_ZERO:
			error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
			if (error != 0) {
				AS_LOCK_EXIT(as);
				return (error);
			}
			break;

		default:
			panic("as_ctl: bad operation %d", func);
			/*NOTREACHED*/
		}

		rsize -= ssize;
		raddr += ssize;
	}

	if (func == MC_LOCK)
		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
	AS_LOCK_EXIT(as);
	return (0);
lockerr:

	/*
	 * If the lower levels returned EDEADLK for a segment lockop,
	 * it means that we should retry the operation.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (error == EDEADLK) {
		delay(deadlk_wait);
		error = 0;
		goto retry;
	}
	return (error);
}
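
/*
 * Illustrative sketch (not part of the original file): memcntl()-style
 * callers drive MC_LOCK/MC_UNLOCK through as_ctl; this file itself issues
 * the same call when a new mapping is created in a page-locked address
 * space:
 *
 *	error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
 *	...
 *	(void) as_ctl(as, addr, size, MC_UNLOCK, 0, 0, NULL, 0);
 *
 * MC_LOCKAS/MC_UNLOCKAS operate on every segment and take the mlockall
 * flags (MCL_CURRENT/MCL_FUTURE) in arg instead of an address range.
 */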
int
fc_decode(faultcode_t fault_err)
{
	int error = 0;

	switch (FC_CODE(fault_err)) {
	case FC_OBJERR:
		error = FC_ERRNO(fault_err);
		break;
	case FC_PROT:
		error = EACCES;
		break;
	default:
		error = EFAULT;
		break;
	}
	return (error);
}

/*
 * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
 * lists from each segment and copy them to one contiguous shadow list (plist)
 * as expected by the caller.  Save pointers to per segment shadow lists at
 * the tail of plist so that they can be used during as_pageunlock().
 */
static int
as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
    caddr_t addr, size_t size, enum seg_rw rw)
{
	caddr_t sv_addr = addr;
	size_t sv_size = size;
	struct seg *sv_seg = seg;
	ulong_t segcnt = 1;
	ulong_t cnt;
	size_t ssize;
	pgcnt_t npages = btop(size);
	page_t **plist;
	page_t **pl;
	caddr_t eaddr;
	int error;
	faultcode_t fault_err = 0;
	pgcnt_t pl_off;
	extern const struct seg_ops segspt_shmops;

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));

	/*
	 * Count the number of segments covered by the range we are about to
	 * lock. The segment count is used to size the shadow list we return
	 * back to the caller.
	 */
	for (; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || addr != seg->s_base) {
				AS_LOCK_EXIT(as);
				return (EFAULT);
			}
			/*
			 * Do a quick check if subsequent segments
			 * will most likely support pagelock.
			 */
			if (seg->s_ops == &segvn_ops) {
				vnode_t *vp;

				if (segop_getvp(seg, addr, &vp) != 0 ||
				    vp != NULL) {
					AS_LOCK_EXIT(as);
					goto slow;
				}
			} else if (seg->s_ops != &segspt_shmops) {
				AS_LOCK_EXIT(as);
				goto slow;
			}
			segcnt++;
		}
		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = size;
		}
	}
	ASSERT(segcnt > 1);

	plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);

	addr = sv_addr;
	size = sv_size;
	seg = sv_seg;

	for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
			ASSERT(cnt < segcnt);
		}
		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = size;
		}
		pl = &plist[npages + cnt];
		error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGELOCK, rw);
		if (error) {
			break;
		}
		ASSERT(plist[npages + cnt] != NULL);
		ASSERT(pl_off + btop(ssize) <= npages);
		bcopy(plist[npages + cnt], &plist[pl_off],
		    btop(ssize) * sizeof (page_t *));
		pl_off += btop(ssize);
	}

	if (size == 0) {
		AS_LOCK_EXIT(as);
		ASSERT(cnt == segcnt - 1);
		*ppp = plist;
		return (0);
	}

	/*
	 * one of pagelock calls failed. The error type is in error variable.
	 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
	 * type is either EFAULT or ENOTSUP. Otherwise just return the error
	 * back to the caller.
	 */

	eaddr = addr;
	seg = sv_seg;

	for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
			ASSERT(cnt < segcnt);
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}

	AS_LOCK_EXIT(as);

	kmem_free(plist, (npages + segcnt) * sizeof (page_t *));

	if (error != ENOTSUP && error != EFAULT) {
		return (error);
	}

slow:
	/*
	 * If we are here because pagelock failed due to the need to cow fault
	 * in the pages we want to lock F_SOFTLOCK will do this job and in
	 * next as_pagelock() call for this address range pagelock will
	 * hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	*ppp = NULL;

	return (0);
}
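
/*
 * Illustrative layout note, not part of the original source: for a request of
 * `npages' pages spanning `segcnt' segments, the shadow list built above is
 * laid out as
 *
 *	plist[0 .. npages - 1]			flattened page_t pointers
 *						returned to the caller
 *	plist[npages .. npages + segcnt - 1]	per-segment shadow list
 *						pointers, consumed later by
 *						as_pageunlock_segs()
 */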

/*
 * lock pages in a given address space. Return shadow list. If
 * the list is NULL, the MMU mapping is also locked.
 */
int
as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
    size_t size, enum seg_rw rw)
{
	size_t rsize;
	caddr_t raddr;
	faultcode_t fault_err;
	struct seg *seg;
	int err;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * if the request crosses two segments let
	 * as_fault handle it.
	 */
	AS_LOCK_ENTER(as, RW_READER);

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}
	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize > seg->s_base + seg->s_size) {
		return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
	}
	if (raddr + rsize <= raddr) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}

	/*
	 * try to lock pages and pass back shadow list
	 */
	err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);

	AS_LOCK_EXIT(as);

	if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
		return (err);
	}

	/*
	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
	 * to no pagelock support for this segment or pages need to be cow
	 * faulted in. If fault is needed F_SOFTLOCK will do this job for
	 * this as_pagelock() call and in the next as_pagelock() call for the
	 * same address range pagelock call will hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	*ppp = NULL;

	return (0);
}

/*
 * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
 * lists from the end of plist and call pageunlock interface for each segment.
 * Drop as lock and free plist.
 */
static void
as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
    struct page **plist, enum seg_rw rw)
{
	ulong_t cnt;
	caddr_t eaddr = addr + size;
	pgcnt_t npages = btop(size);
	size_t ssize;
	page_t **pl;

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(plist != NULL);

	for (cnt = 0; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}

	AS_LOCK_EXIT(as);

	cnt++;
	kmem_free(plist, (npages + cnt) * sizeof (page_t *));
}

/*
 * unlock pages in a given address range
 */
void
as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
    enum seg_rw rw)
{
	struct seg *seg;
	size_t rsize;
	caddr_t raddr;

	/*
	 * if the shadow list is NULL, as_pagelock was
	 * falling back to as_fault
	 */
	if (pp == NULL) {
		(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
		return;
	}

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	ASSERT(seg != NULL);

	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize <= seg->s_base + seg->s_size) {
		(void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
	} else {
		as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
		return;
	}
	AS_LOCK_EXIT(as);
}
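
/*
 * Illustrative usage sketch, not part of the original source: a physio-style
 * caller is assumed to pair as_pagelock() and as_pageunlock() around the
 * transfer; do_transfer() below is a hypothetical placeholder:
 *
 *	struct page **pplist;
 *	int err;
 *
 *	err = as_pagelock(as, &pplist, uaddr, len, S_WRITE);
 *	if (err != 0)
 *		return (err);
 *	err = do_transfer(uaddr, len, pplist);
 *	as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *	return (err);
 */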

int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    boolean_t wait)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;
	size_t pgsz = page_get_pagesize(szc);

setpgsz_top:
	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
		return (EINVAL);
	}

	raddr = addr;
	rsize = size;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

retry:
		error = segop_setpagesize(seg, raddr, ssize, szc);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as);
			goto setpgsz_top;
		}

		if (error == ENOTSUP) {
			error = EINVAL;
			break;
		}

		if (wait && (error == EAGAIN)) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	This is not relevant for as_setpagesize()
			 *	because we cannot change the page size for
			 *	driver memory. The attempt to do so will
			 *	fail with a different error than EAGAIN so
			 *	there's no need to trigger as callbacks like
			 *	as_unmap, as_setprot or as_free would do.
			 */
			mutex_enter(&as->a_contents);
			if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0) {
					cv_broadcast(&as->a_cv);
				}
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as)) {
					cv_wait(&as->a_cv, &as->a_contents);
				}
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setpgsz_top;
		} else if (error != 0) {
			break;
		}
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
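
/*
 * Illustrative note, not part of the original source: both addr and size must
 * be aligned to the page size implied by szc.  Assuming szc selects a 2MB
 * large page:
 *
 *	size_t pgsz = page_get_pagesize(szc);		0x200000
 *	IS_P2ALIGNED((caddr_t)0x80000000, pgsz)		nonzero, accepted
 *	IS_P2ALIGNED((caddr_t)0x80001000, pgsz)		zero, fails with EINVAL
 */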

/*
 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
 * in its chunk where s_szc is less than the szc we want to set.
 */
static int
as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    int *retry)
{
	struct seg *seg;
	size_t ssize;
	int error;

	ASSERT(AS_WRITE_HELD(as));

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		panic("as_iset3_default_lpsize: no seg");
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				panic("as_iset3_default_lpsize: as changed");
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

		if (szc > seg->s_szc) {
			error = segop_setpagesize(seg, raddr, ssize, szc);
			/* Only retry on EINVAL segments that have no vnode. */
			if (error == EINVAL) {
				vnode_t *vp = NULL;

				if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
				    (segop_getvp(seg, raddr, &vp) != 0 ||
				    vp == NULL)) {
					*retry = 1;
				} else {
					*retry = 0;
				}
			}
			if (error) {
				return (error);
			}
		}
	}
	return (0);
}

/*
 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
 * pagesize on each segment in its range, but if any fails with EINVAL,
 * then it reduces the pagesizes to the next size in the bitmap and
 * retries as_iset3_default_lpsize(). The reason why the code retries
 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
 * match the bigger sizes, and (b) it's hard to get this offset (to begin
 * with) to pass to map_pgszcvec().
 */
static int
as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    uint_t szcvec)
{
	int error;
	int retry;

	ASSERT(AS_WRITE_HELD(as));

	for (;;) {
		error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
		if (error == EINVAL && retry) {
			szcvec &= ~(1 << szc);
			if (szcvec <= 1) {
				return (EINVAL);
			}
			szc = highbit(szcvec) - 1;
		} else {
			return (error);
		}
	}
}
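
/*
 * Illustrative example, not part of the original source: suppose szcvec is
 * 0x16 (size codes 1, 2 and 4 allowed) and the attempt at szc = 4 fails with
 * a retryable EINVAL.  The loop above then steps down through:
 *
 *	szcvec &= ~(1 << 4)  ->  0x06,  szc = highbit(0x06) - 1 = 2
 *	szcvec &= ~(1 << 2)  ->  0x02,  szc = highbit(0x02) - 1 = 1
 *
 * and finally returns EINVAL once szcvec drops to 1 or below.
 */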

/*
 * as_iset1_default_lpsize() breaks its chunk into areas where existing
 * segments have a smaller szc than we want to set. For each such area,
 * it calls as_iset2_default_lpsize().
 */
static int
as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    uint_t szcvec)
{
	struct seg *seg;
	size_t ssize;
	caddr_t setaddr = raddr;
	size_t setsize = 0;
	int set;
	int error;

	ASSERT(AS_WRITE_HELD(as));

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		panic("as_iset1_default_lpsize: no seg");
	}
	if (seg->s_szc < szc) {
		set = 1;
	} else {
		set = 0;
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				panic("as_iset1_default_lpsize: as changed");
			}
			if (seg->s_szc >= szc && set) {
				ASSERT(setsize != 0);
				error = as_iset2_default_lpsize(as,
				    setaddr, setsize, szc, szcvec);
				if (error) {
					return (error);
				}
				set = 0;
			} else if (seg->s_szc < szc && !set) {
				setaddr = raddr;
				setsize = 0;
				set = 1;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}
	}
	error = 0;
	if (set) {
		ASSERT(setsize != 0);
		error = as_iset2_default_lpsize(as, setaddr, setsize,
		    szc, szcvec);
	}
	return (error);
}

/*
 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
 * chunk to as_iset1_default_lpsize().
 */
static int
as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
    int type)
{
	int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
	    flags, rtype, 1);
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	size_t pgsz;
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));

	szcvec &= ~1;
	if (szcvec <= 1) {	/* skip if base page size */
		return (0);
	}

	/* Get the pagesize of the first larger page size. */
	szc = lowbit(szcvec) - 1;
	pgsz = page_get_pagesize(szc);
	eaddr = addr + size;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

	save_szcvec = szcvec;
	szcvec >>= (szc + 1);
	nszc = szc;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(szc > 0);
			ASSERT(a < eaddr);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec;
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			ASSERT(szc > 0);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}
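
/*
 * Illustrative example, not part of the original source, with hypothetical
 * sizes: for a range whose ends are only aligned to a smaller large page
 * size, the two loops above carve it into an ascending run of chunks, the
 * fully aligned middle, then a descending run, e.g.
 *
 *	| 64K-aligned head | 2MB-aligned middle | 64K-aligned tail |
 *	  as_iset1(szc64K)   as_iset1(szc2MB)     as_iset1(szc64K)
 *
 * so each chunk is handed the largest size code whose alignment it satisfies.
 */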

/*
 * Set the default large page size for the range. Called via memcntl with
 * page size set to 0. as_set_default_lpsize breaks the range down into
 * chunks with the same type/flags, ignores non-segvn segments, and passes
 * each chunk to as_iset_default_lpsize().
 */
int
as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;
	size_t rsize;
	size_t ssize;
	int rtype, rflags;
	int stype, sflags;
	int error;
	caddr_t setaddr;
	size_t setsize;
	int segvn;

	if (size == 0)
		return (0);

	AS_LOCK_ENTER(as, RW_WRITER);
again:
	error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}
	if (seg->s_ops == &segvn_ops) {
		rtype = segop_gettype(seg, addr);
		rflags = rtype & (MAP_TEXT | MAP_INITDATA);
		rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
		segvn = 1;
	} else {
		segvn = 0;
	}
	setaddr = raddr;
	setsize = 0;

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= (seg->s_base + seg->s_size)) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
			if (seg->s_ops == &segvn_ops) {
				stype = segop_gettype(seg, raddr);
				sflags = stype & (MAP_TEXT | MAP_INITDATA);
				stype &= (MAP_SHARED | MAP_PRIVATE);
				if (segvn && (rflags != sflags ||
				    rtype != stype)) {
					/*
					 * The next segment is also segvn but
					 * has different flags and/or type.
					 */
					ASSERT(setsize != 0);
					error = as_iset_default_lpsize(as,
					    setaddr, setsize, rflags, rtype);
					if (error) {
						break;
					}
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
				} else if (!segvn) {
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
					segvn = 1;
				}
			} else if (segvn) {
				/* The next segment is not segvn. */
				ASSERT(setsize != 0);
				error = as_iset_default_lpsize(as,
				    setaddr, setsize, rflags, rtype);
				if (error) {
					break;
				}
				segvn = 0;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}
	}
	if (error == 0 && segvn) {
		/* The last chunk when rsize == 0. */
		ASSERT(setsize != 0);
		error = as_iset_default_lpsize(as, setaddr, setsize,
		    rflags, rtype);
	}

	if (error == IE_RETRY) {
		goto again;
	} else if (error == IE_NOMEM) {
		error = EAGAIN;
	} else if (error == ENOTSUP) {
		error = EINVAL;
	} else if (error == EAGAIN) {
		mutex_enter(&as->a_contents);
		if (!AS_ISNOUNMAPWAIT(as)) {
			if (AS_ISUNMAPWAIT(as) == 0) {
				cv_broadcast(&as->a_cv);
			}
			AS_SETUNMAPWAIT(as);
			AS_LOCK_EXIT(as);
			while (AS_ISUNMAPWAIT(as)) {
				cv_wait(&as->a_cv, &as->a_contents);
			}
			mutex_exit(&as->a_contents);
			AS_LOCK_ENTER(as, RW_WRITER);
		} else {
			/*
			 * We may have raced with
			 * segvn_reclaim()/segspt_reclaim(). In this case
			 * clean nounmapwait flag and retry since softlockcnt
			 * in this segment may be already 0. We don't drop as
			 * writer lock so our number of retries without
			 * sleeping should be very small. See segvn_reclaim()
			 * for more comments.
			 */
			AS_CLRNOUNMAPWAIT(as);
			mutex_exit(&as->a_contents);
		}
		goto again;
	}

	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
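
/*
 * Illustrative call-graph summary, not part of the original source, of the
 * default large page size path driven by memcntl() with page size 0:
 *
 *	as_set_default_lpsize()		split range by segvn type/flags
 *	  as_iset_default_lpsize()	split chunk by map_pgszcvec() alignment
 *	    as_iset1_default_lpsize()	split by existing s_szc
 *	      as_iset2_default_lpsize()	retry smaller sizes on EINVAL
 *	        as_iset3_default_lpsize()	segop_setpagesize() per segment
 */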

/*
 * Setup all of the uninitialized watched pages that we can.
 */
void
as_setwatch(struct as *as)
{
	struct watched_page *pwp;
	struct seg *seg;
	caddr_t vaddr;
	uint_t prot;
	int err, retrycnt;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
		retrycnt = 0;
	retry:
		vaddr = pwp->wp_vaddr;
		if (pwp->wp_oprot != 0 ||	/* already set up */
		    (seg = as_segat(as, vaddr)) == NULL ||
		    segop_getprot(seg, vaddr, 0, &prot) != 0)
			continue;

		pwp->wp_oprot = prot;
		if (pwp->wp_read)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (pwp->wp_write)
			prot &= ~PROT_WRITE;
		if (pwp->wp_exec)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
			err = segop_setprot(seg, vaddr, PAGESIZE, prot);
			if (err == IE_RETRY) {
				pwp->wp_oprot = 0;
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_prot = prot;
	}
}

/*
 * Clear all of the watched pages in the address space.
 */
void
as_clearwatch(struct as *as)
{
	struct watched_page *pwp;
	struct seg *seg;
	caddr_t vaddr;
	uint_t prot;
	int err, retrycnt;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
		retrycnt = 0;
	retry:
		vaddr = pwp->wp_vaddr;
		if (pwp->wp_oprot == 0 ||	/* not set up */
		    (seg = as_segat(as, vaddr)) == NULL)
			continue;

		if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
			err = segop_setprot(seg, vaddr, PAGESIZE, prot);
			if (err == IE_RETRY) {
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_oprot = 0;
		pwp->wp_prot = 0;
	}
}

/*
 * Force a new setup for all the watched pages in the range.
 */
static void
as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct watched_page *pwp;
	struct watched_page tpw;
	caddr_t eaddr = addr + size;
	caddr_t vaddr;
	struct seg *seg;
	int err, retrycnt;
	uint_t wprot;
	avl_index_t where;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
		retrycnt = 0;
		vaddr = pwp->wp_vaddr;

		wprot = prot;
		if (pwp->wp_read)
			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (pwp->wp_write)
			wprot &= ~PROT_WRITE;
		if (pwp->wp_exec)
			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
		retry:
			seg = as_segat(as, vaddr);
			if (seg == NULL) {
				panic("as_setwatchprot: no seg");
				/*NOTREACHED*/
			}
			err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
			if (err == IE_RETRY) {
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_oprot = prot;
		pwp->wp_prot = wprot;

		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
}

/*
 * Clear all of the watched pages in the range.
 */
static void
as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
{
	caddr_t eaddr = addr + size;
	struct watched_page *pwp;
	struct watched_page tpw;
	uint_t prot;
	struct seg *seg;
	int err, retrycnt;
	avl_index_t where;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

	ASSERT(AS_WRITE_HELD(as));

	while (pwp != NULL && pwp->wp_vaddr < eaddr) {

		if ((prot = pwp->wp_oprot) != 0) {
			retrycnt = 0;

			if (prot != pwp->wp_prot) {
			retry:
				seg = as_segat(as, pwp->wp_vaddr);
				if (seg == NULL)
					continue;
				err = segop_setprot(seg, pwp->wp_vaddr,
				    PAGESIZE, prot);
				if (err == IE_RETRY) {
					ASSERT(retrycnt == 0);
					retrycnt++;
					goto retry;
				}
			}
			pwp->wp_oprot = 0;
			pwp->wp_prot = 0;
		}

		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
}

/*
 * Deliver the given siginfo to every process sharing this address space.
 */
void
as_signal_proc(struct as *as, k_siginfo_t *siginfo)
{
	struct proc *p;

	mutex_enter(&pidlock);
	for (p = practive; p; p = p->p_next) {
		if (p->p_as == as) {
			mutex_enter(&p->p_lock);
			if (p->p_as == as)
				sigaddq(p, NULL, siginfo, KM_NOSLEEP);
			mutex_exit(&p->p_lock);
		}
	}
	mutex_exit(&pidlock);
}

/*
 * return memory object ID
 */
int
as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
{
	struct seg *seg;
	int sts;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, addr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}

	sts = segop_getmemid(seg, addr, memidp);

	AS_LOCK_EXIT(as);
	return (sts);
}