kernel/vm/vm_as.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
34 * All Rights Reserved
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
38 * contributors.
42 * VM - address spaces.
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/errno.h>
49 #include <sys/systm.h>
50 #include <sys/mman.h>
51 #include <sys/sysmacros.h>
52 #include <sys/cpuvar.h>
53 #include <sys/sysinfo.h>
54 #include <sys/kmem.h>
55 #include <sys/vnode.h>
56 #include <sys/vmsystm.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/vtrace.h>
62 #include <vm/hat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_vn.h>
66 #include <vm/seg_dev.h>
67 #include <vm/seg_kmem.h>
68 #include <vm/seg_map.h>
69 #include <vm/seg_spt.h>
70 #include <vm/page.h>
72 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
74 static struct kmem_cache *as_cache;
76 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
77 static void as_clearwatchprot(struct as *, caddr_t, size_t);
78 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
82  * Verifying the segment lists is very time-consuming; it may not always be
83  * desirable to define VERIFY_SEGLIST when DEBUG is set.
85 #ifdef DEBUG
86 #define VERIFY_SEGLIST
87 int do_as_verify = 0;
88 #endif
91 * Allocate a new callback data structure entry and fill in the events of
92 * interest, the address range of interest, and the callback argument.
93 * Link the entry on the as->a_callbacks list. A callback entry for the
94 * entire address space may be specified with vaddr = 0 and size = -1.
96  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
97  * the specified as, the caller must guarantee persistence of the specified as
98  * for the duration of this function (e.g. pages being locked within the as
99  * will guarantee persistence).
102 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103 caddr_t vaddr, size_t size, int sleepflag)
105 struct as_callback *current_head, *cb;
106 caddr_t saddr;
107 size_t rsize;
109 /* callback function and an event are mandatory */
110 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111 return (EINVAL);
113 /* Adding a callback after as_free has been called is not allowed */
114 if (as == &kas)
115 return (ENOMEM);
118 * vaddr = 0 and size = -1 is used to indicate that the callback range
119 * is the entire address space so no rounding is done in that case.
121 if (size != -1) {
122 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124 (size_t)saddr;
125 /* check for wraparound */
126 if (saddr + rsize < saddr)
127 return (ENOMEM);
128 } else {
129 if (vaddr != 0)
130 return (EINVAL);
131 saddr = vaddr;
132 rsize = size;
135 /* Allocate and initialize a callback entry */
136 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137 if (cb == NULL)
138 return (EAGAIN);
140 cb->ascb_func = cb_func;
141 cb->ascb_arg = arg;
142 cb->ascb_events = events;
143 cb->ascb_saddr = saddr;
144 cb->ascb_len = rsize;
146 /* Add the entry to the list */
147 mutex_enter(&as->a_contents);
148 current_head = as->a_callbacks;
149 as->a_callbacks = cb;
150 cb->ascb_next = current_head;
153  * The caller of this function may lose a race with
154  * a pertinent event - e.g. a thread does long-term memory locking,
155  * but before the callback is added another thread executes as_unmap.
156  * A broadcast here resolves that.
158 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159 AS_CLRUNMAPWAIT(as);
160 cv_broadcast(&as->a_cv);
163 mutex_exit(&as->a_contents);
164 return (0);
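
/*
 * Editor's sketch (not part of vm_as.c): a driver that keeps pages of a user
 * address space locked long-term would typically register an AS_UNMAP_EVENT
 * callback so a pending as_unmap()/as_free() can ask it to drop those locks.
 * The callback prototype is inferred from the (*ascb_func)(as, arg, events)
 * invocation in as_execute_callback(); xx_state, xs_addr, xs_len and
 * xx_release_pages() are hypothetical names.
 */
static void
xx_unmap_cb(struct as *as, void *arg, uint_t events)
{
	struct xx_state *xsp = arg;

	xx_release_pages(xsp);		/* unlock the driver's pages */

	/*
	 * Deleting our own entry clears AS_ALL_EVENT and lets
	 * as_execute_callback() stop waiting on a_cv.
	 */
	(void) as_delete_callback(as, xsp);
}

	/* at registration time, e.g. in the driver's page-locking routine: */
	if (as_add_callback(as, xx_unmap_cb, xsp, AS_UNMAP_EVENT,
	    xsp->xs_addr, xsp->xs_len, KM_SLEEP) != 0)
		return (EAGAIN);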
168 * Search the callback list for an entry which pertains to arg.
170 * This is called from within the client upon completion of the callback.
171 * RETURN VALUES:
172 * AS_CALLBACK_DELETED (callback entry found and deleted)
173 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175 * entry will be made in as_do_callbacks)
177 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 * set, it indicates that as_do_callbacks is processing this entry. The
179 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 * to unblock as_do_callbacks, in case it is blocked.
182  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
183  * the specified as, the caller must guarantee persistence of the specified as
184  * for the duration of this function (e.g. pages being locked within the as
185  * will guarantee persistence).
187 uint_t
188 as_delete_callback(struct as *as, void *arg)
190 struct as_callback **prevcb = &as->a_callbacks;
191 struct as_callback *cb;
192 uint_t rc = AS_CALLBACK_NOTFOUND;
194 mutex_enter(&as->a_contents);
195 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 if (cb->ascb_arg != arg)
197 continue;
200 * If the events indicate AS_CALLBACK_CALLED, just clear
201 * AS_ALL_EVENT in the events field and wakeup the thread
202 * that may be waiting in as_do_callbacks. as_do_callbacks
203 * will take care of removing this entry from the list. In
204 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
205 * (AS_CALLBACK_CALLED not set), just remove it from the
206 * list, return the memory and return AS_CALLBACK_DELETED.
208 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 /* leave AS_CALLBACK_CALLED */
210 cb->ascb_events &= ~AS_ALL_EVENT;
211 rc = AS_CALLBACK_DELETE_DEFERRED;
212 cv_broadcast(&as->a_cv);
213 } else {
214 *prevcb = cb->ascb_next;
215 kmem_free(cb, sizeof (struct as_callback));
216 rc = AS_CALLBACK_DELETED;
218 break;
220 mutex_exit(&as->a_contents);
221 return (rc);
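
/*
 * Editor's sketch (not part of vm_as.c): how a client is expected to act on
 * the three return values documented above.  xsp is the same 'arg' that was
 * passed to as_add_callback().
 */
	switch (as_delete_callback(as, xsp)) {
	case AS_CALLBACK_DELETED:
	case AS_CALLBACK_NOTFOUND:
		/* the entry (if any) is gone; xsp may be torn down now */
		break;
	case AS_CALLBACK_DELETE_DEFERRED:
		/*
		 * as_do_callbacks() is still using the entry and will free
		 * it; the client must defer tearing down xsp until its
		 * callback has finished running.
		 */
		break;
	}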
225 * Searches the as callback list for a matching entry.
226 * Returns a pointer to the first matching callback, or NULL if
227 * nothing is found.
228  * This function never sleeps, so it is ok to call it with locks
229  * held beyond the (required) a_contents mutex.
231 * See also comment on as_do_callbacks below.
233 static struct as_callback *
234 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235 size_t event_len)
237 struct as_callback *cb;
239 ASSERT(MUTEX_HELD(&as->a_contents));
240 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
242 * If the callback has not already been called, then
243 * check if events or address range pertains. An event_len
244 * of zero means do an unconditional callback.
246 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 (event_addr + event_len < cb->ascb_saddr) ||
249 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 continue;
252 break;
254 return (cb);
258 * Executes a given callback and removes it from the callback list for
259 * this address space.
260 * This function may sleep so the caller must drop all locks except
261 * a_contents before calling this func.
263 * See also comments on as_do_callbacks below.
265 static void
266 as_execute_callback(struct as *as, struct as_callback *cb,
267 uint_t events)
269 struct as_callback **prevcb;
270 void *cb_arg;
272 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273 cb->ascb_events |= AS_CALLBACK_CALLED;
274 mutex_exit(&as->a_contents);
275 (*cb->ascb_func)(as, cb->ascb_arg, events);
276 mutex_enter(&as->a_contents);
278 * the callback function is required to delete the callback
279 * when the callback function determines it is OK for
280 * this thread to continue. as_delete_callback will clear
281 * the AS_ALL_EVENT in the events field when it is deleted.
282 * If the callback function called as_delete_callback,
283 * events will already be cleared and there will be no blocking.
285 while ((cb->ascb_events & events) != 0) {
286 cv_wait(&as->a_cv, &as->a_contents);
289 * This entry needs to be taken off the list. Normally, the
290 * callback func itself does that, but unfortunately the list
291 * may have changed while the callback was running because the
292 * a_contents mutex was dropped and someone else other than the
293 * callback func itself could have called as_delete_callback,
294 * so we have to search to find this entry again. The entry
295 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
297 cb_arg = cb->ascb_arg;
298 prevcb = &as->a_callbacks;
299 for (cb = as->a_callbacks; cb != NULL;
300 prevcb = &cb->ascb_next, cb = *prevcb) {
301 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302 (cb_arg != cb->ascb_arg)) {
303 continue;
305 *prevcb = cb->ascb_next;
306 kmem_free(cb, sizeof (struct as_callback));
307 break;
312 * Check the callback list for a matching event and intersection of
313 * address range. If there is a match invoke the callback. Skip an entry if:
314 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315  * - the event is not of interest
316  * - the address range is not of interest
318 * An event_len of zero indicates a request for an unconditional callback
319 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The
320 * a_contents lock must be dropped before a callback, so only one callback
321 * can be done before returning. Return -1 (true) if a callback was
322 * executed and removed from the list, else return 0 (false).
324 * The logically separate parts, i.e. finding a matching callback and
325 * executing a given callback have been separated into two functions
326 * so that they can be called with different sets of locks held beyond
327 * the always-required a_contents. as_find_callback does not sleep so
328 * it is ok to call it if more locks than a_contents (i.e. the a_lock
329 * rwlock) are held. as_execute_callback on the other hand may sleep
330 * so all locks beyond a_contents must be dropped by the caller if one
331 * does not want to end comatose.
333 static int
334 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335 size_t event_len)
337 struct as_callback *cb;
339 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 as_execute_callback(as, cb, events);
341 return (-1);
343 return (0);
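
/*
 * Editor's note: the canonical calling pattern (used by as_free() below) is
 * to loop while holding only a_contents, since each call performs at most
 * one callback:
 *
 *	mutex_enter(&as->a_contents);
 *	while (as->a_callbacks &&
 *	    as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 *		;
 *	mutex_exit(&as->a_contents);
 */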
347 * Search for the segment containing addr. If a segment containing addr
348 * exists, that segment is returned. If no such segment exists, and
349 * the list spans addresses greater than addr, then the first segment
350 * whose base is greater than addr is returned; otherwise, NULL is
351 * returned unless tail is true, in which case the last element of the
352 * list is returned.
354 * a_seglast is used to cache the last found segment for repeated
355 * searches to the same addr (which happens frequently).
357 struct seg *
358 as_findseg(struct as *as, caddr_t addr, int tail)
360 struct seg *seg = as->a_seglast;
361 avl_index_t where;
363 ASSERT(AS_LOCK_HELD(as));
365 if (seg != NULL &&
366 seg->s_base <= addr &&
367 addr < seg->s_base + seg->s_size)
368 return (seg);
370 seg = avl_find(&as->a_segtree, &addr, &where);
371 if (seg != NULL)
372 return (as->a_seglast = seg);
374 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 if (seg == NULL && tail)
376 seg = avl_last(&as->a_segtree);
377 return (as->a_seglast = seg);
380 #ifdef VERIFY_SEGLIST
382 * verify that the linked list is coherent
384 static void
385 as_verify(struct as *as)
387 struct seg *seg, *seglast, *p, *n;
388 uint_t nsegs = 0;
390 if (do_as_verify == 0)
391 return;
393 seglast = as->a_seglast;
395 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396 ASSERT(seg->s_as == as);
397 p = AS_SEGPREV(as, seg);
398 n = AS_SEGNEXT(as, seg);
399 ASSERT(p == NULL || p->s_as == as);
400 ASSERT(p == NULL || p->s_base < seg->s_base);
401 ASSERT(n == NULL || n->s_base > seg->s_base);
402 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403 if (seg == seglast)
404 seglast = NULL;
405 nsegs++;
407 ASSERT(seglast == NULL);
408 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
410 #endif /* VERIFY_SEGLIST */
413 * Add a new segment to the address space. The avl_find()
414  * may be expensive, so we attempt to use the last segment accessed
415  * in as_gap() as an insertion point.
418 as_addseg(struct as *as, struct seg *newseg)
420 struct seg *seg;
421 caddr_t addr;
422 caddr_t eaddr;
423 avl_index_t where;
425 ASSERT(AS_WRITE_HELD(as));
427 as->a_updatedir = 1; /* inform /proc */
428 gethrestime(&as->a_updatetime);
430 if (as->a_lastgaphl != NULL) {
431 struct seg *hseg = NULL;
432 struct seg *lseg = NULL;
434 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 hseg = as->a_lastgaphl;
436 lseg = AVL_PREV(&as->a_segtree, hseg);
437 } else {
438 lseg = as->a_lastgaphl;
439 hseg = AVL_NEXT(&as->a_segtree, lseg);
442 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 hseg->s_base > newseg->s_base) {
444 avl_insert_here(&as->a_segtree, newseg, lseg,
445 AVL_AFTER);
446 as->a_lastgaphl = NULL;
447 as->a_seglast = newseg;
448 return (0);
450 as->a_lastgaphl = NULL;
453 addr = newseg->s_base;
454 eaddr = addr + newseg->s_size;
455 again:
457 seg = avl_find(&as->a_segtree, &addr, &where);
459 if (seg == NULL)
460 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
462 if (seg == NULL)
463 seg = avl_last(&as->a_segtree);
465 if (seg != NULL) {
466 caddr_t base = seg->s_base;
469 * If top of seg is below the requested address, then
470 * the insertion point is at the end of the linked list,
471 * and seg points to the tail of the list. Otherwise,
472 * the insertion point is immediately before seg.
474 if (base + seg->s_size > addr) {
475 if (addr >= base || eaddr > base) {
476 #ifdef __sparc
477 extern const struct seg_ops segnf_ops;
480 * no-fault segs must disappear if overlaid.
481 * XXX need new segment type so
482 * we don't have to check s_ops
484 if (seg->s_ops == &segnf_ops) {
485 seg_unmap(seg);
486 goto again;
488 #endif
489 return (-1); /* overlapping segment */
493 as->a_seglast = newseg;
494 avl_insert(&as->a_segtree, newseg, where);
496 #ifdef VERIFY_SEGLIST
497 as_verify(as);
498 #endif
499 return (0);
502 struct seg *
503 as_removeseg(struct as *as, struct seg *seg)
505 avl_tree_t *t;
507 ASSERT(AS_WRITE_HELD(as));
509 as->a_updatedir = 1; /* inform /proc */
510 gethrestime(&as->a_updatetime);
512 if (seg == NULL)
513 return (NULL);
515 t = &as->a_segtree;
516 if (as->a_seglast == seg)
517 as->a_seglast = NULL;
518 as->a_lastgaphl = NULL;
521 * if this segment is at an address higher than
522 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
524 if (as->a_lastgap &&
525 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526 as->a_lastgap = AVL_NEXT(t, seg);
529 * remove the segment from the seg tree
531 avl_remove(t, seg);
533 #ifdef VERIFY_SEGLIST
534 as_verify(as);
535 #endif
536 return (seg);
540 * Find a segment containing addr.
542 struct seg *
543 as_segat(struct as *as, caddr_t addr)
545 struct seg *seg = as->a_seglast;
547 ASSERT(AS_LOCK_HELD(as));
549 if (seg != NULL && seg->s_base <= addr &&
550 addr < seg->s_base + seg->s_size)
551 return (seg);
553 seg = avl_find(&as->a_segtree, &addr, NULL);
554 return (seg);
558 * Serialize all searches for holes in an address space to
559 * prevent two or more threads from allocating the same virtual
560 * address range. The address space must not be "read/write"
561 * locked by the caller since we may block.
563 void
564 as_rangelock(struct as *as)
566 mutex_enter(&as->a_contents);
567 while (AS_ISCLAIMGAP(as))
568 cv_wait(&as->a_cv, &as->a_contents);
569 AS_SETCLAIMGAP(as);
570 mutex_exit(&as->a_contents);
574 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
576 void
577 as_rangeunlock(struct as *as)
579 mutex_enter(&as->a_contents);
580 AS_CLRCLAIMGAP(as);
581 cv_signal(&as->a_cv);
582 mutex_exit(&as->a_contents);
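
/*
 * Editor's sketch (not from this file): the mmap()/shmat()-style pattern the
 * two routines above serialize.  Hole selection and the subsequent as_map()
 * happen under the range lock so two threads cannot claim the same virtual
 * range.  "size" is the mapping length the caller wants; segvn_create and
 * crargs stand in for whatever segment driver and arguments are really used.
 */
	caddr_t base = (caddr_t)PAGESIZE;	/* lowest acceptable address */
	size_t len = (size_t)(as->a_userlimit - base);
	int error;

	as_rangelock(as);
	if (as_gap(as, size, &base, &len, AH_LO, NULL) == 0)
		error = as_map(as, base, size, segvn_create, &crargs);
	else
		error = ENOMEM;
	as_rangeunlock(as);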
586  * compare segments (or just an address) by segment address range
588 static int
589 as_segcompar(const void *x, const void *y)
591 struct seg *a = (struct seg *)x;
592 struct seg *b = (struct seg *)y;
594 if (a->s_base < b->s_base)
595 return (-1);
596 if (a->s_base >= b->s_base + b->s_size)
597 return (1);
598 return (0);
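
/*
 * Editor's note: a bare address can be used as the AVL search key (see the
 * avl_find(&as->a_segtree, &addr, ...) calls elsewhere in this file) because
 * the comparator only dereferences s_base of its first argument; this relies
 * on s_base being the first member of struct seg, so a pointer to a caddr_t
 * aliases it.  An address that falls inside [b->s_base, b->s_base + b->s_size)
 * compares equal to segment b.
 */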
602 void
603 as_avlinit(struct as *as)
605 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
606 offsetof(struct seg, s_tree));
607 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
608 offsetof(struct watched_page, wp_link));
611 /*ARGSUSED*/
612 static int
613 as_constructor(void *buf, void *cdrarg, int kmflags)
615 struct as *as = buf;
617 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
618 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
619 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
620 as_avlinit(as);
621 return (0);
624 /*ARGSUSED1*/
625 static void
626 as_destructor(void *buf, void *cdrarg)
628 struct as *as = buf;
630 avl_destroy(&as->a_segtree);
631 mutex_destroy(&as->a_contents);
632 cv_destroy(&as->a_cv);
633 rw_destroy(&as->a_lock);
636 void
637 as_init(void)
639 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
640 as_constructor, as_destructor, NULL, NULL, NULL, 0);
644 * Allocate and initialize an address space data structure.
645 * We call hat_alloc to allow any machine dependent
646 * information in the hat structure to be initialized.
648 struct as *
649 as_alloc(void)
651 struct as *as;
653 as = kmem_cache_alloc(as_cache, KM_SLEEP);
655 as->a_flags = 0;
656 as->a_vbits = 0;
657 as->a_hrm = NULL;
658 as->a_seglast = NULL;
659 as->a_size = 0;
660 as->a_resvsize = 0;
661 as->a_updatedir = 0;
662 gethrestime(&as->a_updatetime);
663 as->a_objectdir = NULL;
664 as->a_sizedir = 0;
665 as->a_userlimit = (caddr_t)USERLIMIT;
666 as->a_lastgap = NULL;
667 as->a_lastgaphl = NULL;
668 as->a_callbacks = NULL;
669 as->a_proc = NULL;
671 AS_LOCK_ENTER(as, RW_WRITER);
672 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
673 AS_LOCK_EXIT(as);
675 return (as);
679 * Free an address space data structure.
680 * Need to free the hat first and then
681 * all the segments on this as and finally
682 * the space for the as struct itself.
684 void
685 as_free(struct as *as)
687 struct hat *hat = as->a_hat;
688 struct seg *seg, *next;
689 boolean_t free_started = B_FALSE;
691 top:
693 * Invoke ALL callbacks. as_do_callbacks will do one callback
694 * per call, and not return (-1) until the callback has completed.
695 * When as_do_callbacks returns zero, all callbacks have completed.
697 mutex_enter(&as->a_contents);
698 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
701 mutex_exit(&as->a_contents);
702 AS_LOCK_ENTER(as, RW_WRITER);
704 if (!free_started) {
705 free_started = B_TRUE;
706 hat_free_start(hat);
708 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
709 int err;
711 next = AS_SEGNEXT(as, seg);
712 retry:
713 err = segop_unmap(seg, seg->s_base, seg->s_size);
714 if (err == EAGAIN) {
715 mutex_enter(&as->a_contents);
716 if (as->a_callbacks) {
717 AS_LOCK_EXIT(as);
718 } else if (!AS_ISNOUNMAPWAIT(as)) {
720 * Memory is currently locked. Wait for a
721 * cv_signal that it has been unlocked, then
722 * try the operation again.
724 if (AS_ISUNMAPWAIT(as) == 0)
725 cv_broadcast(&as->a_cv);
726 AS_SETUNMAPWAIT(as);
727 AS_LOCK_EXIT(as);
728 while (AS_ISUNMAPWAIT(as))
729 cv_wait(&as->a_cv, &as->a_contents);
730 } else {
732 * We may have raced with
733 * segvn_reclaim()/segspt_reclaim(). In this
734 * case clean nounmapwait flag and retry since
735 * softlockcnt in this segment may be already
736 * 0. We don't drop as writer lock so our
737 * number of retries without sleeping should
738 * be very small. See segvn_reclaim() for
739 * more comments.
741 AS_CLRNOUNMAPWAIT(as);
742 mutex_exit(&as->a_contents);
743 goto retry;
745 mutex_exit(&as->a_contents);
746 goto top;
747 } else {
749 * We do not expect any other error return at this
750 * time. This is similar to an ASSERT in seg_unmap()
752 ASSERT(err == 0);
755 hat_free_end(hat);
756 AS_LOCK_EXIT(as);
758 /* /proc stuff */
759 ASSERT(avl_numnodes(&as->a_wpage) == 0);
760 if (as->a_objectdir) {
761 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
762 as->a_objectdir = NULL;
763 as->a_sizedir = 0;
767 * Free the struct as back to kmem. Assert it has no segments.
769 ASSERT(avl_numnodes(&as->a_segtree) == 0);
770 kmem_cache_free(as_cache, as);
774 as_dup(struct as *as, struct proc *forkedproc)
776 struct as *newas;
777 struct seg *seg, *newseg;
778 size_t purgesize = 0;
779 int error;
781 AS_LOCK_ENTER(as, RW_WRITER);
782 as_clearwatch(as);
783 newas = as_alloc();
784 newas->a_userlimit = as->a_userlimit;
785 newas->a_proc = forkedproc;
787 AS_LOCK_ENTER(newas, RW_WRITER);
789 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
791 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
793 if (seg->s_flags & S_PURGE) {
794 purgesize += seg->s_size;
795 continue;
798 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
799 if (newseg == NULL) {
800 AS_LOCK_EXIT(newas);
801 as_setwatch(as);
802 AS_LOCK_EXIT(as);
803 as_free(newas);
804 return (-1);
806 if ((error = segop_dup(seg, newseg)) != 0) {
808 * We call seg_free() on the new seg
809 * because the segment is not set up
810 * completely; i.e. it has no ops.
812 as_setwatch(as);
813 AS_LOCK_EXIT(as);
814 seg_free(newseg);
815 AS_LOCK_EXIT(newas);
816 as_free(newas);
817 return (error);
819 newas->a_size += seg->s_size;
821 newas->a_resvsize = as->a_resvsize - purgesize;
823 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
825 AS_LOCK_EXIT(newas);
827 as_setwatch(as);
828 AS_LOCK_EXIT(as);
829 if (error != 0) {
830 as_free(newas);
831 return (error);
833 forkedproc->p_as = newas;
834 return (0);
838 * Handle a ``fault'' at addr for size bytes.
840 faultcode_t
841 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
842 enum fault_type type, enum seg_rw rw)
844 struct seg *seg;
845 caddr_t raddr; /* rounded down addr */
846 size_t rsize; /* rounded up size */
847 size_t ssize;
848 faultcode_t res = 0;
849 caddr_t addrsav;
850 struct seg *segsav;
851 int as_lock_held;
852 klwp_t *lwp = ttolwp(curthread);
856 retry:
858 * Indicate that the lwp is not to be stopped while waiting for a
859 * pagefault. This is to avoid deadlock while debugging a process
860 * via /proc over NFS (in particular).
862 if (lwp != NULL)
863 lwp->lwp_nostop++;
866  * The same length must be used when we softlock and softunlock. We
867 * don't support softunlocking lengths less than the original length
868 * when there is largepage support. See seg_dev.c for more
869 * comments.
871 switch (type) {
873 case F_SOFTLOCK:
874 CPU_STATS_ADD_K(vm, softlock, 1);
875 break;
877 case F_SOFTUNLOCK:
878 break;
880 case F_PROT:
881 CPU_STATS_ADD_K(vm, prot_fault, 1);
882 break;
884 case F_INVAL:
885 CPU_STATS_ENTER_K();
886 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
887 if (as == &kas)
888 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
889 CPU_STATS_EXIT_K();
890 break;
893 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
894 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
895 (size_t)raddr;
898 * XXX -- Don't grab the as lock for segkmap. We should grab it for
899 * correctness, but then we could be stuck holding this lock for
900 * a LONG time if the fault needs to be resolved on a slow
901 * filesystem, and then no-one will be able to exec new commands,
902 * as exec'ing requires the write lock on the as.
904 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
905 raddr + size < segkmap->s_base + segkmap->s_size) {
906 seg = segkmap;
907 as_lock_held = 0;
908 } else {
909 AS_LOCK_ENTER(as, RW_READER);
911 seg = as_segat(as, raddr);
912 if (seg == NULL) {
913 AS_LOCK_EXIT(as);
914 if (lwp != NULL)
915 lwp->lwp_nostop--;
916 return (FC_NOMAP);
919 as_lock_held = 1;
922 addrsav = raddr;
923 segsav = seg;
925 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
926 if (raddr >= seg->s_base + seg->s_size) {
927 seg = AS_SEGNEXT(as, seg);
928 if (seg == NULL || raddr != seg->s_base) {
929 res = FC_NOMAP;
930 break;
933 if (raddr + rsize > seg->s_base + seg->s_size)
934 ssize = seg->s_base + seg->s_size - raddr;
935 else
936 ssize = rsize;
938 res = segop_fault(hat, seg, raddr, ssize, type, rw);
939 if (res != 0)
940 break;
944 * If we were SOFTLOCKing and encountered a failure,
945 * we must SOFTUNLOCK the range we already did. (Maybe we
946 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
947 * right here...)
949 if (res != 0 && type == F_SOFTLOCK) {
950 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
951 if (addrsav >= seg->s_base + seg->s_size)
952 seg = AS_SEGNEXT(as, seg);
953 ASSERT(seg != NULL);
955 * Now call the fault routine again to perform the
956 * unlock using S_OTHER instead of the rw variable
957 * since we never got a chance to touch the pages.
959 if (raddr > seg->s_base + seg->s_size)
960 ssize = seg->s_base + seg->s_size - addrsav;
961 else
962 ssize = raddr - addrsav;
963 (void) segop_fault(hat, seg, addrsav, ssize,
964 F_SOFTUNLOCK, S_OTHER);
967 if (as_lock_held)
968 AS_LOCK_EXIT(as);
969 if (lwp != NULL)
970 lwp->lwp_nostop--;
973  * If the lower levels returned EDEADLK for a fault,
974  * it means that we should retry the fault. Let's also wait
975  * a bit to let the deadlock-causing condition clear.
976 * This is part of a gross hack to work around a design flaw
977 * in the ufs/sds logging code and should go away when the
978 * logging code is re-designed to fix the problem. See bug
979 * 4125102 for details of the problem.
981 if (FC_ERRNO(res) == EDEADLK) {
982 delay(deadlk_wait);
983 res = 0;
984 goto retry;
986 return (res);
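
/*
 * Editor's sketch (not from this file): a typical F_SOFTLOCK user locks a
 * user range down for device/kernel access and later releases exactly the
 * same range, as required above.  S_WRITE stands for whichever seg_rw access
 * mode the caller actually needs; EFAULT is just one possible error mapping.
 */
	if (as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_WRITE) != 0)
		return (EFAULT);
	/* ... access the pages in [addr, addr + len) ... */
	(void) as_fault(as->a_hat, as, addr, len, F_SOFTUNLOCK, S_WRITE);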
992 * Asynchronous ``fault'' at addr for size bytes.
994 faultcode_t
995 as_faulta(struct as *as, caddr_t addr, size_t size)
997 struct seg *seg;
998 caddr_t raddr; /* rounded down addr */
999 size_t rsize; /* rounded up size */
1000 faultcode_t res = 0;
1001 klwp_t *lwp = ttolwp(curthread);
1003 retry:
1005 * Indicate that the lwp is not to be stopped while waiting
1006 * for a pagefault. This is to avoid deadlock while debugging
1007 * a process via /proc over NFS (in particular).
1009 if (lwp != NULL)
1010 lwp->lwp_nostop++;
1012 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1013 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1014 (size_t)raddr;
1016 AS_LOCK_ENTER(as, RW_READER);
1017 seg = as_segat(as, raddr);
1018 if (seg == NULL) {
1019 AS_LOCK_EXIT(as);
1020 if (lwp != NULL)
1021 lwp->lwp_nostop--;
1022 return (FC_NOMAP);
1025 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1026 if (raddr >= seg->s_base + seg->s_size) {
1027 seg = AS_SEGNEXT(as, seg);
1028 if (seg == NULL || raddr != seg->s_base) {
1029 res = FC_NOMAP;
1030 break;
1033 res = segop_faulta(seg, raddr);
1034 if (res != 0)
1035 break;
1037 AS_LOCK_EXIT(as);
1038 if (lwp != NULL)
1039 lwp->lwp_nostop--;
1041  * If the lower levels returned EDEADLK for a fault,
1042  * it means that we should retry the fault. Let's also wait
1043  * a bit to let the deadlock-causing condition clear.
1044 * This is part of a gross hack to work around a design flaw
1045 * in the ufs/sds logging code and should go away when the
1046 * logging code is re-designed to fix the problem. See bug
1047 * 4125102 for details of the problem.
1049 if (FC_ERRNO(res) == EDEADLK) {
1050 delay(deadlk_wait);
1051 res = 0;
1052 goto retry;
1054 return (res);
1058 * Set the virtual mapping for the interval from [addr : addr + size)
1059 * in address space `as' to have the specified protection.
1060 * It is ok for the range to cross over several segments,
1061 * as long as they are contiguous.
1064 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1066 struct seg *seg;
1067 struct as_callback *cb;
1068 size_t ssize;
1069 caddr_t raddr; /* rounded down addr */
1070 size_t rsize; /* rounded up size */
1071 int error = 0, writer = 0;
1072 caddr_t saveraddr;
1073 size_t saversize;
1075 setprot_top:
1076 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1077 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1078 (size_t)raddr;
1080 if (raddr + rsize < raddr) /* check for wraparound */
1081 return (ENOMEM);
1083 saveraddr = raddr;
1084 saversize = rsize;
1087 * Normally we only lock the as as a reader. But
1088 * if due to setprot the segment driver needs to split
1089 * a segment it will return IE_RETRY. Therefore we re-acquire
1090 * the as lock as a writer so the segment driver can change
1091 * the seg list. Also the segment driver will return IE_RETRY
1092 * after it has changed the segment list so we therefore keep
1093  * locking as a writer. Since these operations should be rare, we
1094  * want to only lock as a writer when necessary.
1096 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1097 AS_LOCK_ENTER(as, RW_WRITER);
1098 } else {
1099 AS_LOCK_ENTER(as, RW_READER);
1102 as_clearwatchprot(as, raddr, rsize);
1103 seg = as_segat(as, raddr);
1104 if (seg == NULL) {
1105 as_setwatch(as);
1106 AS_LOCK_EXIT(as);
1107 return (ENOMEM);
1110 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1111 if (raddr >= seg->s_base + seg->s_size) {
1112 seg = AS_SEGNEXT(as, seg);
1113 if (seg == NULL || raddr != seg->s_base) {
1114 error = ENOMEM;
1115 break;
1118 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1119 ssize = seg->s_base + seg->s_size - raddr;
1120 else
1121 ssize = rsize;
1122 retry:
1123 error = segop_setprot(seg, raddr, ssize, prot);
1125 if (error == IE_NOMEM) {
1126 error = EAGAIN;
1127 break;
1130 if (error == IE_RETRY) {
1131 AS_LOCK_EXIT(as);
1132 writer = 1;
1133 goto setprot_top;
1136 if (error == EAGAIN) {
1138 * Make sure we have a_lock as writer.
1140 if (writer == 0) {
1141 AS_LOCK_EXIT(as);
1142 writer = 1;
1143 goto setprot_top;
1147 * Memory is currently locked. It must be unlocked
1148 * before this operation can succeed through a retry.
1149 * The possible reasons for locked memory and
1150 * corresponding strategies for unlocking are:
1151 * (1) Normal I/O
1152 * wait for a signal that the I/O operation
1153 * has completed and the memory is unlocked.
1154 * (2) Asynchronous I/O
1155 * The aio subsystem does not unlock pages when
1156 * the I/O is completed. Those pages are unlocked
1157 * when the application calls aiowait/aioerror.
1158 * So, to prevent blocking forever, cv_broadcast()
1159 * is done to wake up aio_cleanup_thread.
1160 * Subsequently, segvn_reclaim will be called, and
1161 * that will do AS_CLRUNMAPWAIT() and wake us up.
1162 * (3) Long term page locking:
1163 * Drivers intending to have pages locked for a
1164 * period considerably longer than for normal I/O
1165 * (essentially forever) may have registered for a
1166 * callback so they may unlock these pages on
1167 * request. This is needed to allow this operation
1168 * to succeed. Each entry on the callback list is
1169 * examined. If the event or address range pertains
1170 * the callback is invoked (unless it already is in
1171 * progress). The a_contents lock must be dropped
1172 * before the callback, so only one callback can
1173 * be done at a time. Go to the top and do more
1174 * until zero is returned. If zero is returned,
1175 * either there were no callbacks for this event
1176 * or they were already in progress.
1178 mutex_enter(&as->a_contents);
1179 if (as->a_callbacks &&
1180 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1181 seg->s_base, seg->s_size))) {
1182 AS_LOCK_EXIT(as);
1183 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1184 } else if (!AS_ISNOUNMAPWAIT(as)) {
1185 if (AS_ISUNMAPWAIT(as) == 0)
1186 cv_broadcast(&as->a_cv);
1187 AS_SETUNMAPWAIT(as);
1188 AS_LOCK_EXIT(as);
1189 while (AS_ISUNMAPWAIT(as))
1190 cv_wait(&as->a_cv, &as->a_contents);
1191 } else {
1193 * We may have raced with
1194 * segvn_reclaim()/segspt_reclaim(). In this
1195 * case clean nounmapwait flag and retry since
1196 * softlockcnt in this segment may be already
1197 * 0. We don't drop as writer lock so our
1198 * number of retries without sleeping should
1199 * be very small. See segvn_reclaim() for
1200 * more comments.
1202 AS_CLRNOUNMAPWAIT(as);
1203 mutex_exit(&as->a_contents);
1204 goto retry;
1206 mutex_exit(&as->a_contents);
1207 goto setprot_top;
1208 } else if (error != 0)
1209 break;
1211 if (error != 0) {
1212 as_setwatch(as);
1213 } else {
1214 as_setwatchprot(as, saveraddr, saversize, prot);
1216 AS_LOCK_EXIT(as);
1217 return (error);
1221 * Check to make sure that the interval [addr, addr + size)
1222 * in address space `as' has at least the specified protection.
1223 * It is ok for the range to cross over several segments, as long
1224 * as they are contiguous.
1227 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1229 struct seg *seg;
1230 size_t ssize;
1231 caddr_t raddr; /* rounded down addr */
1232 size_t rsize; /* rounded up size */
1233 int error = 0;
1235 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1236 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1237 (size_t)raddr;
1239 if (raddr + rsize < raddr) /* check for wraparound */
1240 return (ENOMEM);
1243 * This is ugly as sin...
1244 * Normally, we only acquire the address space readers lock.
1245 * However, if the address space has watchpoints present,
1246 * we must acquire the writer lock on the address space for
1247 * the benefit of as_clearwatchprot() and as_setwatchprot().
1249 if (avl_numnodes(&as->a_wpage) != 0)
1250 AS_LOCK_ENTER(as, RW_WRITER);
1251 else
1252 AS_LOCK_ENTER(as, RW_READER);
1253 as_clearwatchprot(as, raddr, rsize);
1254 seg = as_segat(as, raddr);
1255 if (seg == NULL) {
1256 as_setwatch(as);
1257 AS_LOCK_EXIT(as);
1258 return (ENOMEM);
1261 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1262 if (raddr >= seg->s_base + seg->s_size) {
1263 seg = AS_SEGNEXT(as, seg);
1264 if (seg == NULL || raddr != seg->s_base) {
1265 error = ENOMEM;
1266 break;
1269 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1270 ssize = seg->s_base + seg->s_size - raddr;
1271 else
1272 ssize = rsize;
1274 error = segop_checkprot(seg, raddr, ssize, prot);
1275 if (error != 0)
1276 break;
1278 as_setwatch(as);
1279 AS_LOCK_EXIT(as);
1280 return (error);
1284 as_unmap(struct as *as, caddr_t addr, size_t size)
1286 struct seg *seg, *seg_next;
1287 struct as_callback *cb;
1288 caddr_t raddr, eaddr;
1289 size_t ssize, rsize = 0;
1290 int err;
1292 top:
1293 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1294 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1295 (uintptr_t)PAGEMASK);
1297 AS_LOCK_ENTER(as, RW_WRITER);
1299 as->a_updatedir = 1; /* inform /proc */
1300 gethrestime(&as->a_updatetime);
1303 * Use as_findseg to find the first segment in the range, then
1304 * step through the segments in order, following s_next.
1306 as_clearwatchprot(as, raddr, eaddr - raddr);
1308 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1309 if (eaddr <= seg->s_base)
1310 break; /* eaddr was in a gap; all done */
1312 /* this is implied by the test above */
1313 ASSERT(raddr < eaddr);
1315 if (raddr < seg->s_base)
1316 raddr = seg->s_base; /* raddr was in a gap */
1318 if (eaddr > (seg->s_base + seg->s_size))
1319 ssize = seg->s_base + seg->s_size - raddr;
1320 else
1321 ssize = eaddr - raddr;
1324 * Save next segment pointer since seg can be
1325 * destroyed during the segment unmap operation.
1327 seg_next = AS_SEGNEXT(as, seg);
1330 * We didn't count /dev/null mappings, so ignore them here.
1331 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1332 * we have to do this check here while we have seg.)
1334 rsize = 0;
1335 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1336 !SEG_IS_PARTIAL_RESV(seg))
1337 rsize = ssize;
1339 retry:
1340 err = segop_unmap(seg, raddr, ssize);
1341 if (err == EAGAIN) {
1343 * Memory is currently locked. It must be unlocked
1344 * before this operation can succeed through a retry.
1345 * The possible reasons for locked memory and
1346 * corresponding strategies for unlocking are:
1347 * (1) Normal I/O
1348 * wait for a signal that the I/O operation
1349 * has completed and the memory is unlocked.
1350 * (2) Asynchronous I/O
1351 * The aio subsystem does not unlock pages when
1352 * the I/O is completed. Those pages are unlocked
1353 * when the application calls aiowait/aioerror.
1354 * So, to prevent blocking forever, cv_broadcast()
1355 * is done to wake up aio_cleanup_thread.
1356 * Subsequently, segvn_reclaim will be called, and
1357 * that will do AS_CLRUNMAPWAIT() and wake us up.
1358 * (3) Long term page locking:
1359 * Drivers intending to have pages locked for a
1360 * period considerably longer than for normal I/O
1361 * (essentially forever) may have registered for a
1362 * callback so they may unlock these pages on
1363 * request. This is needed to allow this operation
1364 * to succeed. Each entry on the callback list is
1365 * examined. If the event or address range pertains
1366 * the callback is invoked (unless it already is in
1367 * progress). The a_contents lock must be dropped
1368 * before the callback, so only one callback can
1369 * be done at a time. Go to the top and do more
1370 * until zero is returned. If zero is returned,
1371 * either there were no callbacks for this event
1372 * or they were already in progress.
1374 mutex_enter(&as->a_contents);
1375 if (as->a_callbacks &&
1376 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1377 seg->s_base, seg->s_size))) {
1378 AS_LOCK_EXIT(as);
1379 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1380 } else if (!AS_ISNOUNMAPWAIT(as)) {
1381 if (AS_ISUNMAPWAIT(as) == 0)
1382 cv_broadcast(&as->a_cv);
1383 AS_SETUNMAPWAIT(as);
1384 AS_LOCK_EXIT(as);
1385 while (AS_ISUNMAPWAIT(as))
1386 cv_wait(&as->a_cv, &as->a_contents);
1387 } else {
1389 * We may have raced with
1390 * segvn_reclaim()/segspt_reclaim(). In this
1391 * case clean nounmapwait flag and retry since
1392 * softlockcnt in this segment may be already
1393 * 0. We don't drop as writer lock so our
1394 * number of retries without sleeping should
1395 * be very small. See segvn_reclaim() for
1396 * more comments.
1398 AS_CLRNOUNMAPWAIT(as);
1399 mutex_exit(&as->a_contents);
1400 goto retry;
1402 mutex_exit(&as->a_contents);
1403 goto top;
1404 } else if (err == IE_RETRY) {
1405 AS_LOCK_EXIT(as);
1406 goto top;
1407 } else if (err) {
1408 as_setwatch(as);
1409 AS_LOCK_EXIT(as);
1410 return (-1);
1413 as->a_size -= ssize;
1414 if (rsize)
1415 as->a_resvsize -= rsize;
1416 raddr += ssize;
1418 AS_LOCK_EXIT(as);
1419 return (0);
1422 static int
1423 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1424 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1426 uint_t szc;
1427 uint_t nszc;
1428 int error;
1429 caddr_t a;
1430 caddr_t eaddr;
1431 size_t segsize;
1432 struct seg *seg;
1433 size_t pgsz;
1434 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1435 uint_t save_szcvec;
1437 ASSERT(AS_WRITE_HELD(as));
1438 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1439 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1440 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1441 if (!do_off) {
1442 vn_a->offset = 0;
1445 if (szcvec <= 1) {
1446 seg = seg_alloc(as, addr, size);
1447 if (seg == NULL) {
1448 return (ENOMEM);
1450 vn_a->szc = 0;
1451 error = (*crfp)(seg, vn_a);
1452 if (error != 0) {
1453 seg_free(seg);
1454 } else {
1455 as->a_size += size;
1456 as->a_resvsize += size;
1458 return (error);
1461 eaddr = addr + size;
1462 save_szcvec = szcvec;
1463 szcvec >>= 1;
1464 szc = 0;
1465 nszc = 0;
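	/*
	 * Editor's note: szcvec, as produced by map_pgszcvec() and consumed
	 * below, is a bit vector of usable page size codes: bit N set means
	 * page size code N (i.e. page_get_pagesize(N)) may be used, with
	 * bit 0 being the base page size.  The loop below walks the set bits
	 * from small to large, carving leading sub-segments so that each
	 * larger page size starts on a boundary aligned to that size.
	 */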
1466 while (szcvec) {
1467 if ((szcvec & 0x1) == 0) {
1468 nszc++;
1469 szcvec >>= 1;
1470 continue;
1472 nszc++;
1473 pgsz = page_get_pagesize(nszc);
1474 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1475 if (a != addr) {
1476 ASSERT(a < eaddr);
1477 segsize = a - addr;
1478 seg = seg_alloc(as, addr, segsize);
1479 if (seg == NULL) {
1480 return (ENOMEM);
1482 vn_a->szc = szc;
1483 error = (*crfp)(seg, vn_a);
1484 if (error != 0) {
1485 seg_free(seg);
1486 return (error);
1488 as->a_size += segsize;
1489 as->a_resvsize += segsize;
1490 *segcreated = 1;
1491 if (do_off) {
1492 vn_a->offset += segsize;
1494 addr = a;
1496 szc = nszc;
1497 szcvec >>= 1;
1500 ASSERT(addr < eaddr);
1501 szcvec = save_szcvec | 1; /* add 8K pages */
1502 while (szcvec) {
1503 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1504 ASSERT(a >= addr);
1505 if (a != addr) {
1506 segsize = a - addr;
1507 seg = seg_alloc(as, addr, segsize);
1508 if (seg == NULL) {
1509 return (ENOMEM);
1511 vn_a->szc = szc;
1512 error = (*crfp)(seg, vn_a);
1513 if (error != 0) {
1514 seg_free(seg);
1515 return (error);
1517 as->a_size += segsize;
1518 as->a_resvsize += segsize;
1519 *segcreated = 1;
1520 if (do_off) {
1521 vn_a->offset += segsize;
1523 addr = a;
1525 szcvec &= ~(1 << szc);
1526 if (szcvec) {
1527 szc = highbit(szcvec) - 1;
1528 pgsz = page_get_pagesize(szc);
1531 ASSERT(addr == eaddr);
1533 return (0);
1536 static int
1537 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1538 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1540 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1541 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1542 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1543 type, 0);
1544 int error;
1545 struct seg *seg;
1546 struct vattr va;
1547 uoff_t eoff;
1548 size_t save_size = 0;
1549 extern size_t textrepl_size_thresh;
1551 ASSERT(AS_WRITE_HELD(as));
1552 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1553 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1554 ASSERT(vn_a->vp != NULL);
1555 ASSERT(vn_a->amp == NULL);
1557 again:
1558 if (szcvec <= 1) {
1559 seg = seg_alloc(as, addr, size);
1560 if (seg == NULL) {
1561 return (ENOMEM);
1563 vn_a->szc = 0;
1564 error = (*crfp)(seg, vn_a);
1565 if (error != 0) {
1566 seg_free(seg);
1567 } else {
1568 as->a_size += size;
1569 as->a_resvsize += size;
1571 return (error);
1574 va.va_mask = AT_SIZE;
1575 if (fop_getattr(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1576 szcvec = 0;
1577 goto again;
1579 eoff = vn_a->offset & PAGEMASK;
1580 if (eoff >= va.va_size) {
1581 szcvec = 0;
1582 goto again;
1584 eoff += size;
1585 if (btopr(va.va_size) < btopr(eoff)) {
1586 save_size = size;
1587 size = va.va_size - (vn_a->offset & PAGEMASK);
1588 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1589 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1590 type, 0);
1591 if (szcvec <= 1) {
1592 size = save_size;
1593 goto again;
1597 if (size > textrepl_size_thresh) {
1598 vn_a->flags |= _MAP_TEXTREPL;
1600 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1601 segcreated);
1602 if (error != 0) {
1603 return (error);
1605 if (save_size) {
1606 addr += size;
1607 size = save_size - size;
1608 szcvec = 0;
1609 goto again;
1611 return (0);
1615 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1616  * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1618 static int
1619 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1620 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1622 uint_t szcvec;
1623 uchar_t type;
1625 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1626 if (vn_a->type == MAP_SHARED) {
1627 type = MAPPGSZC_SHM;
1628 } else if (vn_a->type == MAP_PRIVATE) {
1629 if (vn_a->szc == AS_MAP_HEAP) {
1630 type = MAPPGSZC_HEAP;
1631 } else if (vn_a->szc == AS_MAP_STACK) {
1632 type = MAPPGSZC_STACK;
1633 } else {
1634 type = MAPPGSZC_PRIVM;
1637 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1638 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1639 (vn_a->flags & MAP_TEXT), type, 0);
1640 ASSERT(AS_WRITE_HELD(as));
1641 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1642 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1643 ASSERT(vn_a->vp == NULL);
1645 return (as_map_segvn_segs(as, addr, size, szcvec,
1646 crfp, vn_a, segcreated));
1650 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1652 AS_LOCK_ENTER(as, RW_WRITER);
1653 return (as_map_locked(as, addr, size, crfp, argsp));
1657 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1658 void *argsp)
1660 struct seg *seg = NULL;
1661 caddr_t raddr; /* rounded down addr */
1662 size_t rsize; /* rounded up size */
1663 int error;
1664 int unmap = 0;
1666 * The use of a_proc is preferred to handle the case where curproc is
1667 * a door_call server and is allocating memory in the client's (a_proc)
1668 * address space.
1669  * When creating a shared memory segment, a_proc will be NULL, so we
1670  * fall back to curproc in that case.
1672 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1673 struct segvn_crargs crargs;
1675 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1676 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1677 (size_t)raddr;
1680 * check for wrap around
1682 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1683 AS_LOCK_EXIT(as);
1684 return (ENOMEM);
1687 as->a_updatedir = 1; /* inform /proc */
1688 gethrestime(&as->a_updatetime);
1690 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1691 AS_LOCK_EXIT(as);
1693 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1694 RCA_UNSAFE_ALL);
1696 return (ENOMEM);
1699 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1700 crargs = *(struct segvn_crargs *)argsp;
1701 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1702 if (error != 0) {
1703 AS_LOCK_EXIT(as);
1704 if (unmap) {
1705 (void) as_unmap(as, addr, size);
1707 return (error);
1709 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1710 crargs = *(struct segvn_crargs *)argsp;
1711 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1712 if (error != 0) {
1713 AS_LOCK_EXIT(as);
1714 if (unmap) {
1715 (void) as_unmap(as, addr, size);
1717 return (error);
1719 } else {
1720 seg = seg_alloc(as, addr, size);
1721 if (seg == NULL) {
1722 AS_LOCK_EXIT(as);
1723 return (ENOMEM);
1726 error = (*crfp)(seg, argsp);
1727 if (error != 0) {
1728 seg_free(seg);
1729 AS_LOCK_EXIT(as);
1730 return (error);
1733 * Add size now so as_unmap will work if as_ctl fails.
1735 as->a_size += rsize;
1736 as->a_resvsize += rsize;
1739 as_setwatch(as);
1742 * If the address space is locked,
1743 * establish memory locks for the new segment.
1745 mutex_enter(&as->a_contents);
1746 if (AS_ISPGLCK(as)) {
1747 mutex_exit(&as->a_contents);
1748 AS_LOCK_EXIT(as);
1749 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1750 if (error != 0)
1751 (void) as_unmap(as, addr, size);
1752 } else {
1753 mutex_exit(&as->a_contents);
1754 AS_LOCK_EXIT(as);
1756 return (error);
1761 * Delete all segments in the address space marked with S_PURGE.
1762 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1763 * These segments are deleted as a first step before calls to as_gap(), so
1764 * that they don't affect mmap() or shmat().
1766 void
1767 as_purge(struct as *as)
1769 struct seg *seg;
1770 struct seg *next_seg;
1773  * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1774  * there is no need to grab the a_contents mutex for this check
1776 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1777 return;
1779 AS_LOCK_ENTER(as, RW_WRITER);
1780 next_seg = NULL;
1781 seg = AS_SEGFIRST(as);
1782 while (seg != NULL) {
1783 next_seg = AS_SEGNEXT(as, seg);
1784 if (seg->s_flags & S_PURGE)
1785 (void) segop_unmap(seg, seg->s_base, seg->s_size);
1786 seg = next_seg;
1788 AS_LOCK_EXIT(as);
1790 mutex_enter(&as->a_contents);
1791 as->a_flags &= ~AS_NEEDSPURGE;
1792 mutex_exit(&as->a_contents);
1796 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1797 * range of addresses at least "minlen" long, where the base of the range is
1798 * at "off" phase from an "align" boundary and there is space for a
1799 * "redzone"-sized redzone on eithe rside of the range. Thus,
1800 * if align was 4M and off was 16k, the user wants a hole which will start
1801 * 16k into a 4M page.
1803 * If flags specifies AH_HI, the hole will have the highest possible address
1804 * in the range. We use the as->a_lastgap field to figure out where to
1805 * start looking for a gap.
1807 * Otherwise, the gap will have the lowest possible address.
1809 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1811 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1812 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1814 * NOTE: This routine is not correct when base+len overflows caddr_t.
1817 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1818 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1820 caddr_t lobound = *basep;
1821 caddr_t hibound = lobound + *lenp;
1822 struct seg *lseg, *hseg;
1823 caddr_t lo, hi;
1824 int forward;
1825 caddr_t save_base;
1826 size_t save_len;
1827 size_t save_minlen;
1828 size_t save_redzone;
1829 int fast_path = 1;
1831 save_base = *basep;
1832 save_len = *lenp;
1833 save_minlen = minlen;
1834 save_redzone = redzone;
1837 * For the first pass/fast_path, just add align and redzone into
1838 * minlen since if we get an allocation, we can guarantee that it
1839 * will fit the alignment and redzone requested.
1840 * This increases the chance that hibound will be adjusted to
1841 * a_lastgap->s_base which will likely allow us to find an
1842 * acceptable hole in the address space quicker.
1843 * If we can't find a hole with this fast_path, then we look for
1844 * smaller holes in which the alignment and offset may allow
1845 * the allocation to fit.
1847 minlen += align;
1848 minlen += 2 * redzone;
1849 redzone = 0;
1851 AS_LOCK_ENTER(as, RW_READER);
1852 if (AS_SEGFIRST(as) == NULL) {
1853 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1854 align, redzone, off)) {
1855 AS_LOCK_EXIT(as);
1856 return (0);
1857 } else {
1858 AS_LOCK_EXIT(as);
1859 *basep = save_base;
1860 *lenp = save_len;
1861 return (-1);
1865 retry:
1867 * Set up to iterate over all the inter-segment holes in the given
1868 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1869  * NULL for the highest-addressed hole. If moving backwards, we reset
1870  * hseg to denote the highest-addressed segment.
1872 forward = (flags & AH_DIR) == AH_LO;
1873 if (forward) {
1874 hseg = as_findseg(as, lobound, 1);
1875 lseg = AS_SEGPREV(as, hseg);
1876 } else {
1879 * If allocating at least as much as the last allocation,
1880 * use a_lastgap's base as a better estimate of hibound.
1882 if (as->a_lastgap &&
1883 minlen >= as->a_lastgap->s_size &&
1884 hibound >= as->a_lastgap->s_base)
1885 hibound = as->a_lastgap->s_base;
1887 hseg = as_findseg(as, hibound, 1);
1888 if (hseg->s_base + hseg->s_size < hibound) {
1889 lseg = hseg;
1890 hseg = NULL;
1891 } else {
1892 lseg = AS_SEGPREV(as, hseg);
1896 for (;;) {
1898 * Set lo and hi to the hole's boundaries. (We should really
1899 * use MAXADDR in place of hibound in the expression below,
1900 * but can't express it easily; using hibound in its place is
1901 * harmless.)
1903 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1904 hi = (hseg == NULL) ? hibound : hseg->s_base;
1906 * If the iteration has moved past the interval from lobound
1907 * to hibound it's pointless to continue.
1909 if ((forward && lo > hibound) || (!forward && hi < lobound))
1910 break;
1911 else if (lo > hibound || hi < lobound)
1912 goto cont;
1914 * Candidate hole lies at least partially within the allowable
1915 * range. Restrict it to fall completely within that range,
1916 * i.e., to [max(lo, lobound), min(hi, hibound)].
1918 if (lo < lobound)
1919 lo = lobound;
1920 if (hi > hibound)
1921 hi = hibound;
1923 * Verify that the candidate hole is big enough and meets
1924 * hardware constraints. If the hole is too small, no need
1925 * to do the further checks since they will fail.
1927 *basep = lo;
1928 *lenp = hi - lo;
1929 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1930 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1931 ((flags & AH_CONTAIN) == 0 ||
1932 (*basep <= addr && *basep + *lenp > addr))) {
1933 if (!forward)
1934 as->a_lastgap = hseg;
1935 if (hseg != NULL)
1936 as->a_lastgaphl = hseg;
1937 else
1938 as->a_lastgaphl = lseg;
1939 AS_LOCK_EXIT(as);
1940 return (0);
1942 cont:
1944 * Move to the next hole.
1946 if (forward) {
1947 lseg = hseg;
1948 if (lseg == NULL)
1949 break;
1950 hseg = AS_SEGNEXT(as, hseg);
1951 } else {
1952 hseg = lseg;
1953 if (hseg == NULL)
1954 break;
1955 lseg = AS_SEGPREV(as, lseg);
1958 if (fast_path && (align != 0 || save_redzone != 0)) {
1959 fast_path = 0;
1960 minlen = save_minlen;
1961 redzone = save_redzone;
1962 goto retry;
1964 *basep = save_base;
1965 *lenp = save_len;
1966 AS_LOCK_EXIT(as);
1967 return (-1);
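
/*
 * Editor's example (hypothetical sizes, not code from this file): request a
 * 1M hole anywhere in the user range whose base sits 16K past a 4M boundary,
 * with one page of redzone on each side, as described in the block comment
 * above as_gap_aligned().
 */
	caddr_t base = (caddr_t)PAGESIZE;
	size_t len = (size_t)(as->a_userlimit - base);

	if (as_gap_aligned(as, 1024 * 1024, &base, &len, AH_LO, NULL,
	    4 * 1024 * 1024, PAGESIZE, 16 * 1024) == 0) {
		/* [base, base + len) now describes a suitable aligned hole */
	}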
1971 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1973 * If flags specifies AH_HI, the hole will have the highest possible address
1974 * in the range. We use the as->a_lastgap field to figure out where to
1975 * start looking for a gap.
1977 * Otherwise, the gap will have the lowest possible address.
1979 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1981 * If an adequate hole is found, base and len are set to reflect the part of
1982 * the hole that is within range, and 0 is returned, otherwise,
1983 * -1 is returned.
1985 * NOTE: This routine is not correct when base+len overflows caddr_t.
1988 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1989 caddr_t addr)
1992 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1996 * Return the next range within [base, base + len) that is backed
1997 * with "real memory". Skip holes and non-seg_vn segments.
1998 * We're lazy and only return one segment at a time.
2001 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2003 extern const struct seg_ops segspt_shmops; /* needs a header file */
2004 struct seg *seg;
2005 caddr_t addr, eaddr;
2006 caddr_t segend;
2008 AS_LOCK_ENTER(as, RW_READER);
2010 addr = *basep;
2011 eaddr = addr + *lenp;
2013 seg = as_findseg(as, addr, 0);
2014 if (seg != NULL)
2015 addr = MAX(seg->s_base, addr);
2017 for (;;) {
2018 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2019 AS_LOCK_EXIT(as);
2020 return (EINVAL);
2023 if (seg->s_ops == &segvn_ops) {
2024 segend = seg->s_base + seg->s_size;
2025 break;
2029 * We do ISM by looking into the private data
2030 * to determine the real size of the segment.
2032 if (seg->s_ops == &segspt_shmops) {
2033 segend = seg->s_base + spt_realsize(seg);
2034 if (addr < segend)
2035 break;
2038 seg = AS_SEGNEXT(as, seg);
2040 if (seg != NULL)
2041 addr = seg->s_base;
2044 *basep = addr;
2046 if (segend > eaddr)
2047 *lenp = eaddr - addr;
2048 else
2049 *lenp = segend - addr;
2051 AS_LOCK_EXIT(as);
2052 return (0);
2056 * Determine whether data from the mappings in interval [addr, addr + size)
2057 * are in the primary memory (core) cache.
2060 as_incore(struct as *as, caddr_t addr,
2061 size_t size, char *vec, size_t *sizep)
2063 struct seg *seg;
2064 size_t ssize;
2065 caddr_t raddr; /* rounded down addr */
2066 size_t rsize; /* rounded up size */
2067 size_t isize; /* iteration size */
2068 int error = 0; /* result, assume success */
2070 *sizep = 0;
2071 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2072 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2073 (size_t)raddr;
2075 if (raddr + rsize < raddr) /* check for wraparound */
2076 return (ENOMEM);
2078 AS_LOCK_ENTER(as, RW_READER);
2079 seg = as_segat(as, raddr);
2080 if (seg == NULL) {
2081 AS_LOCK_EXIT(as);
2082 return (-1);
2085 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2086 if (raddr >= seg->s_base + seg->s_size) {
2087 seg = AS_SEGNEXT(as, seg);
2088 if (seg == NULL || raddr != seg->s_base) {
2089 error = -1;
2090 break;
2093 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2094 ssize = seg->s_base + seg->s_size - raddr;
2095 else
2096 ssize = rsize;
2097 *sizep += isize = segop_incore(seg, raddr, ssize, vec);
2098 if (isize != ssize) {
2099 error = -1;
2100 break;
2102 vec += btopr(ssize);
2104 AS_LOCK_EXIT(as);
2105 return (error);
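/*
 * Illustrative sketch, not part of this file: as_incore() is essentially
 * the kernel side of mincore(2).  A hypothetical caller supplies one byte
 * of "vec" per page and inspects the residency bit of each entry:
 *
 *	size_t pages = btopr(size);
 *	char *vec = kmem_zalloc(pages, KM_SLEEP);
 *	size_t covered;
 *
 *	if (as_incore(as, addr, size, vec, &covered) == 0) {
 *		// vec[i] has its low bit set if page i is resident
 *		// (mincore(2) convention); "covered" is the bytes examined
 *	}
 *	kmem_free(vec, pages);
 */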
2108 static void
2109 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2110 ulong_t *bitmap, size_t position, size_t npages)
2112 caddr_t range_start;
2113 size_t pos1 = position;
2114 size_t pos2;
2115 size_t size;
2116 size_t end_pos = npages + position;
2118 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2119 size = ptob((pos2 - pos1));
2120 range_start = (caddr_t)((uintptr_t)addr +
2121 ptob(pos1 - position));
2123 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
2124 NULL, 0);
2125 pos1 = pos2;
2129 static void
2130 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2131 caddr_t raddr, size_t rsize)
2133 struct seg *seg = as_segat(as, raddr);
2134 size_t ssize;
2136 while (rsize != 0) {
2137 if (raddr >= seg->s_base + seg->s_size)
2138 seg = AS_SEGNEXT(as, seg);
2140 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2141 ssize = seg->s_base + seg->s_size - raddr;
2142 else
2143 ssize = rsize;
2145 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2147 rsize -= ssize;
2148 raddr += ssize;
2153 * Cache control operations over the interval [addr, addr + size) in
2154 * address space "as".
2156 /*ARGSUSED*/
2158 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2159 uintptr_t arg, ulong_t *lock_map, size_t pos)
2161 struct seg *seg; /* working segment */
2162 caddr_t raddr; /* rounded down addr */
2163 caddr_t initraddr; /* saved initial rounded down addr */
2164 size_t rsize; /* rounded up size */
2165 size_t initrsize; /* saved initial rounded up size */
2166 size_t ssize; /* size of seg */
2167 int error = 0; /* result */
2168 size_t mlock_size; /* size of bitmap */
2169 ulong_t *mlock_map; /* pointer to bitmap used */
2170 /* to represent the locked */
2171 /* pages. */
2172 retry:
2173 if (error == IE_RETRY)
2174 AS_LOCK_ENTER(as, RW_WRITER);
2175 else
2176 AS_LOCK_ENTER(as, RW_READER);
2179 * If these are address space lock/unlock operations, loop over
2180 * all segments in the address space, as appropriate.
2182 if (func == MC_LOCKAS) {
2183 size_t npages, idx;
2184 size_t rlen = 0; /* rounded as length */
2186 idx = pos;
2188 if (arg & MCL_FUTURE) {
2189 mutex_enter(&as->a_contents);
2190 AS_SETPGLCK(as);
2191 mutex_exit(&as->a_contents);
2193 if ((arg & MCL_CURRENT) == 0) {
2194 AS_LOCK_EXIT(as);
2195 return (0);
2198 seg = AS_SEGFIRST(as);
2199 if (seg == NULL) {
2200 AS_LOCK_EXIT(as);
2201 return (0);
2204 do {
2205 raddr = (caddr_t)((uintptr_t)seg->s_base &
2206 (uintptr_t)PAGEMASK);
2207 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2208 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2209 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2211 mlock_size = BT_BITOUL(btopr(rlen));
2212 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2213 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2214 AS_LOCK_EXIT(as);
2215 return (EAGAIN);
2218 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2219 error = segop_lockop(seg, seg->s_base,
2220 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2221 if (error != 0)
2222 break;
2223 pos += seg_pages(seg);
2226 if (error) {
2227 for (seg = AS_SEGFIRST(as); seg != NULL;
2228 seg = AS_SEGNEXT(as, seg)) {
2230 raddr = (caddr_t)((uintptr_t)seg->s_base &
2231 (uintptr_t)PAGEMASK);
2232 npages = seg_pages(seg);
2233 as_segunlock(seg, raddr, attr, mlock_map,
2234 idx, npages);
2235 idx += npages;
2239 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2240 AS_LOCK_EXIT(as);
2241 goto lockerr;
2242 } else if (func == MC_UNLOCKAS) {
2243 mutex_enter(&as->a_contents);
2244 AS_CLRPGLCK(as);
2245 mutex_exit(&as->a_contents);
2247 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2248 error = segop_lockop(seg, seg->s_base,
2249 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2250 if (error != 0)
2251 break;
2254 AS_LOCK_EXIT(as);
2255 goto lockerr;
2259 * Normalize addresses and sizes.
2261 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2262 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2263 (size_t)raddr;
2265 if (raddr + rsize < raddr) { /* check for wraparound */
2266 AS_LOCK_EXIT(as);
2267 return (ENOMEM);
2271 * Get initial segment.
2273 if ((seg = as_segat(as, raddr)) == NULL) {
2274 AS_LOCK_EXIT(as);
2275 return (ENOMEM);
2278 if (func == MC_LOCK) {
2279 mlock_size = BT_BITOUL(btopr(rsize));
2280 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2281 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2282 AS_LOCK_EXIT(as);
2283 return (EAGAIN);
2288 * Loop over all segments. If a hole in the address range is
2289 * discovered, then fail. For each segment, perform the appropriate
2290 * control operation.
2292 while (rsize != 0) {
2295 * Make sure there's no hole, calculate the portion
2296 * of the next segment to be operated over.
2298 if (raddr >= seg->s_base + seg->s_size) {
2299 seg = AS_SEGNEXT(as, seg);
2300 if (seg == NULL || raddr != seg->s_base) {
2301 if (func == MC_LOCK) {
2302 as_unlockerr(as, attr, mlock_map,
2303 initraddr, initrsize - rsize);
2304 kmem_free(mlock_map,
2305 mlock_size * sizeof (ulong_t));
2307 AS_LOCK_EXIT(as);
2308 return (ENOMEM);
2311 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2312 ssize = seg->s_base + seg->s_size - raddr;
2313 else
2314 ssize = rsize;
2317 * Dispatch on specific function.
2319 switch (func) {
2322 * Synchronize cached data from mappings with backing
2323 * objects.
2325 case MC_SYNC:
2326 if (error = segop_sync(seg, raddr, ssize,
2327 attr, (uint_t)arg)) {
2328 AS_LOCK_EXIT(as);
2329 return (error);
2331 break;
2334 * Lock pages in memory.
2336 case MC_LOCK:
2337 if (error = segop_lockop(seg, raddr, ssize,
2338 attr, func, mlock_map, pos)) {
2339 as_unlockerr(as, attr, mlock_map, initraddr,
2340 initrsize - rsize + ssize);
2341 kmem_free(mlock_map, mlock_size *
2342 sizeof (ulong_t));
2343 AS_LOCK_EXIT(as);
2344 goto lockerr;
2346 break;
2349 * Unlock mapped pages.
2351 case MC_UNLOCK:
2352 (void) segop_lockop(seg, raddr, ssize, attr, func,
2353 NULL, 0);
2354 break;
2357 * Store VM advise for mapped pages in segment layer.
2359 case MC_ADVISE:
2360 error = segop_advise(seg, raddr, ssize, (uint_t)arg);
2363 * Check for regular errors and special retry error
2365 if (error) {
2366 if (error == IE_RETRY) {
2368 * Need to acquire writers lock, so
2369 * have to drop readers lock and start
2370 * all over again
2372 AS_LOCK_EXIT(as);
2373 goto retry;
2374 } else if (error == IE_REATTACH) {
2376 * Find segment for current address
2377 * because current segment just got
2378 * split or concatenated
2380 seg = as_segat(as, raddr);
2381 if (seg == NULL) {
2382 AS_LOCK_EXIT(as);
2383 return (ENOMEM);
2385 } else {
2387 * Regular error
2389 AS_LOCK_EXIT(as);
2390 return (error);
2393 break;
2395 case MC_INHERIT_ZERO:
2396 error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
2397 if (error != 0) {
2398 AS_LOCK_EXIT(as);
2399 return (error);
2401 break;
2404 * Can't happen.
2406 default:
2407 panic("as_ctl: bad operation %d", func);
2408 /*NOTREACHED*/
2411 rsize -= ssize;
2412 raddr += ssize;
2415 if (func == MC_LOCK)
2416 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2417 AS_LOCK_EXIT(as);
2418 return (0);
2419 lockerr:
2422 * If the lower levels returned EDEADLK for a segment lockop,
2423 * it means that we should retry the operation. Let's wait
2424 * a bit also to let the deadlock causing condition clear.
2425 * This is part of a gross hack to work around a design flaw
2426 * in the ufs/sds logging code and should go away when the
2427 * logging code is re-designed to fix the problem. See bug
2428 * 4125102 for details of the problem.
2430 if (error == EDEADLK) {
2431 delay(deadlk_wait);
2432 error = 0;
2433 goto retry;
2435 return (error);
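/*
 * Illustrative sketch, not part of this file: memcntl(2) is the usual
 * path into as_ctl().  Locking a page-aligned range might look roughly
 * like the call below, while MC_LOCKAS/MC_UNLOCKAS operate on the whole
 * address space and take the MCL_CURRENT/MCL_FUTURE bits in "arg":
 *
 *	error = as_ctl(as, addr, len, MC_LOCK, 0, 0, NULL, 0);
 */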
2439 fc_decode(faultcode_t fault_err)
2441 int error = 0;
2443 switch (FC_CODE(fault_err)) {
2444 case FC_OBJERR:
2445 error = FC_ERRNO(fault_err);
2446 break;
2447 case FC_PROT:
2448 error = EACCES;
2449 break;
2450 default:
2451 error = EFAULT;
2452 break;
2454 return (error);
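/*
 * Illustrative sketch, not part of this file: fc_decode() is how a
 * faultcode_t from as_fault() becomes an errno for the caller, exactly
 * as the pagelock slow paths below do:
 *
 *	faultcode_t fc;
 *
 *	fc = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
 *	if (fc != 0)
 *		return (fc_decode(fc));	// object errno, EACCES or EFAULT
 */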
2458 * Pagelock pages from a range that spans more than one segment. Obtain shadow
2459 * lists from each segment and copy them to one contiguous shadow list (plist)
2460 * as expected by the caller. Save pointers to the per-segment shadow lists at
2461 * the tail of plist so that they can be used during as_pageunlock().
2463 static int
2464 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2465 caddr_t addr, size_t size, enum seg_rw rw)
2467 caddr_t sv_addr = addr;
2468 size_t sv_size = size;
2469 struct seg *sv_seg = seg;
2470 ulong_t segcnt = 1;
2471 ulong_t cnt;
2472 size_t ssize;
2473 pgcnt_t npages = btop(size);
2474 page_t **plist;
2475 page_t **pl;
2476 int error;
2477 caddr_t eaddr;
2478 faultcode_t fault_err = 0;
2479 pgcnt_t pl_off;
2480 extern const struct seg_ops segspt_shmops;
2482 ASSERT(AS_LOCK_HELD(as));
2483 ASSERT(seg != NULL);
2484 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2485 ASSERT(addr + size > seg->s_base + seg->s_size);
2486 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2487 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2490 * Count the number of segments covered by the range we are about to
2491 * lock. The segment count is used to size the shadow list we return
2492 * back to the caller.
2494 for (; size != 0; size -= ssize, addr += ssize) {
2495 if (addr >= seg->s_base + seg->s_size) {
2497 seg = AS_SEGNEXT(as, seg);
2498 if (seg == NULL || addr != seg->s_base) {
2499 AS_LOCK_EXIT(as);
2500 return (EFAULT);
2503 * Do a quick check whether subsequent segments
2504 * are likely to support pagelock.
2506 if (seg->s_ops == &segvn_ops) {
2507 vnode_t *vp;
2509 if (segop_getvp(seg, addr, &vp) != 0 ||
2510 vp != NULL) {
2511 AS_LOCK_EXIT(as);
2512 goto slow;
2514 } else if (seg->s_ops != &segspt_shmops) {
2515 AS_LOCK_EXIT(as);
2516 goto slow;
2518 segcnt++;
2520 if (addr + size > seg->s_base + seg->s_size) {
2521 ssize = seg->s_base + seg->s_size - addr;
2522 } else {
2523 ssize = size;
2526 ASSERT(segcnt > 1);
2528 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2530 addr = sv_addr;
2531 size = sv_size;
2532 seg = sv_seg;
2534 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2535 if (addr >= seg->s_base + seg->s_size) {
2536 seg = AS_SEGNEXT(as, seg);
2537 ASSERT(seg != NULL && addr == seg->s_base);
2538 cnt++;
2539 ASSERT(cnt < segcnt);
2541 if (addr + size > seg->s_base + seg->s_size) {
2542 ssize = seg->s_base + seg->s_size - addr;
2543 } else {
2544 ssize = size;
2546 pl = &plist[npages + cnt];
2547 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2548 L_PAGELOCK, rw);
2549 if (error) {
2550 break;
2552 ASSERT(plist[npages + cnt] != NULL);
2553 ASSERT(pl_off + btop(ssize) <= npages);
2554 bcopy(plist[npages + cnt], &plist[pl_off],
2555 btop(ssize) * sizeof (page_t *));
2556 pl_off += btop(ssize);
2559 if (size == 0) {
2560 AS_LOCK_EXIT(as);
2561 ASSERT(cnt == segcnt - 1);
2562 *ppp = plist;
2563 return (0);
2567 * One of the pagelock calls failed. The error type is in the error variable.
2568 * Unlock what we've locked so far and retry with F_SOFTLOCK if the error
2569 * type is either EFAULT or ENOTSUP. Otherwise just return the error
2570 * back to the caller.
2573 eaddr = addr;
2574 seg = sv_seg;
2576 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2577 if (addr >= seg->s_base + seg->s_size) {
2578 seg = AS_SEGNEXT(as, seg);
2579 ASSERT(seg != NULL && addr == seg->s_base);
2580 cnt++;
2581 ASSERT(cnt < segcnt);
2583 if (eaddr > seg->s_base + seg->s_size) {
2584 ssize = seg->s_base + seg->s_size - addr;
2585 } else {
2586 ssize = eaddr - addr;
2588 pl = &plist[npages + cnt];
2589 ASSERT(*pl != NULL);
2590 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2591 L_PAGEUNLOCK, rw);
2594 AS_LOCK_EXIT(as);
2596 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2598 if (error != ENOTSUP && error != EFAULT) {
2599 return (error);
2602 slow:
2604 * If we are here because pagelock failed due to the need to cow-fault
2605 * in the pages we want to lock, F_SOFTLOCK will do this job, and in the
2606 * next as_pagelock() call for this address range pagelock will
2607 * hopefully succeed.
2609 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2610 if (fault_err != 0) {
2611 return (fc_decode(fault_err));
2613 *ppp = NULL;
2615 return (0);
2619 * Lock pages in a given address space. Return the shadow list. If
2620 * the list is NULL, the MMU mapping is also locked.
2623 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2624 size_t size, enum seg_rw rw)
2626 size_t rsize;
2627 caddr_t raddr;
2628 faultcode_t fault_err;
2629 struct seg *seg;
2630 int err;
2632 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2633 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2634 (size_t)raddr;
2637 * If the request crosses more than one segment,
2638 * as_pagelock_segs() below handles it.
2640 AS_LOCK_ENTER(as, RW_READER);
2642 seg = as_segat(as, raddr);
2643 if (seg == NULL) {
2644 AS_LOCK_EXIT(as);
2645 return (EFAULT);
2647 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2648 if (raddr + rsize > seg->s_base + seg->s_size) {
2649 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2651 if (raddr + rsize <= raddr) {
2652 AS_LOCK_EXIT(as);
2653 return (EFAULT);
2657 * try to lock pages and pass back shadow list
2659 err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2661 AS_LOCK_EXIT(as);
2663 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2664 return (err);
2668 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2669 * to no pagelock support for this segment or because pages need to be
2670 * cow-faulted in. If a fault is needed, F_SOFTLOCK will do this job for
2671 * this as_pagelock() call, and in the next as_pagelock() call for the
2672 * same address range the pagelock call will hopefully succeed.
2674 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2675 if (fault_err != 0) {
2676 return (fc_decode(fault_err));
2678 *ppp = NULL;
2680 return (0);
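/*
 * Illustrative sketch, not part of this file: as_pagelock() and
 * as_pageunlock() are used as a pair.  A NULL shadow list means the slow
 * F_SOFTLOCK path was taken; as_pageunlock() detects that and undoes it
 * with F_SOFTUNLOCK, so the caller's code is the same either way:
 *
 *	struct page **pplist;
 *	int err;
 *
 *	if ((err = as_pagelock(as, &pplist, addr, len, S_WRITE)) != 0)
 *		return (err);
 *	// ... the pages are locked; do the I/O or copy ...
 *	as_pageunlock(as, pplist, addr, len, S_WRITE);
 */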
2684 * Unlock pages locked by as_pagelock_segs(). Retrieve the per-segment shadow
2685 * lists from the end of plist and call the pageunlock interface for each
2686 * segment. Drop the as lock and free plist.
2688 static void
2689 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2690 struct page **plist, enum seg_rw rw)
2692 ulong_t cnt;
2693 caddr_t eaddr = addr + size;
2694 pgcnt_t npages = btop(size);
2695 size_t ssize;
2696 page_t **pl;
2698 ASSERT(AS_LOCK_HELD(as));
2699 ASSERT(seg != NULL);
2700 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2701 ASSERT(addr + size > seg->s_base + seg->s_size);
2702 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2703 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2704 ASSERT(plist != NULL);
2706 for (cnt = 0; addr < eaddr; addr += ssize) {
2707 if (addr >= seg->s_base + seg->s_size) {
2708 seg = AS_SEGNEXT(as, seg);
2709 ASSERT(seg != NULL && addr == seg->s_base);
2710 cnt++;
2712 if (eaddr > seg->s_base + seg->s_size) {
2713 ssize = seg->s_base + seg->s_size - addr;
2714 } else {
2715 ssize = eaddr - addr;
2717 pl = &plist[npages + cnt];
2718 ASSERT(*pl != NULL);
2719 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2720 L_PAGEUNLOCK, rw);
2722 ASSERT(cnt > 0);
2723 AS_LOCK_EXIT(as);
2725 cnt++;
2726 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2730 * Unlock pages in a given address range.
2732 void
2733 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2734 enum seg_rw rw)
2736 struct seg *seg;
2737 size_t rsize;
2738 caddr_t raddr;
2741 * If the shadow list is NULL, as_pagelock() fell back
2742 * to as_fault(); undo the softlock with F_SOFTUNLOCK.
2744 if (pp == NULL) {
2745 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2746 return;
2749 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2750 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2751 (size_t)raddr;
2753 AS_LOCK_ENTER(as, RW_READER);
2754 seg = as_segat(as, raddr);
2755 ASSERT(seg != NULL);
2757 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2758 if (raddr + rsize <= seg->s_base + seg->s_size) {
2759 (void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2760 } else {
2761 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2762 return;
2764 AS_LOCK_EXIT(as);
2768 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2769 boolean_t wait)
2771 struct seg *seg;
2772 size_t ssize;
2773 caddr_t raddr; /* rounded down addr */
2774 size_t rsize; /* rounded up size */
2775 int error = 0;
2776 size_t pgsz = page_get_pagesize(szc);
2778 setpgsz_top:
2779 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2780 return (EINVAL);
2783 raddr = addr;
2784 rsize = size;
2786 if (raddr + rsize < raddr) /* check for wraparound */
2787 return (ENOMEM);
2789 AS_LOCK_ENTER(as, RW_WRITER);
2790 as_clearwatchprot(as, raddr, rsize);
2791 seg = as_segat(as, raddr);
2792 if (seg == NULL) {
2793 as_setwatch(as);
2794 AS_LOCK_EXIT(as);
2795 return (ENOMEM);
2798 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2799 if (raddr >= seg->s_base + seg->s_size) {
2800 seg = AS_SEGNEXT(as, seg);
2801 if (seg == NULL || raddr != seg->s_base) {
2802 error = ENOMEM;
2803 break;
2806 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2807 ssize = seg->s_base + seg->s_size - raddr;
2808 } else {
2809 ssize = rsize;
2812 retry:
2813 error = segop_setpagesize(seg, raddr, ssize, szc);
2815 if (error == IE_NOMEM) {
2816 error = EAGAIN;
2817 break;
2820 if (error == IE_RETRY) {
2821 AS_LOCK_EXIT(as);
2822 goto setpgsz_top;
2825 if (error == ENOTSUP) {
2826 error = EINVAL;
2827 break;
2830 if (wait && (error == EAGAIN)) {
2832 * Memory is currently locked. It must be unlocked
2833 * before this operation can succeed through a retry.
2834 * The possible reasons for locked memory and
2835 * corresponding strategies for unlocking are:
2836 * (1) Normal I/O
2837 * wait for a signal that the I/O operation
2838 * has completed and the memory is unlocked.
2839 * (2) Asynchronous I/O
2840 * The aio subsystem does not unlock pages when
2841 * the I/O is completed. Those pages are unlocked
2842 * when the application calls aiowait/aioerror.
2843 * So, to prevent blocking forever, cv_broadcast()
2844 * is done to wake up aio_cleanup_thread.
2845 * Subsequently, segvn_reclaim will be called, and
2846 * that will do AS_CLRUNMAPWAIT() and wake us up.
2847 * (3) Long term page locking:
2848 * This is not relevant for as_setpagesize()
2849 * because we cannot change the page size for
2850 * driver memory. The attempt to do so will
2851 * fail with a different error than EAGAIN so
2852 * there's no need to trigger as callbacks like
2853 * as_unmap, as_setprot or as_free would do.
2855 mutex_enter(&as->a_contents);
2856 if (!AS_ISNOUNMAPWAIT(as)) {
2857 if (AS_ISUNMAPWAIT(as) == 0) {
2858 cv_broadcast(&as->a_cv);
2860 AS_SETUNMAPWAIT(as);
2861 AS_LOCK_EXIT(as);
2862 while (AS_ISUNMAPWAIT(as)) {
2863 cv_wait(&as->a_cv, &as->a_contents);
2865 } else {
2867 * We may have raced with
2868 * segvn_reclaim()/segspt_reclaim(). In this
2869 * case clean nounmapwait flag and retry since
2870 * softlockcnt in this segment may be already
2871 * 0. We don't drop as writer lock so our
2872 * number of retries without sleeping should
2873 * be very small. See segvn_reclaim() for
2874 * more comments.
2876 AS_CLRNOUNMAPWAIT(as);
2877 mutex_exit(&as->a_contents);
2878 goto retry;
2880 mutex_exit(&as->a_contents);
2881 goto setpgsz_top;
2882 } else if (error != 0) {
2883 break;
2886 as_setwatch(as);
2887 AS_LOCK_EXIT(as);
2888 return (error);
2892 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
2893 * in its chunk where s_szc is less than the szc we want to set.
2895 static int
2896 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2897 int *retry)
2899 struct seg *seg;
2900 size_t ssize;
2901 int error;
2903 ASSERT(AS_WRITE_HELD(as));
2905 seg = as_segat(as, raddr);
2906 if (seg == NULL) {
2907 panic("as_iset3_default_lpsize: no seg");
2910 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2911 if (raddr >= seg->s_base + seg->s_size) {
2912 seg = AS_SEGNEXT(as, seg);
2913 if (seg == NULL || raddr != seg->s_base) {
2914 panic("as_iset3_default_lpsize: as changed");
2917 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2918 ssize = seg->s_base + seg->s_size - raddr;
2919 } else {
2920 ssize = rsize;
2923 if (szc > seg->s_szc) {
2924 error = segop_setpagesize(seg, raddr, ssize, szc);
2925 /* Only retry on EINVAL segments that have no vnode. */
2926 if (error == EINVAL) {
2927 vnode_t *vp = NULL;
2928 if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
2929 (segop_getvp(seg, raddr, &vp) != 0 ||
2930 vp == NULL)) {
2931 *retry = 1;
2932 } else {
2933 *retry = 0;
2936 if (error) {
2937 return (error);
2941 return (0);
2945 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
2946 * pagesize on each segment in its range, but if any fails with EINVAL,
2947 * then it reduces the pagesizes to the next size in the bitmap and
2948 * retries as_iset3_default_lpsize(). The code retries smaller allowed
2949 * sizes on EINVAL because (a) the anon offset may not match the bigger
2950 * sizes, and (b) it's hard to obtain this offset (to begin with) to pass
2951 * to map_pgszcvec().
2953 static int
2954 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2955 uint_t szcvec)
2957 int error;
2958 int retry;
2960 ASSERT(AS_WRITE_HELD(as));
2962 for (;;) {
2963 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
2964 if (error == EINVAL && retry) {
2965 szcvec &= ~(1 << szc);
2966 if (szcvec <= 1) {
2967 return (EINVAL);
2969 szc = highbit(szcvec) - 1;
2970 } else {
2971 return (error);
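/*
 * Worked example (illustrative only): the retry loop above walks the size
 * code bitmap from the largest remaining candidate downward.  With
 * szcvec = 0x16 (size codes 1, 2 and 4 allowed) and a failing szc = 4:
 *
 *	szcvec &= ~(1 << 4);		// 0x16 -> 0x06
 *	szc = highbit(0x06) - 1;	// next largest remaining code: 2
 *
 * and the loop gives up with EINVAL once szcvec is reduced to 0 or 1
 * (base pages only).
 */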
2977 * as_iset1_default_lpsize() breaks its chunk into areas where existing
2978 * segments have a smaller szc than we want to set. For each such area,
2979 * it calls as_iset2_default_lpsize().
2981 static int
2982 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2983 uint_t szcvec)
2985 struct seg *seg;
2986 size_t ssize;
2987 caddr_t setaddr = raddr;
2988 size_t setsize = 0;
2989 int set;
2990 int error;
2992 ASSERT(AS_WRITE_HELD(as));
2994 seg = as_segat(as, raddr);
2995 if (seg == NULL) {
2996 panic("as_iset1_default_lpsize: no seg");
2998 if (seg->s_szc < szc) {
2999 set = 1;
3000 } else {
3001 set = 0;
3004 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3005 if (raddr >= seg->s_base + seg->s_size) {
3006 seg = AS_SEGNEXT(as, seg);
3007 if (seg == NULL || raddr != seg->s_base) {
3008 panic("as_iset1_default_lpsize: as changed");
3010 if (seg->s_szc >= szc && set) {
3011 ASSERT(setsize != 0);
3012 error = as_iset2_default_lpsize(as,
3013 setaddr, setsize, szc, szcvec);
3014 if (error) {
3015 return (error);
3017 set = 0;
3018 } else if (seg->s_szc < szc && !set) {
3019 setaddr = raddr;
3020 setsize = 0;
3021 set = 1;
3024 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3025 ssize = seg->s_base + seg->s_size - raddr;
3026 } else {
3027 ssize = rsize;
3030 error = 0;
3031 if (set) {
3032 ASSERT(setsize != 0);
3033 error = as_iset2_default_lpsize(as, setaddr, setsize,
3034 szc, szcvec);
3036 return (error);
3040 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3041 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3042 * chunk to as_iset1_default_lpsize().
3044 static int
3045 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3046 int type)
3048 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3049 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3050 flags, rtype, 1);
3051 uint_t szc;
3052 uint_t nszc;
3053 int error;
3054 caddr_t a;
3055 caddr_t eaddr;
3056 size_t segsize;
3057 size_t pgsz;
3058 uint_t save_szcvec;
3060 ASSERT(AS_WRITE_HELD(as));
3061 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3062 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3064 szcvec &= ~1;
3065 if (szcvec <= 1) { /* skip if base page size */
3066 return (0);
3069 /* Get the pagesize of the first larger page size. */
3070 szc = lowbit(szcvec) - 1;
3071 pgsz = page_get_pagesize(szc);
3072 eaddr = addr + size;
3073 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3074 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3076 save_szcvec = szcvec;
3077 szcvec >>= (szc + 1);
3078 nszc = szc;
3079 while (szcvec) {
3080 if ((szcvec & 0x1) == 0) {
3081 nszc++;
3082 szcvec >>= 1;
3083 continue;
3085 nszc++;
3086 pgsz = page_get_pagesize(nszc);
3087 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3088 if (a != addr) {
3089 ASSERT(szc > 0);
3090 ASSERT(a < eaddr);
3091 segsize = a - addr;
3092 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3093 save_szcvec);
3094 if (error) {
3095 return (error);
3097 addr = a;
3099 szc = nszc;
3100 szcvec >>= 1;
3103 ASSERT(addr < eaddr);
3104 szcvec = save_szcvec;
3105 while (szcvec) {
3106 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3107 ASSERT(a >= addr);
3108 if (a != addr) {
3109 ASSERT(szc > 0);
3110 segsize = a - addr;
3111 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3112 save_szcvec);
3113 if (error) {
3114 return (error);
3116 addr = a;
3118 szcvec &= ~(1 << szc);
3119 if (szcvec) {
3120 szc = highbit(szcvec) - 1;
3121 pgsz = page_get_pagesize(szc);
3124 ASSERT(addr == eaddr);
3126 return (0);
3130 * Set the default large page size for the range. Called via memcntl with
3131 * page size set to 0. as_set_default_lpsize breaks the range down into
3132 * chunks with the same type/flags, ignores non-segvn segments, and passes
3133 * each chunk to as_iset_default_lpsize().
3136 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3138 struct seg *seg;
3139 caddr_t raddr;
3140 size_t rsize;
3141 size_t ssize;
3142 int rtype, rflags;
3143 int stype, sflags;
3144 int error;
3145 caddr_t setaddr;
3146 size_t setsize;
3147 int segvn;
3149 if (size == 0)
3150 return (0);
3152 AS_LOCK_ENTER(as, RW_WRITER);
3153 again:
3154 error = 0;
3156 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3157 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3158 (size_t)raddr;
3160 if (raddr + rsize < raddr) { /* check for wraparound */
3161 AS_LOCK_EXIT(as);
3162 return (ENOMEM);
3164 as_clearwatchprot(as, raddr, rsize);
3165 seg = as_segat(as, raddr);
3166 if (seg == NULL) {
3167 as_setwatch(as);
3168 AS_LOCK_EXIT(as);
3169 return (ENOMEM);
3171 if (seg->s_ops == &segvn_ops) {
3172 rtype = segop_gettype(seg, addr);
3173 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3174 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3175 segvn = 1;
3176 } else {
3177 segvn = 0;
3179 setaddr = raddr;
3180 setsize = 0;
3182 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3183 if (raddr >= (seg->s_base + seg->s_size)) {
3184 seg = AS_SEGNEXT(as, seg);
3185 if (seg == NULL || raddr != seg->s_base) {
3186 error = ENOMEM;
3187 break;
3189 if (seg->s_ops == &segvn_ops) {
3190 stype = segop_gettype(seg, raddr);
3191 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3192 stype &= (MAP_SHARED | MAP_PRIVATE);
3193 if (segvn && (rflags != sflags ||
3194 rtype != stype)) {
3196 * The next segment is also segvn but
3197 * has different flags and/or type.
3199 ASSERT(setsize != 0);
3200 error = as_iset_default_lpsize(as,
3201 setaddr, setsize, rflags, rtype);
3202 if (error) {
3203 break;
3205 rflags = sflags;
3206 rtype = stype;
3207 setaddr = raddr;
3208 setsize = 0;
3209 } else if (!segvn) {
3210 rflags = sflags;
3211 rtype = stype;
3212 setaddr = raddr;
3213 setsize = 0;
3214 segvn = 1;
3216 } else if (segvn) {
3217 /* The next segment is not segvn. */
3218 ASSERT(setsize != 0);
3219 error = as_iset_default_lpsize(as,
3220 setaddr, setsize, rflags, rtype);
3221 if (error) {
3222 break;
3224 segvn = 0;
3227 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3228 ssize = seg->s_base + seg->s_size - raddr;
3229 } else {
3230 ssize = rsize;
3233 if (error == 0 && segvn) {
3234 /* The last chunk when rsize == 0. */
3235 ASSERT(setsize != 0);
3236 error = as_iset_default_lpsize(as, setaddr, setsize,
3237 rflags, rtype);
3240 if (error == IE_RETRY) {
3241 goto again;
3242 } else if (error == IE_NOMEM) {
3243 error = EAGAIN;
3244 } else if (error == ENOTSUP) {
3245 error = EINVAL;
3246 } else if (error == EAGAIN) {
3247 mutex_enter(&as->a_contents);
3248 if (!AS_ISNOUNMAPWAIT(as)) {
3249 if (AS_ISUNMAPWAIT(as) == 0) {
3250 cv_broadcast(&as->a_cv);
3252 AS_SETUNMAPWAIT(as);
3253 AS_LOCK_EXIT(as);
3254 while (AS_ISUNMAPWAIT(as)) {
3255 cv_wait(&as->a_cv, &as->a_contents);
3257 mutex_exit(&as->a_contents);
3258 AS_LOCK_ENTER(as, RW_WRITER);
3259 } else {
3261 * We may have raced with
3262 * segvn_reclaim()/segspt_reclaim(). In this case
3263 * clean nounmapwait flag and retry since softlockcnt
3264 * in this segment may be already 0. We don't drop as
3265 * writer lock so our number of retries without
3266 * sleeping should be very small. See segvn_reclaim()
3267 * for more comments.
3269 AS_CLRNOUNMAPWAIT(as);
3270 mutex_exit(&as->a_contents);
3272 goto again;
3275 as_setwatch(as);
3276 AS_LOCK_EXIT(as);
3277 return (error);
3281 * Set up all of the uninitialized watched pages that we can.
3283 void
3284 as_setwatch(struct as *as)
3286 struct watched_page *pwp;
3287 struct seg *seg;
3288 caddr_t vaddr;
3289 uint_t prot;
3290 int err, retrycnt;
3292 if (avl_numnodes(&as->a_wpage) == 0)
3293 return;
3295 ASSERT(AS_WRITE_HELD(as));
3297 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3298 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3299 retrycnt = 0;
3300 retry:
3301 vaddr = pwp->wp_vaddr;
3302 if (pwp->wp_oprot != 0 || /* already set up */
3303 (seg = as_segat(as, vaddr)) == NULL ||
3304 segop_getprot(seg, vaddr, 0, &prot) != 0)
3305 continue;
3307 pwp->wp_oprot = prot;
3308 if (pwp->wp_read)
3309 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3310 if (pwp->wp_write)
3311 prot &= ~PROT_WRITE;
3312 if (pwp->wp_exec)
3313 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3314 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3315 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3316 if (err == IE_RETRY) {
3317 pwp->wp_oprot = 0;
3318 ASSERT(retrycnt == 0);
3319 retrycnt++;
3320 goto retry;
3323 pwp->wp_prot = prot;
3328 * Clear all of the watched pages in the address space.
3330 void
3331 as_clearwatch(struct as *as)
3333 struct watched_page *pwp;
3334 struct seg *seg;
3335 caddr_t vaddr;
3336 uint_t prot;
3337 int err, retrycnt;
3339 if (avl_numnodes(&as->a_wpage) == 0)
3340 return;
3342 ASSERT(AS_WRITE_HELD(as));
3344 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3345 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3346 retrycnt = 0;
3347 retry:
3348 vaddr = pwp->wp_vaddr;
3349 if (pwp->wp_oprot == 0 || /* not set up */
3350 (seg = as_segat(as, vaddr)) == NULL)
3351 continue;
3353 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3354 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3355 if (err == IE_RETRY) {
3356 ASSERT(retrycnt == 0);
3357 retrycnt++;
3358 goto retry;
3361 pwp->wp_oprot = 0;
3362 pwp->wp_prot = 0;
3367 * Force a new setup for all the watched pages in the range.
3369 static void
3370 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3372 struct watched_page *pwp;
3373 struct watched_page tpw;
3374 caddr_t eaddr = addr + size;
3375 caddr_t vaddr;
3376 struct seg *seg;
3377 int err, retrycnt;
3378 uint_t wprot;
3379 avl_index_t where;
3381 if (avl_numnodes(&as->a_wpage) == 0)
3382 return;
3384 ASSERT(AS_WRITE_HELD(as));
3386 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3387 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3388 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3390 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3391 retrycnt = 0;
3392 vaddr = pwp->wp_vaddr;
3394 wprot = prot;
3395 if (pwp->wp_read)
3396 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3397 if (pwp->wp_write)
3398 wprot &= ~PROT_WRITE;
3399 if (pwp->wp_exec)
3400 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3401 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3402 retry:
3403 seg = as_segat(as, vaddr);
3404 if (seg == NULL) {
3405 panic("as_setwatchprot: no seg");
3406 /*NOTREACHED*/
3408 err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
3409 if (err == IE_RETRY) {
3410 ASSERT(retrycnt == 0);
3411 retrycnt++;
3412 goto retry;
3415 pwp->wp_oprot = prot;
3416 pwp->wp_prot = wprot;
3418 pwp = AVL_NEXT(&as->a_wpage, pwp);
3423 * Clear all of the watched pages in the range.
3425 static void
3426 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3428 caddr_t eaddr = addr + size;
3429 struct watched_page *pwp;
3430 struct watched_page tpw;
3431 uint_t prot;
3432 struct seg *seg;
3433 int err, retrycnt;
3434 avl_index_t where;
3436 if (avl_numnodes(&as->a_wpage) == 0)
3437 return;
3439 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3440 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3441 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3443 ASSERT(AS_WRITE_HELD(as));
3445 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3447 if ((prot = pwp->wp_oprot) != 0) {
3448 retrycnt = 0;
3450 if (prot != pwp->wp_prot) {
3451 retry:
3452 seg = as_segat(as, pwp->wp_vaddr);
3453 if (seg == NULL)
3454 continue;
3455 err = segop_setprot(seg, pwp->wp_vaddr,
3456 PAGESIZE, prot);
3457 if (err == IE_RETRY) {
3458 ASSERT(retrycnt == 0);
3459 retrycnt++;
3460 goto retry;
3464 pwp->wp_oprot = 0;
3465 pwp->wp_prot = 0;
3468 pwp = AVL_NEXT(&as->a_wpage, pwp);
3472 void
3473 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3475 struct proc *p;
3477 mutex_enter(&pidlock);
3478 for (p = practive; p; p = p->p_next) {
3479 if (p->p_as == as) {
3480 mutex_enter(&p->p_lock);
3481 if (p->p_as == as)
3482 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3483 mutex_exit(&p->p_lock);
3486 mutex_exit(&pidlock);
3490 * Return the memory object ID for the mapping at the given address.
3493 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3495 struct seg *seg;
3496 int sts;
3498 AS_LOCK_ENTER(as, RW_READER);
3499 seg = as_segat(as, addr);
3500 if (seg == NULL) {
3501 AS_LOCK_EXIT(as);
3502 return (EFAULT);
3505 sts = segop_getmemid(seg, addr, memidp);
3507 AS_LOCK_EXIT(as);
3508 return (sts);
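/*
 * Illustrative sketch, not part of this file: a hypothetical caller that
 * needs to identify the memory object backing a mapping:
 *
 *	memid_t memid;
 *
 *	if (as_getmemid(as, addr, &memid) == 0) {
 *		// "memid" now identifies the backing object for "addr"
 *	}
 */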