/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015, Joyent, Inc. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/page.h>
clock_t deadlk_wait = 1;	/* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);

/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
int do_as_verify = 0;
#endif
/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list. A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback	*current_head, *cb;
	caddr_t			saddr;
	size_t			rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */
	if (as == &kas)
		return (ENOMEM);

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	if (size != -1) {
		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
		    (size_t)saddr;
		/* check for wraparound */
		if (saddr + rsize < saddr)
			return (ENOMEM);
	} else {
		if (vaddr != 0)
			return (EINVAL);
		saddr = vaddr;
		rsize = size;
	}

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}
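/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): a driver that holds user pages locked long-term might
 * register for unmap events on the locked range roughly as follows.
 * The xx_* names and the cookie argument are hypothetical.
 *
 *	static void
 *	xx_unmap_cb(struct as *cbas, void *arg, uint_t events)
 *	{
 *		struct xx_cookie *cp = arg;
 *
 *		xx_unlock_pages(cp);
 *		(void) as_delete_callback(cbas, arg);
 *	}
 *
 *	error = as_add_callback(as, xx_unmap_cb, cp, AS_UNMAP_EVENT,
 *	    cp->xx_addr, cp->xx_len, KM_SLEEP);
 *
 * The callback is expected to call as_delete_callback() once it has made
 * it safe for the blocked operation (e.g. an as_unmap()) to proceed.
 */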
168 * Search the callback list for an entry which pertains to arg.
170 * This is called from within the client upon completion of the callback.
171 * RETURN VALUES:
172 * AS_CALLBACK_DELETED (callback entry found and deleted)
173 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175 * entry will be made in as_do_callbacks)
177 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 * set, it indicates that as_do_callbacks is processing this entry. The
179 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 * to unblock as_do_callbacks, in case it is blocked.
182 * CALLERS RESPONSIBILITY: If not calling from within the process context for
183 * the specified as, the caller must guarantee persistence of the specified as
184 * for the duration of this function (eg. pages being locked within the as
185 * will guarantee persistence).
187 uint_t
188 as_delete_callback(struct as *as, void *arg)
190 struct as_callback **prevcb = &as->a_callbacks;
191 struct as_callback *cb;
192 uint_t rc = AS_CALLBACK_NOTFOUND;
194 mutex_enter(&as->a_contents);
195 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 if (cb->ascb_arg != arg)
197 continue;
200 * If the events indicate AS_CALLBACK_CALLED, just clear
201 * AS_ALL_EVENT in the events field and wakeup the thread
202 * that may be waiting in as_do_callbacks. as_do_callbacks
203 * will take care of removing this entry from the list. In
204 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
205 * (AS_CALLBACK_CALLED not set), just remove it from the
206 * list, return the memory and return AS_CALLBACK_DELETED.
208 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 /* leave AS_CALLBACK_CALLED */
210 cb->ascb_events &= ~AS_ALL_EVENT;
211 rc = AS_CALLBACK_DELETE_DEFERRED;
212 cv_broadcast(&as->a_cv);
213 } else {
214 *prevcb = cb->ascb_next;
215 kmem_free(cb, sizeof (struct as_callback));
216 rc = AS_CALLBACK_DELETED;
218 break;
220 mutex_exit(&as->a_contents);
221 return (rc);
/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps, so it is ok to call it with locks held
 * beyond the (required) a_contents mutex.
 *
 * See also the comment on as_do_callbacks below.
 */
233 static struct as_callback *
234 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235 size_t event_len)
237 struct as_callback *cb;
239 ASSERT(MUTEX_HELD(&as->a_contents));
240 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
242 * If the callback has not already been called, then
243 * check if events or address range pertains. An event_len
244 * of zero means do an unconditional callback.
246 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 (event_addr + event_len < cb->ascb_saddr) ||
249 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 continue;
252 break;
254 return (cb);
258 * Executes a given callback and removes it from the callback list for
259 * this address space.
260 * This function may sleep so the caller must drop all locks except
261 * a_contents before calling this func.
263 * See also comments on as_do_callbacks below.
265 static void
266 as_execute_callback(struct as *as, struct as_callback *cb,
267 uint_t events)
269 struct as_callback **prevcb;
270 void *cb_arg;
272 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273 cb->ascb_events |= AS_CALLBACK_CALLED;
274 mutex_exit(&as->a_contents);
275 (*cb->ascb_func)(as, cb->ascb_arg, events);
276 mutex_enter(&as->a_contents);
278 * the callback function is required to delete the callback
279 * when the callback function determines it is OK for
280 * this thread to continue. as_delete_callback will clear
281 * the AS_ALL_EVENT in the events field when it is deleted.
282 * If the callback function called as_delete_callback,
283 * events will already be cleared and there will be no blocking.
285 while ((cb->ascb_events & events) != 0) {
286 cv_wait(&as->a_cv, &as->a_contents);
289 * This entry needs to be taken off the list. Normally, the
290 * callback func itself does that, but unfortunately the list
291 * may have changed while the callback was running because the
292 * a_contents mutex was dropped and someone else other than the
293 * callback func itself could have called as_delete_callback,
294 * so we have to search to find this entry again. The entry
295 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
297 cb_arg = cb->ascb_arg;
298 prevcb = &as->a_callbacks;
299 for (cb = as->a_callbacks; cb != NULL;
300 prevcb = &cb->ascb_next, cb = *prevcb) {
301 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302 (cb_arg != cb->ascb_arg)) {
303 continue;
305 *prevcb = cb->ascb_next;
306 kmem_free(cb, sizeof (struct as_callback));
307 break;
/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match, invoke the callback. Skip an entry if:
 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 * - the event is not of interest
 * - the address range is not of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event); only AS_CALLBACK_CALLED is checked. The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback, have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep, so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback, on the other hand, may sleep,
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end up comatose.
 */
333 static int
334 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335 size_t event_len)
337 struct as_callback *cb;
339 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 as_execute_callback(as, cb, events);
341 return (-1);
343 return (0);
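/*
 * Illustrative sketch (editorial addition): the canonical caller pattern,
 * mirroring as_free() below. Because as_do_callbacks() drops and
 * re-acquires a_contents around each invocation, a caller that must drain
 * every callback for an event loops until zero is returned:
 *
 *	mutex_enter(&as->a_contents);
 *	while (as->a_callbacks &&
 *	    as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 *		;
 *	mutex_exit(&as->a_contents);
 */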
347 * Search for the segment containing addr. If a segment containing addr
348 * exists, that segment is returned. If no such segment exists, and
349 * the list spans addresses greater than addr, then the first segment
350 * whose base is greater than addr is returned; otherwise, NULL is
351 * returned unless tail is true, in which case the last element of the
352 * list is returned.
354 * a_seglast is used to cache the last found segment for repeated
355 * searches to the same addr (which happens frequently).
357 struct seg *
358 as_findseg(struct as *as, caddr_t addr, int tail)
360 struct seg *seg = as->a_seglast;
361 avl_index_t where;
363 ASSERT(AS_LOCK_HELD(as));
365 if (seg != NULL &&
366 seg->s_base <= addr &&
367 addr < seg->s_base + seg->s_size)
368 return (seg);
370 seg = avl_find(&as->a_segtree, &addr, &where);
371 if (seg != NULL)
372 return (as->a_seglast = seg);
374 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 if (seg == NULL && tail)
376 seg = avl_last(&as->a_segtree);
377 return (as->a_seglast = seg);
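/*
 * Illustrative sketch (editorial addition): walking every segment that
 * overlaps [raddr, eaddr) starts with as_findseg() and then follows the
 * list in address order, with the address space lock held. For a
 * read-only walk:
 *
 *	for (seg = as_findseg(as, raddr, 0); seg != NULL;
 *	    seg = AS_SEGNEXT(as, seg)) {
 *		if (eaddr <= seg->s_base)
 *			break;
 *		... operate on the overlap of seg and [raddr, eaddr) ...
 *	}
 *
 * as_unmap() below uses the same walk but saves the next pointer first,
 * since the current segment may be destroyed by the unmap.
 */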
380 #ifdef VERIFY_SEGLIST
382 * verify that the linked list is coherent
384 static void
385 as_verify(struct as *as)
387 struct seg *seg, *seglast, *p, *n;
388 uint_t nsegs = 0;
390 if (do_as_verify == 0)
391 return;
393 seglast = as->a_seglast;
395 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396 ASSERT(seg->s_as == as);
397 p = AS_SEGPREV(as, seg);
398 n = AS_SEGNEXT(as, seg);
399 ASSERT(p == NULL || p->s_as == as);
400 ASSERT(p == NULL || p->s_base < seg->s_base);
401 ASSERT(n == NULL || n->s_base > seg->s_base);
402 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403 if (seg == seglast)
404 seglast = NULL;
405 nsegs++;
407 ASSERT(seglast == NULL);
408 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
410 #endif /* VERIFY_SEGLIST */
413 * Add a new segment to the address space. The avl_find()
414 * may be expensive so we attempt to use last segment accessed
415 * in as_gap() as an insertion point.
418 as_addseg(struct as *as, struct seg *newseg)
420 struct seg *seg;
421 caddr_t addr;
422 caddr_t eaddr;
423 avl_index_t where;
425 ASSERT(AS_WRITE_HELD(as));
427 as->a_updatedir = 1; /* inform /proc */
428 gethrestime(&as->a_updatetime);
430 if (as->a_lastgaphl != NULL) {
431 struct seg *hseg = NULL;
432 struct seg *lseg = NULL;
434 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 hseg = as->a_lastgaphl;
436 lseg = AVL_PREV(&as->a_segtree, hseg);
437 } else {
438 lseg = as->a_lastgaphl;
439 hseg = AVL_NEXT(&as->a_segtree, lseg);
442 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 hseg->s_base > newseg->s_base) {
444 avl_insert_here(&as->a_segtree, newseg, lseg,
445 AVL_AFTER);
446 as->a_lastgaphl = NULL;
447 as->a_seglast = newseg;
448 return (0);
450 as->a_lastgaphl = NULL;
453 addr = newseg->s_base;
454 eaddr = addr + newseg->s_size;
455 again:
457 seg = avl_find(&as->a_segtree, &addr, &where);
459 if (seg == NULL)
460 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
462 if (seg == NULL)
463 seg = avl_last(&as->a_segtree);
465 if (seg != NULL) {
466 caddr_t base = seg->s_base;
469 * If top of seg is below the requested address, then
470 * the insertion point is at the end of the linked list,
471 * and seg points to the tail of the list. Otherwise,
472 * the insertion point is immediately before seg.
474 if (base + seg->s_size > addr) {
475 if (addr >= base || eaddr > base) {
476 return (-1); /* overlapping segment */
480 as->a_seglast = newseg;
481 avl_insert(&as->a_segtree, newseg, where);
483 #ifdef VERIFY_SEGLIST
484 as_verify(as);
485 #endif
486 return (0);
489 struct seg *
490 as_removeseg(struct as *as, struct seg *seg)
492 avl_tree_t *t;
494 ASSERT(AS_WRITE_HELD(as));
496 as->a_updatedir = 1; /* inform /proc */
497 gethrestime(&as->a_updatetime);
499 if (seg == NULL)
500 return (NULL);
502 t = &as->a_segtree;
503 if (as->a_seglast == seg)
504 as->a_seglast = NULL;
505 as->a_lastgaphl = NULL;
508 * if this segment is at an address higher than
509 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
511 if (as->a_lastgap &&
512 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
513 as->a_lastgap = AVL_NEXT(t, seg);
516 * remove the segment from the seg tree
518 avl_remove(t, seg);
520 #ifdef VERIFY_SEGLIST
521 as_verify(as);
522 #endif
523 return (seg);
/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}
/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range. The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}
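/*
 * Illustrative sketch (editorial addition): a simplified outline of how
 * mapping code combines the range lock with hole selection and mapping;
 * the window bounds, minlen and crargs are hypothetical and error
 * handling is omitted:
 *
 *	as_rangelock(as);
 *	base = (caddr_t)PAGESIZE;
 *	len = (size_t)as->a_userlimit - PAGESIZE;
 *	if (as_gap(as, minlen, &base, &len, AH_LO, NULL) == 0)
 *		error = as_map(as, base, minlen, segvn_create, &crargs);
 *	as_rangeunlock(as);
 */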
/*
 * Compare segments (or just an address) by segment address range.
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}
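/*
 * Note (editorial addition): lookups elsewhere in this file pass a bare
 * address as the search key, e.g. avl_find(&as->a_segtree, &addr, &where).
 * That works with the comparator above only because it reads nothing but
 * s_base from its first argument, and s_base is assumed to be the first
 * member of struct seg (as in the illumos seg.h this file derives from).
 */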
void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}
/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}

void
as_init(void)
{
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
}
631 * Allocate and initialize an address space data structure.
632 * We call hat_alloc to allow any machine dependent
633 * information in the hat structure to be initialized.
635 struct as *
636 as_alloc(void)
638 struct as *as;
640 as = kmem_cache_alloc(as_cache, KM_SLEEP);
642 as->a_flags = 0;
643 as->a_vbits = 0;
644 as->a_hrm = NULL;
645 as->a_seglast = NULL;
646 as->a_size = 0;
647 as->a_resvsize = 0;
648 as->a_updatedir = 0;
649 gethrestime(&as->a_updatetime);
650 as->a_objectdir = NULL;
651 as->a_sizedir = 0;
652 as->a_userlimit = (caddr_t)USERLIMIT;
653 as->a_lastgap = NULL;
654 as->a_lastgaphl = NULL;
655 as->a_callbacks = NULL;
656 as->a_proc = NULL;
658 AS_LOCK_ENTER(as, RW_WRITER);
659 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
660 AS_LOCK_EXIT(as);
662 return (as);
666 * Free an address space data structure.
667 * Need to free the hat first and then
668 * all the segments on this as and finally
669 * the space for the as struct itself.
671 void
672 as_free(struct as *as)
674 struct hat *hat = as->a_hat;
675 struct seg *seg, *next;
676 boolean_t free_started = B_FALSE;
678 top:
680 * Invoke ALL callbacks. as_do_callbacks will do one callback
681 * per call, and not return (-1) until the callback has completed.
682 * When as_do_callbacks returns zero, all callbacks have completed.
684 mutex_enter(&as->a_contents);
685 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
688 mutex_exit(&as->a_contents);
689 AS_LOCK_ENTER(as, RW_WRITER);
691 if (!free_started) {
692 free_started = B_TRUE;
693 hat_free_start(hat);
695 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
696 int err;
698 next = AS_SEGNEXT(as, seg);
699 retry:
700 err = segop_unmap(seg, seg->s_base, seg->s_size);
701 if (err == EAGAIN) {
702 mutex_enter(&as->a_contents);
703 if (as->a_callbacks) {
704 AS_LOCK_EXIT(as);
705 } else if (!AS_ISNOUNMAPWAIT(as)) {
707 * Memory is currently locked. Wait for a
708 * cv_signal that it has been unlocked, then
709 * try the operation again.
711 if (AS_ISUNMAPWAIT(as) == 0)
712 cv_broadcast(&as->a_cv);
713 AS_SETUNMAPWAIT(as);
714 AS_LOCK_EXIT(as);
715 while (AS_ISUNMAPWAIT(as))
716 cv_wait(&as->a_cv, &as->a_contents);
717 } else {
719 * We may have raced with
720 * segvn_reclaim()/segspt_reclaim(). In this
721 * case clean nounmapwait flag and retry since
722 * softlockcnt in this segment may be already
723 * 0. We don't drop as writer lock so our
724 * number of retries without sleeping should
725 * be very small. See segvn_reclaim() for
726 * more comments.
728 AS_CLRNOUNMAPWAIT(as);
729 mutex_exit(&as->a_contents);
730 goto retry;
732 mutex_exit(&as->a_contents);
733 goto top;
734 } else {
736 * We do not expect any other error return at this
737 * time. This is similar to an ASSERT in seg_unmap()
739 ASSERT(err == 0);
742 hat_free_end(hat);
743 AS_LOCK_EXIT(as);
745 /* /proc stuff */
746 ASSERT(avl_numnodes(&as->a_wpage) == 0);
747 if (as->a_objectdir) {
748 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
749 as->a_objectdir = NULL;
750 as->a_sizedir = 0;
754 * Free the struct as back to kmem. Assert it has no segments.
756 ASSERT(avl_numnodes(&as->a_segtree) == 0);
757 kmem_cache_free(as_cache, as);
761 as_dup(struct as *as, struct proc *forkedproc)
763 struct as *newas;
764 struct seg *seg, *newseg;
765 size_t purgesize = 0;
766 int error;
768 AS_LOCK_ENTER(as, RW_WRITER);
769 as_clearwatch(as);
770 newas = as_alloc();
771 newas->a_userlimit = as->a_userlimit;
772 newas->a_proc = forkedproc;
774 AS_LOCK_ENTER(newas, RW_WRITER);
776 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
778 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
780 if (seg->s_flags & S_PURGE) {
781 purgesize += seg->s_size;
782 continue;
785 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
786 if (newseg == NULL) {
787 AS_LOCK_EXIT(newas);
788 as_setwatch(as);
789 AS_LOCK_EXIT(as);
790 as_free(newas);
791 return (-1);
793 if ((error = segop_dup(seg, newseg)) != 0) {
795 * We call seg_free() on the new seg
796 * because the segment is not set up
797 * completely; i.e. it has no ops.
799 as_setwatch(as);
800 AS_LOCK_EXIT(as);
801 seg_free(newseg);
802 AS_LOCK_EXIT(newas);
803 as_free(newas);
804 return (error);
806 newas->a_size += seg->s_size;
808 newas->a_resvsize = as->a_resvsize - purgesize;
810 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
812 AS_LOCK_EXIT(newas);
814 as_setwatch(as);
815 AS_LOCK_EXIT(as);
816 if (error != 0) {
817 as_free(newas);
818 return (error);
820 forkedproc->p_as = newas;
821 return (0);
825 * Handle a ``fault'' at addr for size bytes.
827 faultcode_t
828 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
829 enum fault_type type, enum seg_rw rw)
831 struct seg *seg;
832 caddr_t raddr; /* rounded down addr */
833 size_t rsize; /* rounded up size */
834 size_t ssize;
835 faultcode_t res = 0;
836 caddr_t addrsav;
837 struct seg *segsav;
838 int as_lock_held;
839 klwp_t *lwp = ttolwp(curthread);
843 retry:
845 * Indicate that the lwp is not to be stopped while waiting for a
846 * pagefault. This is to avoid deadlock while debugging a process
847 * via /proc over NFS (in particular).
849 if (lwp != NULL)
850 lwp->lwp_nostop++;
	/*
	 * The same length must be used when we softlock and softunlock.
	 * We don't support softunlocking lengths less than the original
	 * length when there is largepage support. See seg_dev.c for more
	 * comments.
	 */
858 switch (type) {
860 case F_SOFTLOCK:
861 CPU_STATS_ADD_K(vm, softlock, 1);
862 break;
864 case F_SOFTUNLOCK:
865 break;
867 case F_PROT:
868 CPU_STATS_ADD_K(vm, prot_fault, 1);
869 break;
871 case F_INVAL:
872 CPU_STATS_ENTER_K();
873 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
874 if (as == &kas)
875 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
876 CPU_STATS_EXIT_K();
877 break;
880 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
881 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
882 (size_t)raddr;
885 * XXX -- Don't grab the as lock for segkmap. We should grab it for
886 * correctness, but then we could be stuck holding this lock for
887 * a LONG time if the fault needs to be resolved on a slow
888 * filesystem, and then no-one will be able to exec new commands,
889 * as exec'ing requires the write lock on the as.
891 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
892 raddr + size < segkmap->s_base + segkmap->s_size) {
893 seg = segkmap;
894 as_lock_held = 0;
895 } else {
896 AS_LOCK_ENTER(as, RW_READER);
898 seg = as_segat(as, raddr);
899 if (seg == NULL) {
900 AS_LOCK_EXIT(as);
901 if (lwp != NULL)
902 lwp->lwp_nostop--;
903 return (FC_NOMAP);
906 as_lock_held = 1;
909 addrsav = raddr;
910 segsav = seg;
912 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
913 if (raddr >= seg->s_base + seg->s_size) {
914 seg = AS_SEGNEXT(as, seg);
915 if (seg == NULL || raddr != seg->s_base) {
916 res = FC_NOMAP;
917 break;
920 if (raddr + rsize > seg->s_base + seg->s_size)
921 ssize = seg->s_base + seg->s_size - raddr;
922 else
923 ssize = rsize;
925 res = segop_fault(hat, seg, raddr, ssize, type, rw);
926 if (res != 0)
927 break;
931 * If we were SOFTLOCKing and encountered a failure,
932 * we must SOFTUNLOCK the range we already did. (Maybe we
933 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
934 * right here...)
936 if (res != 0 && type == F_SOFTLOCK) {
937 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
938 if (addrsav >= seg->s_base + seg->s_size)
939 seg = AS_SEGNEXT(as, seg);
940 ASSERT(seg != NULL);
942 * Now call the fault routine again to perform the
943 * unlock using S_OTHER instead of the rw variable
944 * since we never got a chance to touch the pages.
946 if (raddr > seg->s_base + seg->s_size)
947 ssize = seg->s_base + seg->s_size - addrsav;
948 else
949 ssize = raddr - addrsav;
950 (void) segop_fault(hat, seg, addrsav, ssize,
951 F_SOFTUNLOCK, S_OTHER);
954 if (as_lock_held)
955 AS_LOCK_EXIT(as);
956 if (lwp != NULL)
957 lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault. Let's wait
	 * a bit also to let the deadlock-causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
968 if (FC_ERRNO(res) == EDEADLK) {
969 delay(deadlk_wait);
970 res = 0;
971 goto retry;
973 return (res);
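/*
 * Illustrative sketch (editorial addition): per the comment at the top of
 * as_fault(), F_SOFTLOCK and F_SOFTUNLOCK must be paired over the same
 * range. With hypothetical addr/len:
 *
 *	if (as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_READ) == 0) {
 *		... access the pages ...
 *		(void) as_fault(as->a_hat, as, addr, len,
 *		    F_SOFTUNLOCK, S_READ);
 *	}
 */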
979 * Asynchronous ``fault'' at addr for size bytes.
981 faultcode_t
982 as_faulta(struct as *as, caddr_t addr, size_t size)
984 struct seg *seg;
985 caddr_t raddr; /* rounded down addr */
986 size_t rsize; /* rounded up size */
987 faultcode_t res = 0;
988 klwp_t *lwp = ttolwp(curthread);
990 retry:
992 * Indicate that the lwp is not to be stopped while waiting
993 * for a pagefault. This is to avoid deadlock while debugging
994 * a process via /proc over NFS (in particular).
996 if (lwp != NULL)
997 lwp->lwp_nostop++;
999 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1000 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1001 (size_t)raddr;
1003 AS_LOCK_ENTER(as, RW_READER);
1004 seg = as_segat(as, raddr);
1005 if (seg == NULL) {
1006 AS_LOCK_EXIT(as);
1007 if (lwp != NULL)
1008 lwp->lwp_nostop--;
1009 return (FC_NOMAP);
1012 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1013 if (raddr >= seg->s_base + seg->s_size) {
1014 seg = AS_SEGNEXT(as, seg);
1015 if (seg == NULL || raddr != seg->s_base) {
1016 res = FC_NOMAP;
1017 break;
1020 res = segop_faulta(seg, raddr);
1021 if (res != 0)
1022 break;
1024 AS_LOCK_EXIT(as);
1025 if (lwp != NULL)
1026 lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault. Let's wait
	 * a bit also to let the deadlock-causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
1036 if (FC_ERRNO(res) == EDEADLK) {
1037 delay(deadlk_wait);
1038 res = 0;
1039 goto retry;
1041 return (res);
1045 * Set the virtual mapping for the interval from [addr : addr + size)
1046 * in address space `as' to have the specified protection.
1047 * It is ok for the range to cross over several segments,
1048 * as long as they are contiguous.
1051 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1053 struct seg *seg;
1054 struct as_callback *cb;
1055 size_t ssize;
1056 caddr_t raddr; /* rounded down addr */
1057 size_t rsize; /* rounded up size */
1058 int error = 0, writer = 0;
1059 caddr_t saveraddr;
1060 size_t saversize;
1062 setprot_top:
1063 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1064 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1065 (size_t)raddr;
1067 if (raddr + rsize < raddr) /* check for wraparound */
1068 return (ENOMEM);
1070 saveraddr = raddr;
1071 saversize = rsize;
	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * we want to only lock as a writer when necessary.
	 */
1083 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1084 AS_LOCK_ENTER(as, RW_WRITER);
1085 } else {
1086 AS_LOCK_ENTER(as, RW_READER);
1089 as_clearwatchprot(as, raddr, rsize);
1090 seg = as_segat(as, raddr);
1091 if (seg == NULL) {
1092 as_setwatch(as);
1093 AS_LOCK_EXIT(as);
1094 return (ENOMEM);
1097 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1098 if (raddr >= seg->s_base + seg->s_size) {
1099 seg = AS_SEGNEXT(as, seg);
1100 if (seg == NULL || raddr != seg->s_base) {
1101 error = ENOMEM;
1102 break;
1105 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1106 ssize = seg->s_base + seg->s_size - raddr;
1107 else
1108 ssize = rsize;
1109 retry:
1110 error = segop_setprot(seg, raddr, ssize, prot);
1112 if (error == IE_NOMEM) {
1113 error = EAGAIN;
1114 break;
1117 if (error == IE_RETRY) {
1118 AS_LOCK_EXIT(as);
1119 writer = 1;
1120 goto setprot_top;
1123 if (error == EAGAIN) {
1125 * Make sure we have a_lock as writer.
1127 if (writer == 0) {
1128 AS_LOCK_EXIT(as);
1129 writer = 1;
1130 goto setprot_top;
1134 * Memory is currently locked. It must be unlocked
1135 * before this operation can succeed through a retry.
1136 * The possible reasons for locked memory and
1137 * corresponding strategies for unlocking are:
1138 * (1) Normal I/O
1139 * wait for a signal that the I/O operation
1140 * has completed and the memory is unlocked.
1141 * (2) Asynchronous I/O
1142 * The aio subsystem does not unlock pages when
1143 * the I/O is completed. Those pages are unlocked
1144 * when the application calls aiowait/aioerror.
1145 * So, to prevent blocking forever, cv_broadcast()
1146 * is done to wake up aio_cleanup_thread.
1147 * Subsequently, segvn_reclaim will be called, and
1148 * that will do AS_CLRUNMAPWAIT() and wake us up.
1149 * (3) Long term page locking:
1150 * Drivers intending to have pages locked for a
1151 * period considerably longer than for normal I/O
1152 * (essentially forever) may have registered for a
1153 * callback so they may unlock these pages on
1154 * request. This is needed to allow this operation
1155 * to succeed. Each entry on the callback list is
1156 * examined. If the event or address range pertains
1157 * the callback is invoked (unless it already is in
1158 * progress). The a_contents lock must be dropped
1159 * before the callback, so only one callback can
1160 * be done at a time. Go to the top and do more
1161 * until zero is returned. If zero is returned,
1162 * either there were no callbacks for this event
1163 * or they were already in progress.
1165 mutex_enter(&as->a_contents);
1166 if (as->a_callbacks &&
1167 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1168 seg->s_base, seg->s_size))) {
1169 AS_LOCK_EXIT(as);
1170 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1171 } else if (!AS_ISNOUNMAPWAIT(as)) {
1172 if (AS_ISUNMAPWAIT(as) == 0)
1173 cv_broadcast(&as->a_cv);
1174 AS_SETUNMAPWAIT(as);
1175 AS_LOCK_EXIT(as);
1176 while (AS_ISUNMAPWAIT(as))
1177 cv_wait(&as->a_cv, &as->a_contents);
1178 } else {
1180 * We may have raced with
1181 * segvn_reclaim()/segspt_reclaim(). In this
1182 * case clean nounmapwait flag and retry since
1183 * softlockcnt in this segment may be already
1184 * 0. We don't drop as writer lock so our
1185 * number of retries without sleeping should
1186 * be very small. See segvn_reclaim() for
1187 * more comments.
1189 AS_CLRNOUNMAPWAIT(as);
1190 mutex_exit(&as->a_contents);
1191 goto retry;
1193 mutex_exit(&as->a_contents);
1194 goto setprot_top;
1195 } else if (error != 0)
1196 break;
1198 if (error != 0) {
1199 as_setwatch(as);
1200 } else {
1201 as_setwatchprot(as, saveraddr, saversize, prot);
1203 AS_LOCK_EXIT(as);
1204 return (error);
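/*
 * Illustrative note (editorial addition): this is the mprotect()-style
 * entry point; addr and size are rounded to page boundaries above, so a
 * caller only needs page-aligned intent, e.g. (hypothetical addr/len):
 *
 *	error = as_setprot(as, addr, len, PROT_READ);
 *
 * On a non-zero return, protections already applied to earlier segments
 * in the range are not rolled back.
 */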
1208 * Check to make sure that the interval [addr, addr + size)
1209 * in address space `as' has at least the specified protection.
1210 * It is ok for the range to cross over several segments, as long
1211 * as they are contiguous.
1214 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1216 struct seg *seg;
1217 size_t ssize;
1218 caddr_t raddr; /* rounded down addr */
1219 size_t rsize; /* rounded up size */
1220 int error = 0;
1222 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1223 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1224 (size_t)raddr;
1226 if (raddr + rsize < raddr) /* check for wraparound */
1227 return (ENOMEM);
1230 * This is ugly as sin...
1231 * Normally, we only acquire the address space readers lock.
1232 * However, if the address space has watchpoints present,
1233 * we must acquire the writer lock on the address space for
1234 * the benefit of as_clearwatchprot() and as_setwatchprot().
1236 if (avl_numnodes(&as->a_wpage) != 0)
1237 AS_LOCK_ENTER(as, RW_WRITER);
1238 else
1239 AS_LOCK_ENTER(as, RW_READER);
1240 as_clearwatchprot(as, raddr, rsize);
1241 seg = as_segat(as, raddr);
1242 if (seg == NULL) {
1243 as_setwatch(as);
1244 AS_LOCK_EXIT(as);
1245 return (ENOMEM);
1248 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1249 if (raddr >= seg->s_base + seg->s_size) {
1250 seg = AS_SEGNEXT(as, seg);
1251 if (seg == NULL || raddr != seg->s_base) {
1252 error = ENOMEM;
1253 break;
1256 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1257 ssize = seg->s_base + seg->s_size - raddr;
1258 else
1259 ssize = rsize;
1261 error = segop_checkprot(seg, raddr, ssize, prot);
1262 if (error != 0)
1263 break;
1265 as_setwatch(as);
1266 AS_LOCK_EXIT(as);
1267 return (error);
1271 as_unmap(struct as *as, caddr_t addr, size_t size)
1273 struct seg *seg, *seg_next;
1274 struct as_callback *cb;
1275 caddr_t raddr, eaddr;
1276 size_t ssize, rsize = 0;
1277 int err;
1279 top:
1280 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1281 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1282 (uintptr_t)PAGEMASK);
1284 AS_LOCK_ENTER(as, RW_WRITER);
1286 as->a_updatedir = 1; /* inform /proc */
1287 gethrestime(&as->a_updatetime);
1290 * Use as_findseg to find the first segment in the range, then
1291 * step through the segments in order, following s_next.
1293 as_clearwatchprot(as, raddr, eaddr - raddr);
1295 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1296 if (eaddr <= seg->s_base)
1297 break; /* eaddr was in a gap; all done */
1299 /* this is implied by the test above */
1300 ASSERT(raddr < eaddr);
1302 if (raddr < seg->s_base)
1303 raddr = seg->s_base; /* raddr was in a gap */
1305 if (eaddr > (seg->s_base + seg->s_size))
1306 ssize = seg->s_base + seg->s_size - raddr;
1307 else
1308 ssize = eaddr - raddr;
1311 * Save next segment pointer since seg can be
1312 * destroyed during the segment unmap operation.
1314 seg_next = AS_SEGNEXT(as, seg);
1317 * We didn't count /dev/null mappings, so ignore them here.
1318 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1319 * we have to do this check here while we have seg.)
1321 rsize = 0;
1322 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1323 !SEG_IS_PARTIAL_RESV(seg))
1324 rsize = ssize;
1326 retry:
1327 err = segop_unmap(seg, raddr, ssize);
1328 if (err == EAGAIN) {
1330 * Memory is currently locked. It must be unlocked
1331 * before this operation can succeed through a retry.
1332 * The possible reasons for locked memory and
1333 * corresponding strategies for unlocking are:
1334 * (1) Normal I/O
1335 * wait for a signal that the I/O operation
1336 * has completed and the memory is unlocked.
1337 * (2) Asynchronous I/O
1338 * The aio subsystem does not unlock pages when
1339 * the I/O is completed. Those pages are unlocked
1340 * when the application calls aiowait/aioerror.
1341 * So, to prevent blocking forever, cv_broadcast()
1342 * is done to wake up aio_cleanup_thread.
1343 * Subsequently, segvn_reclaim will be called, and
1344 * that will do AS_CLRUNMAPWAIT() and wake us up.
1345 * (3) Long term page locking:
1346 * Drivers intending to have pages locked for a
1347 * period considerably longer than for normal I/O
1348 * (essentially forever) may have registered for a
1349 * callback so they may unlock these pages on
1350 * request. This is needed to allow this operation
1351 * to succeed. Each entry on the callback list is
1352 * examined. If the event or address range pertains
1353 * the callback is invoked (unless it already is in
1354 * progress). The a_contents lock must be dropped
1355 * before the callback, so only one callback can
1356 * be done at a time. Go to the top and do more
1357 * until zero is returned. If zero is returned,
1358 * either there were no callbacks for this event
1359 * or they were already in progress.
1361 mutex_enter(&as->a_contents);
1362 if (as->a_callbacks &&
1363 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1364 seg->s_base, seg->s_size))) {
1365 AS_LOCK_EXIT(as);
1366 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1367 } else if (!AS_ISNOUNMAPWAIT(as)) {
1368 if (AS_ISUNMAPWAIT(as) == 0)
1369 cv_broadcast(&as->a_cv);
1370 AS_SETUNMAPWAIT(as);
1371 AS_LOCK_EXIT(as);
1372 while (AS_ISUNMAPWAIT(as))
1373 cv_wait(&as->a_cv, &as->a_contents);
1374 } else {
1376 * We may have raced with
1377 * segvn_reclaim()/segspt_reclaim(). In this
1378 * case clean nounmapwait flag and retry since
1379 * softlockcnt in this segment may be already
1380 * 0. We don't drop as writer lock so our
1381 * number of retries without sleeping should
1382 * be very small. See segvn_reclaim() for
1383 * more comments.
1385 AS_CLRNOUNMAPWAIT(as);
1386 mutex_exit(&as->a_contents);
1387 goto retry;
1389 mutex_exit(&as->a_contents);
1390 goto top;
1391 } else if (err == IE_RETRY) {
1392 AS_LOCK_EXIT(as);
1393 goto top;
1394 } else if (err) {
1395 as_setwatch(as);
1396 AS_LOCK_EXIT(as);
1397 return (-1);
1400 as->a_size -= ssize;
1401 if (rsize)
1402 as->a_resvsize -= rsize;
1403 raddr += ssize;
1405 AS_LOCK_EXIT(as);
1406 return (0);
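/*
 * Illustrative note (editorial addition): as_unmap() is the munmap()-style
 * path; the range is rounded outward to page boundaries above, EAGAIN and
 * IE_RETRY from the segment driver are retried internally, and any other
 * failure returns -1. For example (hypothetical addr/len):
 *
 *	(void) as_unmap(as, addr, len);
 */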
1409 static int
1410 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1411 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1413 uint_t szc;
1414 uint_t nszc;
1415 int error;
1416 caddr_t a;
1417 caddr_t eaddr;
1418 size_t segsize;
1419 struct seg *seg;
1420 size_t pgsz;
1421 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1422 uint_t save_szcvec;
1424 ASSERT(AS_WRITE_HELD(as));
1425 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1426 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1427 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1428 if (!do_off) {
1429 vn_a->offset = 0;
1432 if (szcvec <= 1) {
1433 seg = seg_alloc(as, addr, size);
1434 if (seg == NULL) {
1435 return (ENOMEM);
1437 vn_a->szc = 0;
1438 error = (*crfp)(seg, vn_a);
1439 if (error != 0) {
1440 seg_free(seg);
1441 } else {
1442 as->a_size += size;
1443 as->a_resvsize += size;
1445 return (error);
1448 eaddr = addr + size;
1449 save_szcvec = szcvec;
1450 szcvec >>= 1;
1451 szc = 0;
1452 nszc = 0;
1453 while (szcvec) {
1454 if ((szcvec & 0x1) == 0) {
1455 nszc++;
1456 szcvec >>= 1;
1457 continue;
1459 nszc++;
1460 pgsz = page_get_pagesize(nszc);
1461 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1462 if (a != addr) {
1463 ASSERT(a < eaddr);
1464 segsize = a - addr;
1465 seg = seg_alloc(as, addr, segsize);
1466 if (seg == NULL) {
1467 return (ENOMEM);
1469 vn_a->szc = szc;
1470 error = (*crfp)(seg, vn_a);
1471 if (error != 0) {
1472 seg_free(seg);
1473 return (error);
1475 as->a_size += segsize;
1476 as->a_resvsize += segsize;
1477 *segcreated = 1;
1478 if (do_off) {
1479 vn_a->offset += segsize;
1481 addr = a;
1483 szc = nszc;
1484 szcvec >>= 1;
1487 ASSERT(addr < eaddr);
1488 szcvec = save_szcvec | 1; /* add 8K pages */
1489 while (szcvec) {
1490 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1491 ASSERT(a >= addr);
1492 if (a != addr) {
1493 segsize = a - addr;
1494 seg = seg_alloc(as, addr, segsize);
1495 if (seg == NULL) {
1496 return (ENOMEM);
1498 vn_a->szc = szc;
1499 error = (*crfp)(seg, vn_a);
1500 if (error != 0) {
1501 seg_free(seg);
1502 return (error);
1504 as->a_size += segsize;
1505 as->a_resvsize += segsize;
1506 *segcreated = 1;
1507 if (do_off) {
1508 vn_a->offset += segsize;
1510 addr = a;
1512 szcvec &= ~(1 << szc);
1513 if (szcvec) {
1514 szc = highbit(szcvec) - 1;
1515 pgsz = page_get_pagesize(szc);
1518 ASSERT(addr == eaddr);
1520 return (0);
1523 static int
1524 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1525 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1527 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1528 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1529 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1530 type, 0);
1531 int error;
1532 struct seg *seg;
1533 struct vattr va;
1534 uoff_t eoff;
1535 size_t save_size = 0;
1536 extern size_t textrepl_size_thresh;
1538 ASSERT(AS_WRITE_HELD(as));
1539 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1540 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1541 ASSERT(vn_a->vp != NULL);
1542 ASSERT(vn_a->amp == NULL);
1544 again:
1545 if (szcvec <= 1) {
1546 seg = seg_alloc(as, addr, size);
1547 if (seg == NULL) {
1548 return (ENOMEM);
1550 vn_a->szc = 0;
1551 error = (*crfp)(seg, vn_a);
1552 if (error != 0) {
1553 seg_free(seg);
1554 } else {
1555 as->a_size += size;
1556 as->a_resvsize += size;
1558 return (error);
1561 va.va_mask = AT_SIZE;
1562 if (fop_getattr(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1563 szcvec = 0;
1564 goto again;
1566 eoff = vn_a->offset & PAGEMASK;
1567 if (eoff >= va.va_size) {
1568 szcvec = 0;
1569 goto again;
1571 eoff += size;
1572 if (btopr(va.va_size) < btopr(eoff)) {
1573 save_size = size;
1574 size = va.va_size - (vn_a->offset & PAGEMASK);
1575 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1576 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1577 type, 0);
1578 if (szcvec <= 1) {
1579 size = save_size;
1580 goto again;
1584 if (size > textrepl_size_thresh) {
1585 vn_a->flags |= _MAP_TEXTREPL;
1587 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1588 segcreated);
1589 if (error != 0) {
1590 return (error);
1592 if (save_size) {
1593 addr += size;
1594 size = save_size - size;
1595 szcvec = 0;
1596 goto again;
1598 return (0);
1602 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1603 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1605 static int
1606 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1607 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1609 uint_t szcvec;
1610 uchar_t type;
1612 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1613 if (vn_a->type == MAP_SHARED) {
1614 type = MAPPGSZC_SHM;
1615 } else if (vn_a->type == MAP_PRIVATE) {
1616 if (vn_a->szc == AS_MAP_HEAP) {
1617 type = MAPPGSZC_HEAP;
1618 } else if (vn_a->szc == AS_MAP_STACK) {
1619 type = MAPPGSZC_STACK;
1620 } else {
1621 type = MAPPGSZC_PRIVM;
1624 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1625 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1626 (vn_a->flags & MAP_TEXT), type, 0);
1627 ASSERT(AS_WRITE_HELD(as));
1628 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1629 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1630 ASSERT(vn_a->vp == NULL);
1632 return (as_map_segvn_segs(as, addr, size, szcvec,
1633 crfp, vn_a, segcreated));
1637 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1639 AS_LOCK_ENTER(as, RW_WRITER);
1640 return (as_map_locked(as, addr, size, crfp, argsp));
1644 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1645 void *argsp)
1647 struct seg *seg = NULL;
1648 caddr_t raddr; /* rounded down addr */
1649 size_t rsize; /* rounded up size */
1650 int error;
1651 int unmap = 0;
	/*
	 * The use of a_proc is preferred to handle the case where curproc is
	 * a door_call server and is allocating memory in the client's (a_proc)
	 * address space.
	 * When creating a shared memory segment, a_proc will be NULL, so we
	 * fall back to curproc in that case.
	 */
1659 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1660 struct segvn_crargs crargs;
1662 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1663 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1664 (size_t)raddr;
1667 * check for wrap around
1669 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1670 AS_LOCK_EXIT(as);
1671 return (ENOMEM);
1674 as->a_updatedir = 1; /* inform /proc */
1675 gethrestime(&as->a_updatetime);
1677 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1678 AS_LOCK_EXIT(as);
1680 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1681 RCA_UNSAFE_ALL);
1683 return (ENOMEM);
1686 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1687 crargs = *(struct segvn_crargs *)argsp;
1688 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1689 if (error != 0) {
1690 AS_LOCK_EXIT(as);
1691 if (unmap) {
1692 (void) as_unmap(as, addr, size);
1694 return (error);
1696 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1697 crargs = *(struct segvn_crargs *)argsp;
1698 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1699 if (error != 0) {
1700 AS_LOCK_EXIT(as);
1701 if (unmap) {
1702 (void) as_unmap(as, addr, size);
1704 return (error);
1706 } else {
1707 seg = seg_alloc(as, addr, size);
1708 if (seg == NULL) {
1709 AS_LOCK_EXIT(as);
1710 return (ENOMEM);
1713 error = (*crfp)(seg, argsp);
1714 if (error != 0) {
1715 seg_free(seg);
1716 AS_LOCK_EXIT(as);
1717 return (error);
1720 * Add size now so as_unmap will work if as_ctl fails.
1722 as->a_size += rsize;
1723 as->a_resvsize += rsize;
1726 as_setwatch(as);
1729 * If the address space is locked,
1730 * establish memory locks for the new segment.
1732 mutex_enter(&as->a_contents);
1733 if (AS_ISPGLCK(as)) {
1734 mutex_exit(&as->a_contents);
1735 AS_LOCK_EXIT(as);
1736 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1737 if (error != 0)
1738 (void) as_unmap(as, addr, size);
1739 } else {
1740 mutex_exit(&as->a_contents);
1741 AS_LOCK_EXIT(as);
1743 return (error);
1748 * Delete all segments in the address space marked with S_PURGE.
1749 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1750 * These segments are deleted as a first step before calls to as_gap(), so
1751 * that they don't affect mmap() or shmat().
1753 void
1754 as_purge(struct as *as)
1756 struct seg *seg;
1757 struct seg *next_seg;
	/*
	 * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
	 * there is no need to grab the a_contents mutex for this check.
	 */
1763 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1764 return;
1766 AS_LOCK_ENTER(as, RW_WRITER);
1767 next_seg = NULL;
1768 seg = AS_SEGFIRST(as);
1769 while (seg != NULL) {
1770 next_seg = AS_SEGNEXT(as, seg);
1771 if (seg->s_flags & S_PURGE)
1772 (void) segop_unmap(seg, seg->s_base, seg->s_size);
1773 seg = next_seg;
1775 AS_LOCK_EXIT(as);
1777 mutex_enter(&as->a_contents);
1778 as->a_flags &= ~AS_NEEDSPURGE;
1779 mutex_exit(&as->a_contents);
/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range. Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range. We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part
 * of the hole that is within range, and 0 is returned. On failure, -1 is
 * returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
1804 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1805 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1807 caddr_t lobound = *basep;
1808 caddr_t hibound = lobound + *lenp;
1809 struct seg *lseg, *hseg;
1810 caddr_t lo, hi;
1811 int forward;
1812 caddr_t save_base;
1813 size_t save_len;
1814 size_t save_minlen;
1815 size_t save_redzone;
1816 int fast_path = 1;
1818 save_base = *basep;
1819 save_len = *lenp;
1820 save_minlen = minlen;
1821 save_redzone = redzone;
1824 * For the first pass/fast_path, just add align and redzone into
1825 * minlen since if we get an allocation, we can guarantee that it
1826 * will fit the alignment and redzone requested.
1827 * This increases the chance that hibound will be adjusted to
1828 * a_lastgap->s_base which will likely allow us to find an
1829 * acceptable hole in the address space quicker.
1830 * If we can't find a hole with this fast_path, then we look for
1831 * smaller holes in which the alignment and offset may allow
1832 * the allocation to fit.
1834 minlen += align;
1835 minlen += 2 * redzone;
1836 redzone = 0;
1838 AS_LOCK_ENTER(as, RW_READER);
1839 if (AS_SEGFIRST(as) == NULL) {
1840 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1841 align, redzone, off)) {
1842 AS_LOCK_EXIT(as);
1843 return (0);
1844 } else {
1845 AS_LOCK_EXIT(as);
1846 *basep = save_base;
1847 *lenp = save_len;
1848 return (-1);
1852 retry:
1854 * Set up to iterate over all the inter-segment holes in the given
1855 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1856 * NULL for the highest-addressed hole. If moving backwards, we reset
1857 * sseg to denote the highest-addressed segment.
1859 forward = (flags & AH_DIR) == AH_LO;
1860 if (forward) {
1861 hseg = as_findseg(as, lobound, 1);
1862 lseg = AS_SEGPREV(as, hseg);
1863 } else {
1866 * If allocating at least as much as the last allocation,
1867 * use a_lastgap's base as a better estimate of hibound.
1869 if (as->a_lastgap &&
1870 minlen >= as->a_lastgap->s_size &&
1871 hibound >= as->a_lastgap->s_base)
1872 hibound = as->a_lastgap->s_base;
1874 hseg = as_findseg(as, hibound, 1);
1875 if (hseg->s_base + hseg->s_size < hibound) {
1876 lseg = hseg;
1877 hseg = NULL;
1878 } else {
1879 lseg = AS_SEGPREV(as, hseg);
1883 for (;;) {
1885 * Set lo and hi to the hole's boundaries. (We should really
1886 * use MAXADDR in place of hibound in the expression below,
1887 * but can't express it easily; using hibound in its place is
1888 * harmless.)
1890 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1891 hi = (hseg == NULL) ? hibound : hseg->s_base;
1893 * If the iteration has moved past the interval from lobound
1894 * to hibound it's pointless to continue.
1896 if ((forward && lo > hibound) || (!forward && hi < lobound))
1897 break;
1898 else if (lo > hibound || hi < lobound)
1899 goto cont;
1901 * Candidate hole lies at least partially within the allowable
1902 * range. Restrict it to fall completely within that range,
1903 * i.e., to [max(lo, lobound), min(hi, hibound)].
1905 if (lo < lobound)
1906 lo = lobound;
1907 if (hi > hibound)
1908 hi = hibound;
1910 * Verify that the candidate hole is big enough and meets
1911 * hardware constraints. If the hole is too small, no need
1912 * to do the further checks since they will fail.
1914 *basep = lo;
1915 *lenp = hi - lo;
1916 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1917 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1918 ((flags & AH_CONTAIN) == 0 ||
1919 (*basep <= addr && *basep + *lenp > addr))) {
1920 if (!forward)
1921 as->a_lastgap = hseg;
1922 if (hseg != NULL)
1923 as->a_lastgaphl = hseg;
1924 else
1925 as->a_lastgaphl = lseg;
1926 AS_LOCK_EXIT(as);
1927 return (0);
1929 cont:
1931 * Move to the next hole.
1933 if (forward) {
1934 lseg = hseg;
1935 if (lseg == NULL)
1936 break;
1937 hseg = AS_SEGNEXT(as, hseg);
1938 } else {
1939 hseg = lseg;
1940 if (hseg == NULL)
1941 break;
1942 lseg = AS_SEGPREV(as, lseg);
1945 if (fast_path && (align != 0 || save_redzone != 0)) {
1946 fast_path = 0;
1947 minlen = save_minlen;
1948 redzone = save_redzone;
1949 goto retry;
1951 *basep = save_base;
1952 *lenp = save_len;
1953 AS_LOCK_EXIT(as);
1954 return (-1);
1958 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1960 * If flags specifies AH_HI, the hole will have the highest possible address
1961 * in the range. We use the as->a_lastgap field to figure out where to
1962 * start looking for a gap.
1964 * Otherwise, the gap will have the lowest possible address.
1966 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1968 * If an adequate hole is found, base and len are set to reflect the part of
1969 * the hole that is within range, and 0 is returned, otherwise,
1970 * -1 is returned.
1972 * NOTE: This routine is not correct when base+len overflows caddr_t.
1975 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1976 caddr_t addr)
1979 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
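/*
 * Illustrative sketch (editorial addition) matching the example in the
 * block comment above as_gap_aligned(): ask for a hole whose base is 16K
 * into a 4M page, searching downward; base/len/minlen are hypothetical:
 *
 *	base = (caddr_t)PAGESIZE;
 *	len = (size_t)as->a_userlimit - PAGESIZE;
 *	if (as_gap_aligned(as, minlen, &base, &len, AH_HI, NULL,
 *	    4 * 1024 * 1024, 0, 16 * 1024) == 0) {
 *		... base/len now describe the usable part of the hole ...
 *	}
 */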
1983 * Return the next range within [base, base + len) that is backed
1984 * with "real memory". Skip holes and non-seg_vn segments.
1985 * We're lazy and only return one segment at a time.
1988 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
1990 extern const struct seg_ops segspt_shmops; /* needs a header file */
1991 struct seg *seg;
1992 caddr_t addr, eaddr;
1993 caddr_t segend;
1995 AS_LOCK_ENTER(as, RW_READER);
1997 addr = *basep;
1998 eaddr = addr + *lenp;
2000 seg = as_findseg(as, addr, 0);
2001 if (seg != NULL)
2002 addr = MAX(seg->s_base, addr);
2004 for (;;) {
2005 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2006 AS_LOCK_EXIT(as);
2007 return (EINVAL);
2010 if (seg->s_ops == &segvn_ops) {
2011 segend = seg->s_base + seg->s_size;
2012 break;
2016 * We do ISM by looking into the private data
2017 * to determine the real size of the segment.
2019 if (seg->s_ops == &segspt_shmops) {
2020 segend = seg->s_base + spt_realsize(seg);
2021 if (addr < segend)
2022 break;
2025 seg = AS_SEGNEXT(as, seg);
2027 if (seg != NULL)
2028 addr = seg->s_base;
2031 *basep = addr;
2033 if (segend > eaddr)
2034 *lenp = eaddr - addr;
2035 else
2036 *lenp = segend - addr;
2038 AS_LOCK_EXIT(as);
2039 return (0);
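/*
 * Illustrative sketch (editorial addition): since as_memory() returns only
 * one backed range per call, callers walk a region by advancing past each
 * chunk (start/end are hypothetical):
 *
 *	base = start;
 *	len = end - start;
 *	while (as_memory(as, &base, &len) == 0) {
 *		... base/len describe one seg_vn/ISM-backed chunk ...
 *		base += len;
 *		if ((len = end - base) == 0)
 *			break;
 *	}
 */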
2043 * Determine whether data from the mappings in interval [addr, addr + size)
2044 * are in the primary memory (core) cache.
2047 as_incore(struct as *as, caddr_t addr,
2048 size_t size, char *vec, size_t *sizep)
2050 struct seg *seg;
2051 size_t ssize;
2052 caddr_t raddr; /* rounded down addr */
2053 size_t rsize; /* rounded up size */
2054 size_t isize; /* iteration size */
2055 int error = 0; /* result, assume success */
2057 *sizep = 0;
2058 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2059 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2060 (size_t)raddr;
2062 if (raddr + rsize < raddr) /* check for wraparound */
2063 return (ENOMEM);
2065 AS_LOCK_ENTER(as, RW_READER);
2066 seg = as_segat(as, raddr);
2067 if (seg == NULL) {
2068 AS_LOCK_EXIT(as);
2069 return (-1);
2072 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2073 if (raddr >= seg->s_base + seg->s_size) {
2074 seg = AS_SEGNEXT(as, seg);
2075 if (seg == NULL || raddr != seg->s_base) {
2076 error = -1;
2077 break;
2080 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2081 ssize = seg->s_base + seg->s_size - raddr;
2082 else
2083 ssize = rsize;
2084 *sizep += isize = segop_incore(seg, raddr, ssize, vec);
2085 if (isize != ssize) {
2086 error = -1;
2087 break;
2089 vec += btopr(ssize);
2091 AS_LOCK_EXIT(as);
2092 return (error);
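/*
 * Undo a (possibly partial) MC_LOCK on a single segment: walk the runs of
 * set bits in "bitmap" (which records the pages that were successfully
 * locked, starting at bit "position" for "npages" pages) and issue an
 * MC_UNLOCK lockop for each corresponding address range.
 */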
2095 static void
2096 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2097 ulong_t *bitmap, size_t position, size_t npages)
2099 caddr_t range_start;
2100 size_t pos1 = position;
2101 size_t pos2;
2102 size_t size;
2103 size_t end_pos = npages + position;
2105 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2106 size = ptob((pos2 - pos1));
2107 range_start = (caddr_t)((uintptr_t)addr +
2108 ptob(pos1 - position));
2110 (void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
2111 NULL, 0);
2112 pos1 = pos2;
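/*
 * Back out a partially completed MC_LOCK request: walk the segments
 * covering [raddr, raddr + rsize) and, via as_segunlock(), unlock whatever
 * pages the failed lock operation had already locked, as recorded in
 * mlock_map.
 */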
2116 static void
2117 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2118 caddr_t raddr, size_t rsize)
2120 struct seg *seg = as_segat(as, raddr);
2121 size_t ssize;
2123 while (rsize != 0) {
2124 if (raddr >= seg->s_base + seg->s_size)
2125 seg = AS_SEGNEXT(as, seg);
2127 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2128 ssize = seg->s_base + seg->s_size - raddr;
2129 else
2130 ssize = rsize;
2132 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2134 rsize -= ssize;
2135 raddr += ssize;
2140 * Cache control operations over the interval [addr, addr + size) in
2141 * address space "as".
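/*
 * The "func" argument is a memcntl(2) sub-command.  MC_LOCKAS and
 * MC_UNLOCKAS are handled up front and walk every segment in the address
 * space; MC_SYNC, MC_LOCK, MC_UNLOCK, MC_ADVISE and MC_INHERIT_ZERO are
 * applied segment by segment over [addr, addr + size), and the operation
 * fails with ENOMEM if that range contains a hole.
 */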
2143 /*ARGSUSED*/
2145 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2146 uintptr_t arg, ulong_t *lock_map, size_t pos)
2148 struct seg *seg; /* working segment */
2149 caddr_t raddr; /* rounded down addr */
2150 caddr_t initraddr; /* saved initial rounded down addr */
2151 size_t rsize; /* rounded up size */
2152 size_t initrsize; /* saved initial rounded up size */
2153 size_t ssize; /* size of seg */
2154 int error = 0; /* result */
2155 size_t mlock_size; /* size of bitmap */
2156 ulong_t *mlock_map; /* pointer to bitmap used */
2157 /* to represent the locked */
2158 /* pages. */
2159 retry:
2160 if (error == IE_RETRY)
2161 AS_LOCK_ENTER(as, RW_WRITER);
2162 else
2163 AS_LOCK_ENTER(as, RW_READER);
2166 * If these are address space lock/unlock operations, loop over
2167 * all segments in the address space, as appropriate.
2169 if (func == MC_LOCKAS) {
2170 size_t npages, idx;
2171 size_t rlen = 0; /* rounded as length */
2173 idx = pos;
2175 if (arg & MCL_FUTURE) {
2176 mutex_enter(&as->a_contents);
2177 AS_SETPGLCK(as);
2178 mutex_exit(&as->a_contents);
2180 if ((arg & MCL_CURRENT) == 0) {
2181 AS_LOCK_EXIT(as);
2182 return (0);
2185 seg = AS_SEGFIRST(as);
2186 if (seg == NULL) {
2187 AS_LOCK_EXIT(as);
2188 return (0);
2191 do {
2192 raddr = (caddr_t)((uintptr_t)seg->s_base &
2193 (uintptr_t)PAGEMASK);
2194 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2195 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2196 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2198 mlock_size = BT_BITOUL(btopr(rlen));
2199 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2200 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2201 AS_LOCK_EXIT(as);
2202 return (EAGAIN);
2205 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2206 error = segop_lockop(seg, seg->s_base,
2207 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2208 if (error != 0)
2209 break;
2210 pos += seg_pages(seg);
2213 if (error) {
2214 for (seg = AS_SEGFIRST(as); seg != NULL;
2215 seg = AS_SEGNEXT(as, seg)) {
2217 raddr = (caddr_t)((uintptr_t)seg->s_base &
2218 (uintptr_t)PAGEMASK);
2219 npages = seg_pages(seg);
2220 as_segunlock(seg, raddr, attr, mlock_map,
2221 idx, npages);
2222 idx += npages;
2226 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2227 AS_LOCK_EXIT(as);
2228 goto lockerr;
2229 } else if (func == MC_UNLOCKAS) {
2230 mutex_enter(&as->a_contents);
2231 AS_CLRPGLCK(as);
2232 mutex_exit(&as->a_contents);
2234 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2235 error = segop_lockop(seg, seg->s_base,
2236 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2237 if (error != 0)
2238 break;
2241 AS_LOCK_EXIT(as);
2242 goto lockerr;
2246 * Normalize addresses and sizes.
2248 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2249 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2250 (size_t)raddr;
2252 if (raddr + rsize < raddr) { /* check for wraparound */
2253 AS_LOCK_EXIT(as);
2254 return (ENOMEM);
2258 * Get initial segment.
2260 if ((seg = as_segat(as, raddr)) == NULL) {
2261 AS_LOCK_EXIT(as);
2262 return (ENOMEM);
2265 if (func == MC_LOCK) {
2266 mlock_size = BT_BITOUL(btopr(rsize));
2267 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2268 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2269 AS_LOCK_EXIT(as);
2270 return (EAGAIN);
2275 * Loop over all segments. If a hole in the address range is
2276 * discovered, then fail. For each segment, perform the appropriate
2277 * control operation.
2279 while (rsize != 0) {
2282 * Make sure there's no hole, and calculate the portion
2283 * of the next segment to be operated over.
2285 if (raddr >= seg->s_base + seg->s_size) {
2286 seg = AS_SEGNEXT(as, seg);
2287 if (seg == NULL || raddr != seg->s_base) {
2288 if (func == MC_LOCK) {
2289 as_unlockerr(as, attr, mlock_map,
2290 initraddr, initrsize - rsize);
2291 kmem_free(mlock_map,
2292 mlock_size * sizeof (ulong_t));
2294 AS_LOCK_EXIT(as);
2295 return (ENOMEM);
2298 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2299 ssize = seg->s_base + seg->s_size - raddr;
2300 else
2301 ssize = rsize;
2304 * Dispatch on specific function.
2306 switch (func) {
2309 * Synchronize cached data from mappings with backing
2310 * objects.
2312 case MC_SYNC:
2313 if (error = segop_sync(seg, raddr, ssize,
2314 attr, (uint_t)arg)) {
2315 AS_LOCK_EXIT(as);
2316 return (error);
2318 break;
2321 * Lock pages in memory.
2323 case MC_LOCK:
2324 if (error = segop_lockop(seg, raddr, ssize,
2325 attr, func, mlock_map, pos)) {
2326 as_unlockerr(as, attr, mlock_map, initraddr,
2327 initrsize - rsize + ssize);
2328 kmem_free(mlock_map, mlock_size *
2329 sizeof (ulong_t));
2330 AS_LOCK_EXIT(as);
2331 goto lockerr;
2333 break;
2336 * Unlock mapped pages.
2338 case MC_UNLOCK:
2339 (void) segop_lockop(seg, raddr, ssize, attr, func,
2340 NULL, 0);
2341 break;
2344 * Store VM advise for mapped pages in segment layer.
2346 case MC_ADVISE:
2347 error = segop_advise(seg, raddr, ssize, (uint_t)arg);
2350 * Check for regular errors and special retry error
2352 if (error) {
2353 if (error == IE_RETRY) {
2355 * Need to acquire writers lock, so
2356 * have to drop readers lock and start
2357 * all over again
2359 AS_LOCK_EXIT(as);
2360 goto retry;
2361 } else if (error == IE_REATTACH) {
2363 * Find segment for current address
2364 * because current segment just got
2365 * split or concatenated
2367 seg = as_segat(as, raddr);
2368 if (seg == NULL) {
2369 AS_LOCK_EXIT(as);
2370 return (ENOMEM);
2372 } else {
2374 * Regular error
2376 AS_LOCK_EXIT(as);
2377 return (error);
2380 break;
2382 case MC_INHERIT_ZERO:
2383 error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
2384 if (error != 0) {
2385 AS_LOCK_EXIT(as);
2386 return (error);
2388 break;
2391 * Can't happen.
2393 default:
2394 panic("as_ctl: bad operation %d", func);
2395 /*NOTREACHED*/
2398 rsize -= ssize;
2399 raddr += ssize;
2402 if (func == MC_LOCK)
2403 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2404 AS_LOCK_EXIT(as);
2405 return (0);
2406 lockerr:
2409 * If the lower levels returned EDEADLK for a segment lockop,
2410 * it means that we should retry the operation. Let's wait
2411 * a bit also to let the deadlock-causing condition clear.
2412 * This is part of a gross hack to work around a design flaw
2413 * in the ufs/sds logging code and should go away when the
2414 * logging code is re-designed to fix the problem. See bug
2415 * 4125102 for details of the problem.
2417 if (error == EDEADLK) {
2418 delay(deadlk_wait);
2419 error = 0;
2420 goto retry;
2422 return (error);
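/*
 * Translate a faultcode_t (as returned by as_fault() and friends) into an
 * errno value: object errors carry their own errno, protection faults map
 * to EACCES, and anything else becomes EFAULT.
 */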
2426 fc_decode(faultcode_t fault_err)
2428 int error = 0;
2430 switch (FC_CODE(fault_err)) {
2431 case FC_OBJERR:
2432 error = FC_ERRNO(fault_err);
2433 break;
2434 case FC_PROT:
2435 error = EACCES;
2436 break;
2437 default:
2438 error = EFAULT;
2439 break;
2441 return (error);
2445 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2446 * lists from each segment and copy them to one contiguous shadow list (plist)
2447 * as expected by the caller. Save pointers to per segment shadow lists at
2448 * the tail of plist so that they can be used during as_pageunlock().
2450 static int
2451 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2452 caddr_t addr, size_t size, enum seg_rw rw)
2454 caddr_t sv_addr = addr;
2455 size_t sv_size = size;
2456 struct seg *sv_seg = seg;
2457 ulong_t segcnt = 1;
2458 ulong_t cnt;
2459 size_t ssize;
2460 pgcnt_t npages = btop(size);
2461 page_t **plist;
2462 page_t **pl;
2463 int error;
2464 caddr_t eaddr;
2465 faultcode_t fault_err = 0;
2466 pgcnt_t pl_off;
2467 extern const struct seg_ops segspt_shmops;
2469 ASSERT(AS_LOCK_HELD(as));
2470 ASSERT(seg != NULL);
2471 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2472 ASSERT(addr + size > seg->s_base + seg->s_size);
2473 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2474 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2477 * Count the number of segments covered by the range we are about to
2478 * lock. The segment count is used to size the shadow list we return
2479 * back to the caller.
2481 for (; size != 0; size -= ssize, addr += ssize) {
2482 if (addr >= seg->s_base + seg->s_size) {
2484 seg = AS_SEGNEXT(as, seg);
2485 if (seg == NULL || addr != seg->s_base) {
2486 AS_LOCK_EXIT(as);
2487 return (EFAULT);
2490 * Do a quick check if subsequent segments
2491 * will most likely support pagelock.
2493 if (seg->s_ops == &segvn_ops) {
2494 vnode_t *vp;
2496 if (segop_getvp(seg, addr, &vp) != 0 ||
2497 vp != NULL) {
2498 AS_LOCK_EXIT(as);
2499 goto slow;
2501 } else if (seg->s_ops != &segspt_shmops) {
2502 AS_LOCK_EXIT(as);
2503 goto slow;
2505 segcnt++;
2507 if (addr + size > seg->s_base + seg->s_size) {
2508 ssize = seg->s_base + seg->s_size - addr;
2509 } else {
2510 ssize = size;
2513 ASSERT(segcnt > 1);
2515 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2517 addr = sv_addr;
2518 size = sv_size;
2519 seg = sv_seg;
2521 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2522 if (addr >= seg->s_base + seg->s_size) {
2523 seg = AS_SEGNEXT(as, seg);
2524 ASSERT(seg != NULL && addr == seg->s_base);
2525 cnt++;
2526 ASSERT(cnt < segcnt);
2528 if (addr + size > seg->s_base + seg->s_size) {
2529 ssize = seg->s_base + seg->s_size - addr;
2530 } else {
2531 ssize = size;
2533 pl = &plist[npages + cnt];
2534 error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2535 L_PAGELOCK, rw);
2536 if (error) {
2537 break;
2539 ASSERT(plist[npages + cnt] != NULL);
2540 ASSERT(pl_off + btop(ssize) <= npages);
2541 bcopy(plist[npages + cnt], &plist[pl_off],
2542 btop(ssize) * sizeof (page_t *));
2543 pl_off += btop(ssize);
2546 if (size == 0) {
2547 AS_LOCK_EXIT(as);
2548 ASSERT(cnt == segcnt - 1);
2549 *ppp = plist;
2550 return (0);
2554 * One of the pagelock calls failed.  The error type is in the error
2555 * variable.  Unlock what we've locked so far and retry with F_SOFTLOCK
2556 * if the error type is either EFAULT or ENOTSUP.  Otherwise just return
2557 * the error back to the caller.
2560 eaddr = addr;
2561 seg = sv_seg;
2563 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2564 if (addr >= seg->s_base + seg->s_size) {
2565 seg = AS_SEGNEXT(as, seg);
2566 ASSERT(seg != NULL && addr == seg->s_base);
2567 cnt++;
2568 ASSERT(cnt < segcnt);
2570 if (eaddr > seg->s_base + seg->s_size) {
2571 ssize = seg->s_base + seg->s_size - addr;
2572 } else {
2573 ssize = eaddr - addr;
2575 pl = &plist[npages + cnt];
2576 ASSERT(*pl != NULL);
2577 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2578 L_PAGEUNLOCK, rw);
2581 AS_LOCK_EXIT(as);
2583 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2585 if (error != ENOTSUP && error != EFAULT) {
2586 return (error);
2589 slow:
2591 * If we are here because pagelock failed due to the need to cow-fault
2592 * in the pages we want to lock, F_SOFTLOCK will do this job, and in the
2593 * next as_pagelock() call for this address range pagelock will
2594 * hopefully succeed.
2596 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2597 if (fault_err != 0) {
2598 return (fc_decode(fault_err));
2600 *ppp = NULL;
2602 return (0);
2606 * lock pages in a given address space. Return shadow list. If
2607 * the list is NULL, the MMU mapping is also locked.
2610 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2611 size_t size, enum seg_rw rw)
2613 size_t rsize;
2614 caddr_t raddr;
2615 faultcode_t fault_err;
2616 struct seg *seg;
2617 int err;
2619 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2620 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2621 (size_t)raddr;
2624 * if the request crosses two segments let
2625 * as_fault handle it.
2627 AS_LOCK_ENTER(as, RW_READER);
2629 seg = as_segat(as, raddr);
2630 if (seg == NULL) {
2631 AS_LOCK_EXIT(as);
2632 return (EFAULT);
2634 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2635 if (raddr + rsize > seg->s_base + seg->s_size) {
2636 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2638 if (raddr + rsize <= raddr) {
2639 AS_LOCK_EXIT(as);
2640 return (EFAULT);
2644 * try to lock pages and pass back shadow list
2646 err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2648 AS_LOCK_EXIT(as);
2650 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2651 return (err);
2655 * Use F_SOFTLOCK to lock the pages, because pagelock failed either due
2656 * to no pagelock support for this segment or because the pages need to
2657 * be cow-faulted in.  If a fault is needed, F_SOFTLOCK will do this job
2658 * for this as_pagelock() call, and in the next as_pagelock() call for
2659 * the same address range the pagelock call will hopefully succeed.
2661 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2662 if (fault_err != 0) {
2663 return (fc_decode(fault_err));
2665 *ppp = NULL;
2667 return (0);
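/*
 * Callers pair as_pagelock() with as_pageunlock(), handing back the shadow
 * list together with the same address, size and rw mode.  A minimal sketch
 * (error handling and the actual use of the pages elided):
 *
 *	struct page **pplist;
 *	int err;
 *
 *	if ((err = as_pagelock(as, &pplist, addr, size, S_WRITE)) != 0)
 *		return (err);
 *	... operate on the locked range ...
 *	as_pageunlock(as, pplist, addr, size, S_WRITE);
 *
 * Note that pplist may legitimately come back NULL (the F_SOFTLOCK
 * fallback); as_pageunlock() handles that case itself.
 */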
2671 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2672 * lists from the end of plist and call pageunlock interface for each segment.
2673 * Drop as lock and free plist.
2675 static void
2676 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2677 struct page **plist, enum seg_rw rw)
2679 ulong_t cnt;
2680 caddr_t eaddr = addr + size;
2681 pgcnt_t npages = btop(size);
2682 size_t ssize;
2683 page_t **pl;
2685 ASSERT(AS_LOCK_HELD(as));
2686 ASSERT(seg != NULL);
2687 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2688 ASSERT(addr + size > seg->s_base + seg->s_size);
2689 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2690 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2691 ASSERT(plist != NULL);
2693 for (cnt = 0; addr < eaddr; addr += ssize) {
2694 if (addr >= seg->s_base + seg->s_size) {
2695 seg = AS_SEGNEXT(as, seg);
2696 ASSERT(seg != NULL && addr == seg->s_base);
2697 cnt++;
2699 if (eaddr > seg->s_base + seg->s_size) {
2700 ssize = seg->s_base + seg->s_size - addr;
2701 } else {
2702 ssize = eaddr - addr;
2704 pl = &plist[npages + cnt];
2705 ASSERT(*pl != NULL);
2706 (void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
2707 L_PAGEUNLOCK, rw);
2709 ASSERT(cnt > 0);
2710 AS_LOCK_EXIT(as);
2712 cnt++;
2713 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2717 * unlock pages in a given address range
2719 void
2720 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2721 enum seg_rw rw)
2723 struct seg *seg;
2724 size_t rsize;
2725 caddr_t raddr;
2728 * if the shadow list is NULL, as_pagelock was
2729 * falling back to as_fault
2731 if (pp == NULL) {
2732 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2733 return;
2736 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2737 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2738 (size_t)raddr;
2740 AS_LOCK_ENTER(as, RW_READER);
2741 seg = as_segat(as, raddr);
2742 ASSERT(seg != NULL);
2744 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2745 if (raddr + rsize <= seg->s_base + seg->s_size) {
2746 (void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2747 } else {
2748 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2749 return;
2751 AS_LOCK_EXIT(as);
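/*
 * Change the page size code of the mappings covering [addr, addr + size)
 * to "szc".  Both addr and size must be aligned to the new page size and
 * the range must not contain holes.  If "wait" is set, an EAGAIN from the
 * segment layer (locked pages) makes us wait for the pages to be unlocked
 * and retry rather than fail.
 */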
2755 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2756 boolean_t wait)
2758 struct seg *seg;
2759 size_t ssize;
2760 caddr_t raddr; /* rounded down addr */
2761 size_t rsize; /* rounded up size */
2762 int error = 0;
2763 size_t pgsz = page_get_pagesize(szc);
2765 setpgsz_top:
2766 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2767 return (EINVAL);
2770 raddr = addr;
2771 rsize = size;
2773 if (raddr + rsize < raddr) /* check for wraparound */
2774 return (ENOMEM);
2776 AS_LOCK_ENTER(as, RW_WRITER);
2777 as_clearwatchprot(as, raddr, rsize);
2778 seg = as_segat(as, raddr);
2779 if (seg == NULL) {
2780 as_setwatch(as);
2781 AS_LOCK_EXIT(as);
2782 return (ENOMEM);
2785 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2786 if (raddr >= seg->s_base + seg->s_size) {
2787 seg = AS_SEGNEXT(as, seg);
2788 if (seg == NULL || raddr != seg->s_base) {
2789 error = ENOMEM;
2790 break;
2793 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2794 ssize = seg->s_base + seg->s_size - raddr;
2795 } else {
2796 ssize = rsize;
2799 retry:
2800 error = segop_setpagesize(seg, raddr, ssize, szc);
2802 if (error == IE_NOMEM) {
2803 error = EAGAIN;
2804 break;
2807 if (error == IE_RETRY) {
2808 AS_LOCK_EXIT(as);
2809 goto setpgsz_top;
2812 if (error == ENOTSUP) {
2813 error = EINVAL;
2814 break;
2817 if (wait && (error == EAGAIN)) {
2819 * Memory is currently locked. It must be unlocked
2820 * before this operation can succeed through a retry.
2821 * The possible reasons for locked memory and
2822 * corresponding strategies for unlocking are:
2823 * (1) Normal I/O
2824 * wait for a signal that the I/O operation
2825 * has completed and the memory is unlocked.
2826 * (2) Asynchronous I/O
2827 * The aio subsystem does not unlock pages when
2828 * the I/O is completed. Those pages are unlocked
2829 * when the application calls aiowait/aioerror.
2830 * So, to prevent blocking forever, cv_broadcast()
2831 * is done to wake up aio_cleanup_thread.
2832 * Subsequently, segvn_reclaim will be called, and
2833 * that will do AS_CLRUNMAPWAIT() and wake us up.
2834 * (3) Long term page locking:
2835 * This is not relevant for as_setpagesize()
2836 * because we cannot change the page size for
2837 * driver memory. The attempt to do so will
2838 * fail with a different error than EAGAIN so
2839 * there's no need to trigger as callbacks like
2840 * as_unmap, as_setprot or as_free would do.
2842 mutex_enter(&as->a_contents);
2843 if (!AS_ISNOUNMAPWAIT(as)) {
2844 if (AS_ISUNMAPWAIT(as) == 0) {
2845 cv_broadcast(&as->a_cv);
2847 AS_SETUNMAPWAIT(as);
2848 AS_LOCK_EXIT(as);
2849 while (AS_ISUNMAPWAIT(as)) {
2850 cv_wait(&as->a_cv, &as->a_contents);
2852 } else {
2854 * We may have raced with
2855 * segvn_reclaim()/segspt_reclaim(). In this
2856 * case clean nounmapwait flag and retry since
2857 * softlockcnt in this segment may be already
2858 * 0. We don't drop as writer lock so our
2859 * number of retries without sleeping should
2860 * be very small. See segvn_reclaim() for
2861 * more comments.
2863 AS_CLRNOUNMAPWAIT(as);
2864 mutex_exit(&as->a_contents);
2865 goto retry;
2867 mutex_exit(&as->a_contents);
2868 goto setpgsz_top;
2869 } else if (error != 0) {
2870 break;
2873 as_setwatch(as);
2874 AS_LOCK_EXIT(as);
2875 return (error);
2879 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
2880 * in its chunk where s_szc is less than the szc we want to set.
2882 static int
2883 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2884 int *retry)
2886 struct seg *seg;
2887 size_t ssize;
2888 int error;
2890 ASSERT(AS_WRITE_HELD(as));
2892 seg = as_segat(as, raddr);
2893 if (seg == NULL) {
2894 panic("as_iset3_default_lpsize: no seg");
2897 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2898 if (raddr >= seg->s_base + seg->s_size) {
2899 seg = AS_SEGNEXT(as, seg);
2900 if (seg == NULL || raddr != seg->s_base) {
2901 panic("as_iset3_default_lpsize: as changed");
2904 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2905 ssize = seg->s_base + seg->s_size - raddr;
2906 } else {
2907 ssize = rsize;
2910 if (szc > seg->s_szc) {
2911 error = segop_setpagesize(seg, raddr, ssize, szc);
2912 /* Only retry on EINVAL segments that have no vnode. */
2913 if (error == EINVAL) {
2914 vnode_t *vp = NULL;
2915 if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
2916 (segop_getvp(seg, raddr, &vp) != 0 ||
2917 vp == NULL)) {
2918 *retry = 1;
2919 } else {
2920 *retry = 0;
2923 if (error) {
2924 return (error);
2928 return (0);
2932 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
2933 * pagesize on each segment in its range, but if any fails with EINVAL,
2934 * then it reduces the pagesizes to the next size in the bitmap and
2935 * retries as_iset3_default_lpsize(). The reason why the code retries
2936 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
2937 * match the bigger sizes, and (b) it's hard to get this offset (to begin
2938 * with) to pass to map_pgszcvec().
2940 static int
2941 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2942 uint_t szcvec)
2944 int error;
2945 int retry;
2947 ASSERT(AS_WRITE_HELD(as));
2949 for (;;) {
2950 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
2951 if (error == EINVAL && retry) {
2952 szcvec &= ~(1 << szc);
2953 if (szcvec <= 1) {
2954 return (EINVAL);
2956 szc = highbit(szcvec) - 1;
2957 } else {
2958 return (error);
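/*
 * Worked example of the retry loop above (values illustrative): with
 * szcvec = 0x16 (size codes 1, 2 and 4 enabled) and szc = 4, an EINVAL
 * that sets "retry" clears bit 4, leaving szcvec = 0x06, and
 * highbit(0x06) - 1 selects szc = 2 for the next attempt.  When no bits
 * above bit 0 remain, the loop gives up and returns EINVAL.
 */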
2964 * as_iset1_default_lpsize() breaks its chunk into areas where existing
2965 * segments have a smaller szc than we want to set. For each such area,
2966 * it calls as_iset2_default_lpsize().
2968 static int
2969 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2970 uint_t szcvec)
2972 struct seg *seg;
2973 size_t ssize;
2974 caddr_t setaddr = raddr;
2975 size_t setsize = 0;
2976 int set;
2977 int error;
2979 ASSERT(AS_WRITE_HELD(as));
2981 seg = as_segat(as, raddr);
2982 if (seg == NULL) {
2983 panic("as_iset1_default_lpsize: no seg");
2985 if (seg->s_szc < szc) {
2986 set = 1;
2987 } else {
2988 set = 0;
2991 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
2992 if (raddr >= seg->s_base + seg->s_size) {
2993 seg = AS_SEGNEXT(as, seg);
2994 if (seg == NULL || raddr != seg->s_base) {
2995 panic("as_iset1_default_lpsize: as changed");
2997 if (seg->s_szc >= szc && set) {
2998 ASSERT(setsize != 0);
2999 error = as_iset2_default_lpsize(as,
3000 setaddr, setsize, szc, szcvec);
3001 if (error) {
3002 return (error);
3004 set = 0;
3005 } else if (seg->s_szc < szc && !set) {
3006 setaddr = raddr;
3007 setsize = 0;
3008 set = 1;
3011 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3012 ssize = seg->s_base + seg->s_size - raddr;
3013 } else {
3014 ssize = rsize;
3017 error = 0;
3018 if (set) {
3019 ASSERT(setsize != 0);
3020 error = as_iset2_default_lpsize(as, setaddr, setsize,
3021 szc, szcvec);
3023 return (error);
3027 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3028 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3029 * chunk to as_iset1_default_lpsize().
3031 static int
3032 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3033 int type)
3035 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3036 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3037 flags, rtype, 1);
3038 uint_t szc;
3039 uint_t nszc;
3040 int error;
3041 caddr_t a;
3042 caddr_t eaddr;
3043 size_t segsize;
3044 size_t pgsz;
3045 uint_t save_szcvec;
3047 ASSERT(AS_WRITE_HELD(as));
3048 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3049 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3051 szcvec &= ~1;
3052 if (szcvec <= 1) { /* skip if base page size */
3053 return (0);
3056 /* Get the pagesize of the first larger page size. */
3057 szc = lowbit(szcvec) - 1;
3058 pgsz = page_get_pagesize(szc);
3059 eaddr = addr + size;
3060 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3061 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3063 save_szcvec = szcvec;
3064 szcvec >>= (szc + 1);
3065 nszc = szc;
3066 while (szcvec) {
3067 if ((szcvec & 0x1) == 0) {
3068 nszc++;
3069 szcvec >>= 1;
3070 continue;
3072 nszc++;
3073 pgsz = page_get_pagesize(nszc);
3074 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3075 if (a != addr) {
3076 ASSERT(szc > 0);
3077 ASSERT(a < eaddr);
3078 segsize = a - addr;
3079 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3080 save_szcvec);
3081 if (error) {
3082 return (error);
3084 addr = a;
3086 szc = nszc;
3087 szcvec >>= 1;
3090 ASSERT(addr < eaddr);
3091 szcvec = save_szcvec;
3092 while (szcvec) {
3093 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3094 ASSERT(a >= addr);
3095 if (a != addr) {
3096 ASSERT(szc > 0);
3097 segsize = a - addr;
3098 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3099 save_szcvec);
3100 if (error) {
3101 return (error);
3103 addr = a;
3105 szcvec &= ~(1 << szc);
3106 if (szcvec) {
3107 szc = highbit(szcvec) - 1;
3108 pgsz = page_get_pagesize(szc);
3111 ASSERT(addr == eaddr);
3113 return (0);
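/*
 * A note on the two loops above: the first walks "addr" upward, carving off
 * a chunk at the current size code whenever the next larger enabled size
 * imposes a stricter alignment, and the second carves off the remaining
 * chunks up to "eaddr", stepping back down through the enabled size codes.
 * The effect is that each chunk handed to as_iset1_default_lpsize() gets
 * essentially the largest enabled size code its alignment and length allow.
 */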
3117 * Set the default large page size for the range. Called via memcntl with
3118 * page size set to 0. as_set_default_lpsize breaks the range down into
3119 * chunks with the same type/flags, ignores non-segvn segments, and passes
3120 * each chunk to as_iset_default_lpsize().
3123 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3125 struct seg *seg;
3126 caddr_t raddr;
3127 size_t rsize;
3128 size_t ssize;
3129 int rtype, rflags;
3130 int stype, sflags;
3131 int error;
3132 caddr_t setaddr;
3133 size_t setsize;
3134 int segvn;
3136 if (size == 0)
3137 return (0);
3139 AS_LOCK_ENTER(as, RW_WRITER);
3140 again:
3141 error = 0;
3143 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3144 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3145 (size_t)raddr;
3147 if (raddr + rsize < raddr) { /* check for wraparound */
3148 AS_LOCK_EXIT(as);
3149 return (ENOMEM);
3151 as_clearwatchprot(as, raddr, rsize);
3152 seg = as_segat(as, raddr);
3153 if (seg == NULL) {
3154 as_setwatch(as);
3155 AS_LOCK_EXIT(as);
3156 return (ENOMEM);
3158 if (seg->s_ops == &segvn_ops) {
3159 rtype = segop_gettype(seg, addr);
3160 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3161 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3162 segvn = 1;
3163 } else {
3164 segvn = 0;
3166 setaddr = raddr;
3167 setsize = 0;
3169 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3170 if (raddr >= (seg->s_base + seg->s_size)) {
3171 seg = AS_SEGNEXT(as, seg);
3172 if (seg == NULL || raddr != seg->s_base) {
3173 error = ENOMEM;
3174 break;
3176 if (seg->s_ops == &segvn_ops) {
3177 stype = segop_gettype(seg, raddr);
3178 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3179 stype &= (MAP_SHARED | MAP_PRIVATE);
3180 if (segvn && (rflags != sflags ||
3181 rtype != stype)) {
3183 * The next segment is also segvn but
3184 * has different flags and/or type.
3186 ASSERT(setsize != 0);
3187 error = as_iset_default_lpsize(as,
3188 setaddr, setsize, rflags, rtype);
3189 if (error) {
3190 break;
3192 rflags = sflags;
3193 rtype = stype;
3194 setaddr = raddr;
3195 setsize = 0;
3196 } else if (!segvn) {
3197 rflags = sflags;
3198 rtype = stype;
3199 setaddr = raddr;
3200 setsize = 0;
3201 segvn = 1;
3203 } else if (segvn) {
3204 /* The next segment is not segvn. */
3205 ASSERT(setsize != 0);
3206 error = as_iset_default_lpsize(as,
3207 setaddr, setsize, rflags, rtype);
3208 if (error) {
3209 break;
3211 segvn = 0;
3214 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3215 ssize = seg->s_base + seg->s_size - raddr;
3216 } else {
3217 ssize = rsize;
3220 if (error == 0 && segvn) {
3221 /* The last chunk when rsize == 0. */
3222 ASSERT(setsize != 0);
3223 error = as_iset_default_lpsize(as, setaddr, setsize,
3224 rflags, rtype);
3227 if (error == IE_RETRY) {
3228 goto again;
3229 } else if (error == IE_NOMEM) {
3230 error = EAGAIN;
3231 } else if (error == ENOTSUP) {
3232 error = EINVAL;
3233 } else if (error == EAGAIN) {
3234 mutex_enter(&as->a_contents);
3235 if (!AS_ISNOUNMAPWAIT(as)) {
3236 if (AS_ISUNMAPWAIT(as) == 0) {
3237 cv_broadcast(&as->a_cv);
3239 AS_SETUNMAPWAIT(as);
3240 AS_LOCK_EXIT(as);
3241 while (AS_ISUNMAPWAIT(as)) {
3242 cv_wait(&as->a_cv, &as->a_contents);
3244 mutex_exit(&as->a_contents);
3245 AS_LOCK_ENTER(as, RW_WRITER);
3246 } else {
3248 * We may have raced with
3249 * segvn_reclaim()/segspt_reclaim(). In this case
3250 * clean nounmapwait flag and retry since softlockcnt
3251 * in this segment may be already 0. We don't drop as
3252 * writer lock so our number of retries without
3253 * sleeping should be very small. See segvn_reclaim()
3254 * for more comments.
3256 AS_CLRNOUNMAPWAIT(as);
3257 mutex_exit(&as->a_contents);
3259 goto again;
3262 as_setwatch(as);
3263 AS_LOCK_EXIT(as);
3264 return (error);
3268 * Set up all of the uninitialized watched pages that we can.
3270 void
3271 as_setwatch(struct as *as)
3273 struct watched_page *pwp;
3274 struct seg *seg;
3275 caddr_t vaddr;
3276 uint_t prot;
3277 int err, retrycnt;
3279 if (avl_numnodes(&as->a_wpage) == 0)
3280 return;
3282 ASSERT(AS_WRITE_HELD(as));
3284 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3285 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3286 retrycnt = 0;
3287 retry:
3288 vaddr = pwp->wp_vaddr;
3289 if (pwp->wp_oprot != 0 || /* already set up */
3290 (seg = as_segat(as, vaddr)) == NULL ||
3291 segop_getprot(seg, vaddr, 0, &prot) != 0)
3292 continue;
3294 pwp->wp_oprot = prot;
3295 if (pwp->wp_read)
3296 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3297 if (pwp->wp_write)
3298 prot &= ~PROT_WRITE;
3299 if (pwp->wp_exec)
3300 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3301 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3302 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3303 if (err == IE_RETRY) {
3304 pwp->wp_oprot = 0;
3305 ASSERT(retrycnt == 0);
3306 retrycnt++;
3307 goto retry;
3310 pwp->wp_prot = prot;
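/*
 * Note that watchpoints are implemented by taking protections away: the
 * loop above saves the segment's original protections in wp_oprot and then
 * clears the permissions that correspond to the watched events, so the
 * first such access traps into the fault path.  as_clearwatch() below
 * restores the protections recorded in wp_oprot.
 */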
3315 * Clear all of the watched pages in the address space.
3317 void
3318 as_clearwatch(struct as *as)
3320 struct watched_page *pwp;
3321 struct seg *seg;
3322 caddr_t vaddr;
3323 uint_t prot;
3324 int err, retrycnt;
3326 if (avl_numnodes(&as->a_wpage) == 0)
3327 return;
3329 ASSERT(AS_WRITE_HELD(as));
3331 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3332 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3333 retrycnt = 0;
3334 retry:
3335 vaddr = pwp->wp_vaddr;
3336 if (pwp->wp_oprot == 0 || /* not set up */
3337 (seg = as_segat(as, vaddr)) == NULL)
3338 continue;
3340 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3341 err = segop_setprot(seg, vaddr, PAGESIZE, prot);
3342 if (err == IE_RETRY) {
3343 ASSERT(retrycnt == 0);
3344 retrycnt++;
3345 goto retry;
3348 pwp->wp_oprot = 0;
3349 pwp->wp_prot = 0;
3354 * Force a new setup for all the watched pages in the range.
3356 static void
3357 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3359 struct watched_page *pwp;
3360 struct watched_page tpw;
3361 caddr_t eaddr = addr + size;
3362 caddr_t vaddr;
3363 struct seg *seg;
3364 int err, retrycnt;
3365 uint_t wprot;
3366 avl_index_t where;
3368 if (avl_numnodes(&as->a_wpage) == 0)
3369 return;
3371 ASSERT(AS_WRITE_HELD(as));
3373 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3374 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3375 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3377 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3378 retrycnt = 0;
3379 vaddr = pwp->wp_vaddr;
3381 wprot = prot;
3382 if (pwp->wp_read)
3383 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3384 if (pwp->wp_write)
3385 wprot &= ~PROT_WRITE;
3386 if (pwp->wp_exec)
3387 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3388 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3389 retry:
3390 seg = as_segat(as, vaddr);
3391 if (seg == NULL) {
3392 panic("as_setwatchprot: no seg");
3393 /*NOTREACHED*/
3395 err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
3396 if (err == IE_RETRY) {
3397 ASSERT(retrycnt == 0);
3398 retrycnt++;
3399 goto retry;
3402 pwp->wp_oprot = prot;
3403 pwp->wp_prot = wprot;
3405 pwp = AVL_NEXT(&as->a_wpage, pwp);
3410 * Clear all of the watched pages in the range.
3412 static void
3413 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3415 caddr_t eaddr = addr + size;
3416 struct watched_page *pwp;
3417 struct watched_page tpw;
3418 uint_t prot;
3419 struct seg *seg;
3420 int err, retrycnt;
3421 avl_index_t where;
3423 if (avl_numnodes(&as->a_wpage) == 0)
3424 return;
3426 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3427 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3428 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3430 ASSERT(AS_WRITE_HELD(as));
3432 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3434 if ((prot = pwp->wp_oprot) != 0) {
3435 retrycnt = 0;
3437 if (prot != pwp->wp_prot) {
3438 retry:
3439 seg = as_segat(as, pwp->wp_vaddr);
3440 if (seg == NULL)
3441 continue;
3442 err = segop_setprot(seg, pwp->wp_vaddr,
3443 PAGESIZE, prot);
3444 if (err == IE_RETRY) {
3445 ASSERT(retrycnt == 0);
3446 retrycnt++;
3447 goto retry;
3451 pwp->wp_oprot = 0;
3452 pwp->wp_prot = 0;
3455 pwp = AVL_NEXT(&as->a_wpage, pwp);
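/*
 * Post "siginfo" to every process whose address space is "as".  The p_as
 * check is made once under pidlock and repeated under p_lock before the
 * signal is queued; sigaddq() is called with KM_NOSLEEP since both locks
 * are held here.
 */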
3459 void
3460 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3462 struct proc *p;
3464 mutex_enter(&pidlock);
3465 for (p = practive; p; p = p->p_next) {
3466 if (p->p_as == as) {
3467 mutex_enter(&p->p_lock);
3468 if (p->p_as == as)
3469 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3470 mutex_exit(&p->p_lock);
3473 mutex_exit(&pidlock);
3477 * return memory object ID
3480 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3482 struct seg *seg;
3483 int sts;
3485 AS_LOCK_ENTER(as, RW_READER);
3486 seg = as_segat(as, addr);
3487 if (seg == NULL) {
3488 AS_LOCK_EXIT(as);
3489 return (EFAULT);
3492 sts = segop_getmemid(seg, addr, memidp);
3494 AS_LOCK_EXIT(as);
3495 return (sts);