/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/seg_hole.h>
clock_t deadlk_wait = 1;	/* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#define	VERIFY_SEGLIST
/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list. A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
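/*
 * For example, a driver that has locked user pages for long-term DMA might
 * register for unmap notification on that range roughly as follows
 * (xx_unmap_cb and xx_state are hypothetical driver names, shown only to
 * illustrate the calling convention):
 *
 *	(void) as_add_callback(as, xx_unmap_cb, xx_state, AS_UNMAP_EVENT,
 *	    uaddr, ulen, KM_SLEEP);
 *
 * The registered function is later invoked as (*cb_func)(as, arg, events)
 * when a pertinent event hits the range.
 */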
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
	struct as_callback	*current_head, *cb;
	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
	/* Adding a callback after as_free has been called is not allowed */
	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)saddr;

	/* check for wraparound */
	if (saddr + rsize < saddr)
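	/*
	 * For example, with 4K pages (PAGESIZE == 0x1000), vaddr = 0x12345
	 * and size = 0x100 round to saddr = 0x12000 and
	 * rsize = 0x13000 - 0x12000 = 0x1000, i.e. the single page containing
	 * the requested range.
	 */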
	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);

	cb->ascb_func = cb_func;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;
	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - e.g. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * Possible return values are:
 *	AS_CALLBACK_DELETED	(callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND	(no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *			entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry. The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
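/*
 * A typical caller only needs to distinguish the deferred case, e.g.
 * (xx_state being a hypothetical argument previously registered via
 * as_add_callback()):
 *
 *	rc = as_delete_callback(as, xx_state);
 *
 * If rc is AS_CALLBACK_DELETE_DEFERRED, as_do_callbacks() still owns the
 * entry and will remove and free it once the callback completes; for the
 * other two return values the entry is already gone from the list.
 */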
as_delete_callback(struct as *as, void *arg)
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;
	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks. as_do_callbacks
		 * will take care of removing this entry from the list. In
		 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
	}
	mutex_exit(&as->a_contents);
/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 *
 * This function never sleeps so it is ok to call it with more
 * locks held than the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
	struct as_callback	*cb;
	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains. An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
	struct as_callback **prevcb;
	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}
	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again. The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
	}
/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback. Skip an entry if:
 *	- a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *	- not an event of interest
 *	- not an address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end up comatose.
 */
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned. If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
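/*
 * For example, with only two segments, A covering [0x10000, 0x20000) and
 * B covering [0x50000, 0x60000):
 *	as_findseg(as, 0x18000, 0) returns A (contains addr),
 *	as_findseg(as, 0x30000, 0) returns B (first segment above addr),
 *	as_findseg(as, 0x70000, 0) returns NULL, and
 *	as_findseg(as, 0x70000, 1) returns B (tail requested).
 */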
as_findseg(struct as *as, caddr_t addr, int tail)
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
as_verify(struct as *as)
	struct seg *seg, *seglast, *p, *n;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
	}

	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
#endif /* VERIFY_SEGLIST */
/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
as_addseg(struct as *as, struct seg *newseg)
	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list. Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
as_removeseg(struct as *as, struct seg *seg)
	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */

#ifdef VERIFY_SEGLIST
/*
 * Find a segment containing addr.
 */
as_segat(struct as *as, caddr_t addr)
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range. The address space must not be "read/write"
 * locked by the caller since we may block.
 */
as_rangelock(struct as *as)
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	mutex_exit(&as->a_contents);

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
as_rangeunlock(struct as *as)
	mutex_enter(&as->a_contents);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
/*
 * compare segments (or just an address) by segment address range
 */
as_segcompar(const void *x, const void *y)
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);

as_avlinit(struct as *as)
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
as_constructor(void *buf, void *cdrarg, int kmflags)
	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);

as_destructor(void *buf, void *cdrarg)
	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);

	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_seglast = NULL;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;

	AS_LOCK_ENTER(as, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
as_free(struct as *as)
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	boolean_t free_started = B_FALSE;

	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;
	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, RW_WRITER);

	free_started = B_TRUE;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		next = AS_SEGNEXT(as, seg);

		err = segop_unmap(seg, seg->s_base, seg->s_size);

		mutex_enter(&as->a_contents);
		if (as->a_callbacks) {
		} else if (!AS_ISNOUNMAPWAIT(as)) {
			/*
			 * Memory is currently locked. Wait for a
			 * cv_signal that it has been unlocked, then
			 * try the operation again.
			 */
			if (AS_ISUNMAPWAIT(as) == 0)
				cv_broadcast(&as->a_cv);
			while (AS_ISUNMAPWAIT(as))
				cv_wait(&as->a_cv, &as->a_contents);
		} else {
			/*
			 * We may have raced with
			 * segvn_reclaim()/segspt_reclaim(). In this
			 * case clean nounmapwait flag and retry since
			 * softlockcnt in this segment may be already
			 * 0. We don't drop as writer lock so our
			 * number of retries without sleeping should
			 * be very small. See segvn_reclaim() for
			 * more comments.
			 */
			AS_CLRNOUNMAPWAIT(as);
			mutex_exit(&as->a_contents);
		}
		mutex_exit(&as->a_contents);

		/*
		 * We do not expect any other error return at this
		 * time. This is similar to an ASSERT in seg_unmap()
		 */
	}

	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
	}

	/*
	 * Free the struct as back to kmem. Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
as_dup(struct as *as, struct proc *forkedproc)
	struct seg *seg, *newseg;
	size_t purgesize = 0;

	AS_LOCK_ENTER(as, RW_WRITER);

	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, RW_WRITER);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {

		if ((error = segop_dup(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */

		if ((newseg->s_flags & S_HOLE) == 0) {
			newas->a_size += seg->s_size;

	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

	forkedproc->p_as = newas;
/*
 * Handle a ``fault'' at addr for size bytes.
 */
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	klwp_t *lwp = ttolwp(curthread);

	/*
	 * Indicate that the lwp is not to be stopped while waiting for a
	 * pagefault. This is to avoid deadlock while debugging a process
	 * via /proc over NFS (in particular).
	 */

	/*
	 * same length must be used when we softlock and softunlock. We
	 * don't support softunlocking lengths less than the original length
	 * when there is largepage support. See seg_dev.c for more
	 * comments.
	 */
		CPU_STATS_ADD_K(vm, softlock, 1);
		CPU_STATS_ADD_K(vm, prot_fault, 1);
		CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
		CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {

	AS_LOCK_ENTER(as, RW_READER);

	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;

		res = segop_fault(hat, seg, raddr, ssize, type, rw);

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing.)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) segop_fault(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault. Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
as_faulta(struct as *as, caddr_t addr, size_t size)
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	klwp_t *lwp = ttolwp(curthread);

	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault. This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		res = segop_faulta(seg, raddr);

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault. Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
	struct as_callback *cb;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare,
	 * we only want to lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		error = segop_setprot(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {

		if (error == IE_RETRY) {

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */

			/*
			 * Memory is currently locked. It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
			}
			mutex_exit(&as->a_contents);
		} else if (error != 0)

	as_setwatchprot(as, saveraddr, saversize, prot);
/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		error = segop_checkprot(seg, raddr, ssize, prot);
as_unmap(struct as *as, caddr_t addr, size_t size)
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		const boolean_t is_hole = ((seg->s_flags & S_HOLE) != 0);

		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))

		err = segop_unmap(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked. It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
			}
			mutex_exit(&as->a_contents);
		} else if (err == IE_RETRY) {

		as->a_size -= ssize;

		as->a_resvsize -= rsize;
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
	uint_t szc, nszc, save_szcvec;
	const boolean_t do_off = (vn_a->vp != NULL || vn_a->amp != NULL);

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);

		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);

		error = (*crfp)(&seg, vn_a);

		VERIFY3P(seg, ==, segref);

		as->a_resvsize += size;

	eaddr = addr + size;
	save_szcvec = szcvec;

	if ((szcvec & 0x1) == 0) {

		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);

			struct seg *seg, *segref;

			seg = segref = seg_alloc(as, addr, segsize);

			error = (*crfp)(&seg, vn_a);

			VERIFY3P(seg, ==, segref);

			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = B_TRUE;

			vn_a->offset += segsize;

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1;	/* add 8K pages */

		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

			struct seg *seg, *segref;

			seg = segref = seg_alloc(as, addr, segsize);

			error = (*crfp)(&seg, vn_a);

			VERIFY3P(seg, ==, segref);

			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = B_TRUE;

			vn_a->offset += segsize;

		szcvec &= ~(1 << szc);
		szc = highbit(szcvec) - 1;
		pgsz = page_get_pagesize(szc);

	ASSERT(addr == eaddr);
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);

		error = (*crfp)(&seg, vn_a);

		VERIFY3P(seg, ==, segref);

		as->a_resvsize += size;

	va.va_mask = VATTR_SIZE;
	if (fop_getattr(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {

	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {

	if (btopr(va.va_size) < btopr(eoff)) {
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);

		size = save_size - size;
/*
 * as_map_ansegs: shared or private anonymous memory. Note that the flags
 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
 */
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    segcreate_func_t crfp, struct segvn_crargs *vn_a, boolean_t *segcreated)
	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}

	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
as_map(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
    void *argsp)
	AS_LOCK_ENTER(as, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp,
    void *argsp)
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	boolean_t is_hole = B_FALSE;
	/*
	 * The use of a_proc is preferred to handle the case where curproc is
	 * a door_call server and is allocating memory in the client's (a_proc)
	 * address space.
	 * When creating a shared memory segment a_proc will be NULL so we
	 * fallback to curproc in that case.
	 */
	struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Ensure that the virtual size of the process will not exceed
	 * the configured limit. Since seg_hole segments will later
	 * set the S_HOLE flag indicating their status as a hole in the
	 * AS, they are excluded from this check.
	 */
	if (as->a_size + rsize > (size_t)p->p_vmem_ctl &&
	    !AS_MAP_CHECK_SEGHOLE(crfp)) {

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
		    p->p_rctls, p, RCA_UNSAFE_ALL);

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		boolean_t do_unmap = B_FALSE;

		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs,
		    &do_unmap);

			(void) as_unmap(as, addr, size);

	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		boolean_t do_unmap = B_FALSE;

		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs,
		    &do_unmap);

			(void) as_unmap(as, addr, size);

	} else {
		struct seg *seg, *segref;

		seg = segref = seg_alloc(as, addr, size);

		/*
		 * It is possible that the segment creation routine will free
		 * 'seg' as part of a more advanced operation, such as when
		 * segvn concatenates adjacent segments together. When this
		 * occurs, the seg*_create routine must communicate the
		 * resulting segment out via the 'struct seg **' parameter.
		 *
		 * If segment creation fails, it must not free the passed-in
		 * segment, nor alter the argument pointer.
		 */
		error = (*crfp)(&seg, argsp);

		VERIFY3P(seg, ==, segref);

		/*
		 * Check if the resulting segment represents a hole in the
		 * address space, rather than contributing to the AS size.
		 */
		is_hole = ((seg->s_flags & S_HOLE) != 0);
	}

	/* Add size now so as_unmap will work if as_ctl fails. */
	as->a_size += rsize;
	as->a_resvsize += rsize;

	/*
	 * Establish memory locks for the segment if the address space is
	 * locked, provided it's not an explicit hole in the AS.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as) && !is_hole) {
		mutex_exit(&as->a_contents);

		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
	}
/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
as_purge(struct as *as)
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, RW_WRITER);

	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			(void) segop_unmap(seg, seg->s_base, seg->s_size);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range. Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range. We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
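/*
 * Continuing the example above: with align = 4M, off = 16k and
 * redzone = 8k, a successful return leaves *basep at an address B with
 * (B - 16k) % 4M == 0 and with 8k of unmapped space both below B and above
 * B + minlen. The first, fast pass below therefore simply grows minlen by
 * 2 * redzone (and similarly accounts for the alignment slack) before
 * searching; the figures here are illustrative only.
 */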
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	size_t save_redzone;

	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += 2 * redzone;

	AS_LOCK_ENTER(as, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {

	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction. lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole. If moving backwards, we reset
	 * sseg to denote the highest-addressed segment.
	 */
	forward = (flags & AH_DIR) == AH_LO;
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {

		lseg = AS_SEGPREV(as, hseg);

		/*
		 * Set lo and hi to the hole's boundaries. (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;

		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)

		/*
		 * Candidate hole lies at least partially within the allowable
		 * range. Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */

		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints. If the hole is too small, no need
		 * to do the further checks since they will fail.
		 */
		if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
		    minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
		    ((flags & AH_CONTAIN) == 0 ||
		    (*basep <= addr && *basep + *lenp > addr))) {

			as->a_lastgap = hseg;

			as->a_lastgaphl = hseg;

			as->a_lastgaphl = lseg;

		/*
		 * Move to the next hole.
		 */
			hseg = AS_SEGNEXT(as, hseg);

			lseg = AS_SEGPREV(as, lseg);

	if (fast_path && (align != 0 || save_redzone != 0)) {

		minlen = save_minlen;
		redzone = save_redzone;
/*
 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range. We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, base and len are set to reflect the part of
 * the hole that is within range, and 0 is returned, otherwise,
 * -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
    caddr_t addr)
	return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
/*
 * Return the next range within [base, base + len) that is backed
 * with "real memory". Skip holes and non-seg_vn segments.
 * We're lazy and only return one segment at a time.
 */
as_memory(struct as *as, caddr_t *basep, size_t *lenp)
	extern const struct seg_ops segspt_shmops;	/* needs a header file */
	caddr_t addr, eaddr;

	AS_LOCK_ENTER(as, RW_READER);

	eaddr = addr + *lenp;

	seg = as_findseg(as, addr, 0);
	addr = MAX(seg->s_base, addr);

	if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {

	if (seg->s_ops == &segvn_ops) {
		segend = seg->s_base + seg->s_size;

	/*
	 * We do ISM by looking into the private data
	 * to determine the real size of the segment.
	 */
	if (seg->s_ops == &segspt_shmops) {
		segend = seg->s_base + spt_realsize(seg);

	seg = AS_SEGNEXT(as, seg);

		*lenp = eaddr - addr;

		*lenp = segend - addr;
/*
 * Determine whether data from the mappings in interval [addr, addr + size)
 * are in the primary memory (core) cache.
 */
as_incore(struct as *as, caddr_t addr,
    size_t size, char *vec, size_t *sizep)
	caddr_t raddr;		/* rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t isize;		/* iteration size */
	int error = 0;		/* result, assume success */

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		*sizep += isize = segop_incore(seg, raddr, ssize, vec);
		if (isize != ssize) {

		vec += btopr(ssize);
as_segunlock(struct seg *seg, caddr_t addr, int attr,
    ulong_t *bitmap, size_t position, size_t npages)
	caddr_t range_start;
	size_t pos1 = position;
	size_t end_pos = npages + position;

	while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
		size = ptob((pos2 - pos1));
		range_start = (caddr_t)((uintptr_t)addr +
		    ptob(pos1 - position));

		(void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,

as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
    caddr_t raddr, size_t rsize)
	struct seg *seg = as_segat(as, raddr);

	while (rsize != 0) {
		if (raddr >= seg->s_base + seg->s_size)
			seg = AS_SEGNEXT(as, seg);

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
/*
 * Cache control operations over the interval [addr, addr + size) in
 * address space "as".
 */
as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
    uintptr_t arg, ulong_t *lock_map, size_t pos)
	struct seg *seg;	/* working segment */
	caddr_t raddr;		/* rounded down addr */
	caddr_t initraddr;	/* saved initial rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t initrsize;	/* saved initial rounded up size */
	size_t ssize;		/* size of seg */
	int error = 0;		/* result */
	size_t mlock_size;	/* size of bitmap */
	ulong_t *mlock_map;	/* pointer to bitmap used */
				/* to represent the locked */

	if (error == IE_RETRY)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);

	/*
	 * If these are address space lock/unlock operations, loop over
	 * all segments in the address space, as appropriate.
	 */
	if (func == MC_LOCKAS) {
		size_t rlen = 0;	/* rounded as length */

		if (arg & MCL_FUTURE) {
			mutex_enter(&as->a_contents);
			mutex_exit(&as->a_contents);
		}
		if ((arg & MCL_CURRENT) == 0) {

		seg = AS_SEGFIRST(as);

			raddr = (caddr_t)((uintptr_t)seg->s_base &
			    (uintptr_t)PAGEMASK);
			rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
			    PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
		} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

		mlock_size = BT_BITOUL(btopr(rlen));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			if ((seg->s_flags & S_HOLE) != 0)
				continue;

			error = segop_lockop(seg, seg->s_base,
			    seg->s_size, attr, MC_LOCK, mlock_map, pos);

			pos += seg_pages(seg);
		}

			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg)) {

				raddr = (caddr_t)((uintptr_t)seg->s_base &
				    (uintptr_t)PAGEMASK);
				npages = seg_pages(seg);
				as_segunlock(seg, raddr, attr, mlock_map,

		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));

	} else if (func == MC_UNLOCKAS) {
		mutex_enter(&as->a_contents);
		mutex_exit(&as->a_contents);

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			if ((seg->s_flags & S_HOLE) != 0)
				continue;

			error = segop_lockop(seg, seg->s_base,
			    seg->s_size, attr, MC_UNLOCK, NULL, 0);

	/*
	 * Normalize addresses and sizes.
	 */
	initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */

	/*
	 * Get initial segment.
	 */
	if ((seg = as_segat(as, raddr)) == NULL) {

	if (func == MC_LOCK) {
		mlock_size = BT_BITOUL(btopr(rsize));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {

	/*
	 * Loop over all segments. If a hole in the address range is
	 * discovered, then fail. For each segment, perform the appropriate
	 * control operation.
	 */
	while (rsize != 0) {

		/*
		 * Make sure there's no hole, calculate the portion
		 * of the next segment to be operated over.
		 */
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				if (func == MC_LOCK) {
					as_unlockerr(as, attr, mlock_map,
					    initraddr, initrsize - rsize);
					kmem_free(mlock_map,
					    mlock_size * sizeof (ulong_t));
				}
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;

		/*
		 * Dispatch on specific function.
		 */

			/*
			 * Synchronize cached data from mappings with backing
			 * objects.
			 */
			if (error = segop_sync(seg, raddr, ssize,
			    attr, (uint_t)arg)) {

			/*
			 * Lock pages in memory.
			 */
			if (error = segop_lockop(seg, raddr, ssize,
			    attr, func, mlock_map, pos)) {
				as_unlockerr(as, attr, mlock_map, initraddr,
				    initrsize - rsize + ssize);
				kmem_free(mlock_map, mlock_size *
				    sizeof (ulong_t));

			/*
			 * Unlock mapped pages.
			 */
			(void) segop_lockop(seg, raddr, ssize, attr, func,

			/*
			 * Store VM advise for mapped pages in segment layer.
			 */
			error = segop_advise(seg, raddr, ssize, (uint_t)arg);

			/*
			 * Check for regular errors and special retry error.
			 */
			if (error == IE_RETRY) {
				/*
				 * Need to acquire writers lock, so
				 * have to drop readers lock and start
				 * all over again.
				 */
			} else if (error == IE_REATTACH) {
				/*
				 * Find segment for current address
				 * because current segment just got
				 * split or concatenated
				 */
				seg = as_segat(as, raddr);

		case MC_INHERIT_ZERO:
			error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);

			panic("as_ctl: bad operation %d", func);

	if (func == MC_LOCK)
		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));

	/*
	 * If the lower levels returned EDEADLK for a segment lockop,
	 * it means that we should retry the operation. Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (error == EDEADLK) {
fc_decode(faultcode_t fault_err)
	switch (FC_CODE(fault_err)) {
	case FC_OBJERR:
		error = FC_ERRNO(fault_err);
/*
 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
 * lists from each segment and copy them to one contiguous shadow list (plist)
 * as expected by the caller. Save pointers to per segment shadow lists at
 * the tail of plist so that they can be used during as_pageunlock().
 */
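/*
 * For example, locking 8 pages that straddle two segments yields a plist
 * of 8 + 2 entries: plist[0..7] hold the page pointers copied from both
 * per-segment shadow lists, while plist[8] and plist[9] remember the
 * addresses of those per-segment lists for the later unlock.
 */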
as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
    caddr_t addr, size_t size, enum seg_rw rw)
	caddr_t sv_addr = addr;
	size_t sv_size = size;
	struct seg *sv_seg = seg;
	pgcnt_t npages = btop(size);
	faultcode_t fault_err = 0;
	extern const struct seg_ops segspt_shmops;

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));

	/*
	 * Count the number of segments covered by the range we are about to
	 * lock. The segment count is used to size the shadow list we return
	 * back to the caller.
	 */
	for (; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {

			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || addr != seg->s_base) {

			/*
			 * Do a quick check if subsequent segments
			 * will most likely support pagelock.
			 */
			if (seg->s_ops == &segvn_ops) {

				if (segop_getvp(seg, addr, &vp) != 0 ||

			} else if (seg->s_ops != &segspt_shmops) {

		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;

	plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);

	for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);

		ASSERT(cnt < segcnt);

		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;

		pl = &plist[npages + cnt];
		error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGELOCK, rw);

		ASSERT(plist[npages + cnt] != NULL);
		ASSERT(pl_off + btop(ssize) <= npages);
		bcopy(plist[npages + cnt], &plist[pl_off],
		    btop(ssize) * sizeof (page_t *));
		pl_off += btop(ssize);

	ASSERT(cnt == segcnt - 1);

	/*
	 * one of pagelock calls failed. The error type is in error variable.
	 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
	 * type is either EFAULT or ENOTSUP. Otherwise just return the error
	 * back to the caller.
	 */

	for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);

		ASSERT(cnt < segcnt);

		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}

	kmem_free(plist, (npages + segcnt) * sizeof (page_t *));

	if (error != ENOTSUP && error != EFAULT) {

	/*
	 * If we are here because pagelock failed due to the need to cow fault
	 * in the pages we want to lock F_SOFTLOCK will do this job and in
	 * next as_pagelock() call for this address range pagelock will
	 * hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
/*
 * lock pages in a given address space.  Return shadow list.  If
 * the list is NULL, the MMU mapping is also locked.
 */
int
as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
    size_t size, enum seg_rw rw)
{
	size_t rsize;
	caddr_t raddr;
	faultcode_t fault_err;
	struct seg *seg;
	int err;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * if the request crosses two segments let
	 * as_fault handle it.
	 */
	AS_LOCK_ENTER(as, RW_READER);

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}
	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize > seg->s_base + seg->s_size) {
		return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
	}
	if (raddr + rsize <= raddr) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}

	/*
	 * try to lock pages and pass back shadow list
	 */
	err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);

	AS_LOCK_EXIT(as);

	if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
		return (err);
	}

	/*
	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
	 * to no pagelock support for this segment or pages need to be cow
	 * faulted in.  If fault is needed F_SOFTLOCK will do this job for
	 * this as_pagelock() call and in the next as_pagelock() call for the
	 * same address range pagelock call will hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	*ppp = NULL;

	return (0);
}
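/*
 * Typical use of the pagelock interfaces (an illustrative sketch, not taken
 * from this file): a caller that needs a user buffer held in memory while it
 * operates on it would do roughly
 *
 *	struct page **pplist;
 *
 *	if (as_pagelock(as, &pplist, uaddr, len, S_WRITE) == 0) {
 *		... access the buffer; pplist may be NULL when the pages
 *		were held via the F_SOFTLOCK fallback above ...
 *		as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *	}
 */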
/*
 * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
 * lists from the end of plist and call pageunlock interface for each segment.
 * Drop as lock and free plist.
 */
static void
as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
    struct page **plist, enum seg_rw rw)
{
	ulong_t cnt;
	caddr_t eaddr = addr + size;
	pgcnt_t npages = btop(size);
	size_t ssize;
	page_t **pl;

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(plist != NULL);

	for (cnt = 0; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}

	AS_LOCK_EXIT(as);

	cnt++;
	kmem_free(plist, (npages + cnt) * sizeof (page_t *));
}
/*
 * unlock pages in a given address range
 */
void
as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
    enum seg_rw rw)
{
	struct seg *seg;
	size_t rsize;
	caddr_t raddr;

	/*
	 * if the shadow list is NULL, as_pagelock was
	 * falling back to as_fault
	 */
	if (pp == NULL) {
		(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
		return;
	}

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	ASSERT(seg != NULL);

	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize <= seg->s_base + seg->s_size) {
		(void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
	} else {
		as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
		return;
	}
	AS_LOCK_EXIT(as);
}
int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    boolean_t wait)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;
	size_t pgsz = page_get_pagesize(szc);

setpgsz_top:
	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
		return (EINVAL);
	}

	raddr = addr;
	rsize = size;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

retry:
		error = segop_setpagesize(seg, raddr, ssize, szc);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as);
			goto setpgsz_top;
		}

		if (error == ENOTSUP) {
			error = EINVAL;
			break;
		}

		if (wait && (error == EAGAIN)) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed.  Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	This is not relevant for as_setpagesize()
			 *	because we cannot change the page size for
			 *	driver memory.  The attempt to do so will
			 *	fail with a different error than EAGAIN so
			 *	there's no need to trigger as callbacks like
			 *	as_unmap, as_setprot or as_free would do.
			 */
			mutex_enter(&as->a_contents);
			if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0) {
					cv_broadcast(&as->a_cv);
				}
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as)) {
					cv_wait(&as->a_cv, &as->a_contents);
				}
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setpgsz_top;
		} else if (error != 0) {
			break;
		}
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
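/*
 * Usage sketch (illustrative only; the boolean_t `wait' argument and B_TRUE
 * are assumptions about the caller, not taken from this file): both the
 * address and the length must already be aligned to the requested page size,
 *
 *	pgsz = page_get_pagesize(szc);
 *	if (IS_P2ALIGNED(addr, pgsz) && IS_P2ALIGNED(size, pgsz))
 *		error = as_setpagesize(as, addr, size, szc, B_TRUE);
 *
 * otherwise as_setpagesize() fails with EINVAL, as coded above.
 */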
/*
 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
 * in its chunk where s_szc is less than the szc we want to set.
 */
static int
as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    int *retry)
{
	struct seg *seg;
	size_t ssize;
	int error;

	ASSERT(AS_WRITE_HELD(as));

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		panic("as_iset3_default_lpsize: no seg");
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				panic("as_iset3_default_lpsize: as changed");
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

		if (szc > seg->s_szc) {
			error = segop_setpagesize(seg, raddr, ssize, szc);
			/* Only retry on EINVAL segments that have no vnode. */
			if (error == EINVAL) {
				vnode_t *vp = NULL;
				if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
				    (segop_getvp(seg, raddr, &vp) != 0 ||
				    vp == NULL)) {
					*retry = 1;
				} else {
					*retry = 0;
				}
			}
			if (error) {
				return (error);
			}
		}
	}
	return (0);
}
/*
 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
 * pagesize on each segment in its range, but if any fails with EINVAL,
 * then it reduces the pagesizes to the next size in the bitmap and
 * retries as_iset3_default_lpsize().  The reason why the code retries
 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
 * match the bigger sizes, and (b) it's hard to get this offset (to begin
 * with) to pass to map_pgszcvec().
 */
static int
as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    uint_t szcvec)
{
	int error;
	int retry;

	ASSERT(AS_WRITE_HELD(as));

	for (;;) {
		error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
		if (error == EINVAL && retry) {
			szcvec &= ~(1 << szc);
			if (szcvec <= 1) {
				return (EINVAL);
			}
			szc = highbit(szcvec) - 1;
		} else {
			return (error);
		}
	}
}
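/*
 * Worked example of the szcvec handling above (illustrative numbers only):
 * if the supported size-code bitmap is szcvec == 0x15 (size codes 0, 2 and
 * 4) and we start with szc == 4, a retryable EINVAL clears bit 4 so that
 * szcvec becomes 0x5 and the next attempt uses szc = highbit(0x5) - 1 == 2.
 * A second retryable failure leaves szcvec == 0x1, which is <= 1, so EINVAL
 * is returned and the range keeps the base page size.
 */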
/*
 * as_iset1_default_lpsize() breaks its chunk into areas where existing
 * segments have a smaller szc than we want to set.  For each such area,
 * it calls as_iset2_default_lpsize().
 */
static int
as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    uint_t szcvec)
{
	struct seg *seg;
	size_t ssize;
	caddr_t setaddr = raddr;
	size_t setsize = 0;
	int set;
	int error;

	ASSERT(AS_WRITE_HELD(as));

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		panic("as_iset1_default_lpsize: no seg");
	}
	if (seg->s_szc < szc) {
		set = 1;
	} else {
		set = 0;
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				panic("as_iset1_default_lpsize: as changed");
			}
			if (seg->s_szc >= szc && set) {
				ASSERT(setsize != 0);
				error = as_iset2_default_lpsize(as,
				    setaddr, setsize, szc, szcvec);
				if (error) {
					return (error);
				}
				set = 0;
			} else if (seg->s_szc < szc && !set) {
				setaddr = raddr;
				setsize = 0;
				set = 1;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}
	}
	error = 0;
	if (set) {
		ASSERT(setsize != 0);
		error = as_iset2_default_lpsize(as, setaddr, setsize,
		    szc, szcvec);
	}
	return (error);
}
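/*
 * Illustration of the chunking above (hypothetical segment layout, not from
 * the original source): with a target szc of 2 and a range covering three
 * adjacent segments whose current s_szc values are 0, 3 and 0, the first
 * and third segments each form their own "set" area and are handed
 * separately to as_iset2_default_lpsize(), while the middle segment, whose
 * size code is already >= 2, is skipped.
 */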
/*
 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
 * chunk to as_iset1_default_lpsize().
 */
static int
as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
    int type)
{
	int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
	    flags, rtype, 1);
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	size_t pgsz;
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));

	szcvec &= ~1;
	if (szcvec <= 1) {	/* skip if base page size */
		return (0);
	}

	/* Get the pagesize of the first larger page size. */
	szc = lowbit(szcvec) - 1;
	pgsz = page_get_pagesize(szc);
	eaddr = addr + size;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

	save_szcvec = szcvec;
	szcvec >>= (szc + 1);
	nszc = szc;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(szc > 0);
			ASSERT(a < eaddr);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec;
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			ASSERT(szc > 0);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}
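/*
 * Worked example of the alignment step above (made-up numbers; assumes the
 * smallest large page is 4MB, i.e. pgsz == 0x400000): a request with
 * addr == 0x1234000 and eaddr == 0x1c34000 is trimmed to
 * addr = P2ROUNDUP(0x1234000, 0x400000) == 0x1400000 and
 * eaddr = P2ALIGN(0x1c34000, 0x400000) == 0x1c00000, so only the inner
 * [0x1400000, 0x1c00000) region is considered for the larger page sizes.
 */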
/*
 * Set the default large page size for the range.  Called via memcntl with
 * page size set to 0.  as_set_default_lpsize breaks the range down into
 * chunks with the same type/flags, ignores non-segvn segments, and passes
 * each chunk to as_iset_default_lpsize().
 */
int
as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;
	size_t rsize;
	size_t ssize;
	int rtype, rflags;
	int stype, sflags;
	int error;
	caddr_t	setaddr;
	size_t setsize;
	int segvn;

	if (size == 0)
		return (0);

	AS_LOCK_ENTER(as, RW_WRITER);
again:
	error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}
	if (seg->s_ops == &segvn_ops) {
		rtype = segop_gettype(seg, addr);
		rflags = rtype & (MAP_TEXT | MAP_INITDATA);
		rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
		segvn = 1;
	} else {
		segvn = 0;
	}
	setaddr = raddr;
	setsize = 0;

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= (seg->s_base + seg->s_size)) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
			if (seg->s_ops == &segvn_ops) {
				stype = segop_gettype(seg, raddr);
				sflags = stype & (MAP_TEXT | MAP_INITDATA);
				stype &= (MAP_SHARED | MAP_PRIVATE);
				if (segvn && (rflags != sflags ||
				    rtype != stype)) {
					/*
					 * The next segment is also segvn but
					 * has different flags and/or type.
					 */
					ASSERT(setsize != 0);
					error = as_iset_default_lpsize(as,
					    setaddr, setsize, rflags, rtype);
					if (error) {
						break;
					}
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
				} else if (!segvn) {
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
					segvn = 1;
				}
			} else if (segvn) {
				/* The next segment is not segvn. */
				ASSERT(setsize != 0);
				error = as_iset_default_lpsize(as,
				    setaddr, setsize, rflags, rtype);
				if (error) {
					break;
				}
				segvn = 0;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}
	}
	if (error == 0 && segvn) {
		/* The last chunk when rsize == 0. */
		ASSERT(setsize != 0);
		error = as_iset_default_lpsize(as, setaddr, setsize,
		    rflags, rtype);
	}

	if (error == IE_RETRY) {
		goto again;
	} else if (error == IE_NOMEM) {
		error = EAGAIN;
	} else if (error == ENOTSUP) {
		error = EINVAL;
	} else if (error == EAGAIN) {
		mutex_enter(&as->a_contents);
		if (!AS_ISNOUNMAPWAIT(as)) {
			if (AS_ISUNMAPWAIT(as) == 0) {
				cv_broadcast(&as->a_cv);
			}
			AS_SETUNMAPWAIT(as);
			AS_LOCK_EXIT(as);
			while (AS_ISUNMAPWAIT(as)) {
				cv_wait(&as->a_cv, &as->a_contents);
			}
			mutex_exit(&as->a_contents);
			AS_LOCK_ENTER(as, RW_WRITER);
		} else {
			/*
			 * We may have raced with
			 * segvn_reclaim()/segspt_reclaim(). In this case
			 * clean nounmapwait flag and retry since softlockcnt
			 * in this segment may be already 0.  We don't drop as
			 * writer lock so our number of retries without
			 * sleeping should be very small. See segvn_reclaim()
			 * for more comments.
			 */
			AS_CLRNOUNMAPWAIT(as);
			mutex_exit(&as->a_contents);
		}
		goto again;
	}

	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
/*
 * Setup all of the uninitialized watched pages that we can.
 */
void
as_setwatch(struct as *as)
{
	struct watched_page *pwp;
	struct seg *seg;
	caddr_t vaddr;
	uint_t prot;
	int  err, retrycnt;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
		retrycnt = 0;
	retry:
		vaddr = pwp->wp_vaddr;
		if (pwp->wp_oprot != 0 ||	/* already set up */
		    (seg = as_segat(as, vaddr)) == NULL ||
		    segop_getprot(seg, vaddr, 0, &prot) != 0)
			continue;

		pwp->wp_oprot = prot;
		if (pwp->wp_read)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (pwp->wp_write)
			prot &= ~PROT_WRITE;
		if (pwp->wp_exec)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
			err = segop_setprot(seg, vaddr, PAGESIZE, prot);
			if (err == IE_RETRY) {
				pwp->wp_oprot = 0;
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_prot = prot;
	}
}
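/*
 * Note on the protection stripping above (explanatory, derived from the
 * code): a read or execute watchpoint removes PROT_READ|PROT_WRITE|PROT_EXEC
 * entirely so that any access to the page faults and can be reported, while
 * a write-only watchpoint removes just PROT_WRITE.  The original protections
 * are remembered in wp_oprot so that as_clearwatch() can restore them.
 */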
/*
 * Clear all of the watched pages in the address space.
 */
void
as_clearwatch(struct as *as)
{
	struct watched_page *pwp;
	struct seg *seg;
	caddr_t vaddr;
	uint_t prot;
	int err, retrycnt;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
		retrycnt = 0;
	retry:
		vaddr = pwp->wp_vaddr;
		if (pwp->wp_oprot == 0 ||	/* not set up */
		    (seg = as_segat(as, vaddr)) == NULL)
			continue;

		if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
			err = segop_setprot(seg, vaddr, PAGESIZE, prot);
			if (err == IE_RETRY) {
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_oprot = 0;
		pwp->wp_prot = 0;
	}
}
/*
 * Force a new setup for all the watched pages in the range.
 */
static void
as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct watched_page *pwp;
	struct watched_page tpw;
	caddr_t eaddr = addr + size;
	caddr_t vaddr;
	struct seg *seg;
	int err, retrycnt;
	uint_t	wprot;
	avl_index_t where;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
		retrycnt = 0;
		vaddr = pwp->wp_vaddr;

		wprot = prot;
		if (pwp->wp_read)
			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (pwp->wp_write)
			wprot &= ~PROT_WRITE;
		if (pwp->wp_exec)
			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
		retry:
			seg = as_segat(as, vaddr);
			if (seg == NULL) {
				panic("as_setwatchprot: no seg");
				/*NOTREACHED*/
			}
			err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
			if (err == IE_RETRY) {
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_oprot = prot;
		pwp->wp_prot = wprot;

		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
}
/*
 * Clear all of the watched pages in the range.
 */
static void
as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
{
	caddr_t eaddr = addr + size;
	struct watched_page *pwp;
	struct watched_page tpw;
	uint_t prot;
	struct seg *seg;
	int err, retrycnt;
	avl_index_t where;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

	ASSERT(AS_WRITE_HELD(as));

	while (pwp != NULL && pwp->wp_vaddr < eaddr) {

		if ((prot = pwp->wp_oprot) != 0) {
			retrycnt = 0;

			if (prot != pwp->wp_prot) {
			retry:
				seg = as_segat(as, pwp->wp_vaddr);
				if (seg == NULL)
					continue;
				err = segop_setprot(seg, pwp->wp_vaddr,
				    PAGESIZE, prot);
				if (err == IE_RETRY) {
					ASSERT(retrycnt == 0);
					retrycnt++;
					goto retry;
				}
			}
			pwp->wp_oprot = 0;
			pwp->wp_prot = 0;
		}

		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
}
void
as_signal_proc(struct as *as, k_siginfo_t *siginfo)
{
	proc_t *p;

	mutex_enter(&pidlock);
	for (p = practive; p; p = p->p_next) {
		if (p->p_as == as) {
			mutex_enter(&p->p_lock);
			if (p->p_as == as)
				sigaddq(p, NULL, siginfo, KM_NOSLEEP);
			mutex_exit(&p->p_lock);
		}
	}
	mutex_exit(&pidlock);
}
/*
 * return memory object ID
 */
int
as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
{
	struct seg	*seg;
	int sts;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, addr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}

	sts = segop_getmemid(seg, addr, memidp);

	AS_LOCK_EXIT(as);
	return (sts);
}