 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015, Joyent, Inc.  All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
clock_t deadlk_wait = 1;	/* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);

/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#define	VERIFY_SEGLIST
int do_as_verify = 0;
/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list. A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
	struct as_callback *current_head, *cb;
	caddr_t saddr;
	size_t rsize;

	/* callback function and an event are mandatory */
	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
		return (EINVAL);

	/* Adding a callback after as_free has been called is not allowed */

	/*
	 * vaddr = 0 and size = -1 is used to indicate that the callback range
	 * is the entire address space so no rounding is done in that case.
	 */
	saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)saddr;

	/* check for wraparound */
	if (saddr + rsize < saddr)
		return (ENOMEM);

	/* Allocate and initialize a callback entry */
	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
	if (cb == NULL)
		return (EAGAIN);

	cb->ascb_func = cb_func;
	cb->ascb_arg = arg;
	cb->ascb_events = events;
	cb->ascb_saddr = saddr;
	cb->ascb_len = rsize;

	/* Add the entry to the list */
	mutex_enter(&as->a_contents);
	current_head = as->a_callbacks;
	as->a_callbacks = cb;
	cb->ascb_next = current_head;

	/*
	 * The call to this function may lose in a race with
	 * a pertinent event - eg. a thread does long term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
	 */
	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
		AS_CLRUNMAPWAIT(as);
		cv_broadcast(&as->a_cv);
	}

	mutex_exit(&as->a_contents);
	return (0);
}
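
/*
 * Illustrative sketch (not part of the original file): a driver that keeps
 * pages locked for a long time might register for unmap/setprot events on
 * the range it locked, using its own callback function and argument
 * (my_unlock_cb and my_arg below are hypothetical driver-defined names):
 *
 *	error = as_add_callback(as, my_unlock_cb, my_arg,
 *	    AS_UNMAP_EVENT | AS_SETPROT_EVENT, vaddr, size, KM_SLEEP);
 *
 * and later remove it with as_delete_callback(as, my_arg), treating an
 * AS_CALLBACK_DELETE_DEFERRED return as "the callback is still running and
 * as_do_callbacks will remove the entry".
 */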
/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *	AS_CALLBACK_DELETED  (callback entry found and deleted)
 *	AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *			entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLERS RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (eg. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
	struct as_callback **prevcb = &as->a_callbacks;
	struct as_callback *cb;
	uint_t rc = AS_CALLBACK_NOTFOUND;

	mutex_enter(&as->a_contents);
	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
		if (cb->ascb_arg != arg)
			continue;

		/*
		 * If the events indicate AS_CALLBACK_CALLED, just clear
		 * AS_ALL_EVENT in the events field and wakeup the thread
		 * that may be waiting in as_do_callbacks.  as_do_callbacks
		 * will take care of removing this entry from the list.  In
		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
		 * (AS_CALLBACK_CALLED not set), just remove it from the
		 * list, return the memory and return AS_CALLBACK_DELETED.
		 */
		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
			/* leave AS_CALLBACK_CALLED */
			cb->ascb_events &= ~AS_ALL_EVENT;
			rc = AS_CALLBACK_DELETE_DEFERRED;
			cv_broadcast(&as->a_cv);
		} else {
			*prevcb = cb->ascb_next;
			kmem_free(cb, sizeof (struct as_callback));
			rc = AS_CALLBACK_DELETED;
		}
		break;
	}
	mutex_exit(&as->a_contents);
	return (rc);
}
/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with more
 * locks held but the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	ASSERT(MUTEX_HELD(&as->a_contents));
	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
		/*
		 * If the callback has not already been called, then
		 * check if events or address range pertains.  An event_len
		 * of zero means do an unconditional callback.
		 */
		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
		    (event_addr + event_len < cb->ascb_saddr) ||
		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
			continue;
		}
		break;
	}
	return (cb);
}
/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void *cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}

	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}
/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - not event of interest
 *    - not address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
	struct as_callback *cb;

	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
		as_execute_callback(as, cb, events);
		return (-1);
	}
	return (0);
}
/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
	struct seg *seg = as->a_seglast;
	avl_index_t where;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL &&
	    seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, &where);
	if (seg != NULL)
		return (as->a_seglast = seg);

	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
	if (seg == NULL && tail)
		seg = avl_last(&as->a_segtree);
	return (as->a_seglast = seg);
}
#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
void
as_verify(struct as *as)
{
	struct seg *seg, *seglast, *p, *n;
	uint_t nsegs = 0;

	if (do_as_verify == 0)
		return;

	seglast = as->a_seglast;

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
		ASSERT(seg->s_as == as);
		p = AS_SEGPREV(as, seg);
		n = AS_SEGNEXT(as, seg);
		ASSERT(p == NULL || p->s_as == as);
		ASSERT(p == NULL || p->s_base < seg->s_base);
		ASSERT(n == NULL || n->s_base > seg->s_base);
		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
		if (seg == seglast)
			seglast = NULL;
		nsegs++;
	}
	ASSERT(seglast == NULL);
	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */
/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;
again:

	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			if (addr >= base || eaddr > base) {
				extern const struct seg_ops segnf_ops;

				/*
				 * no-fault segs must disappear if overlaid.
				 * XXX need new segment type so
				 * we don't have to check s_ops
				 */
				if (seg->s_ops == &segnf_ops) {
					seg_unmap(seg);
					goto again;
				}
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}
int
as_removeseg(struct as *as, struct seg *seg)
{
	avl_tree_t *t;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (seg == NULL)
		return (-1);

	t = &as->a_segtree;
	if (as->a_seglast == seg)
		as->a_seglast = NULL;
	as->a_lastgaphl = NULL;

	/*
	 * if this segment is at an address higher than
	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
	 */
	if (as->a_lastgap &&
	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
		as->a_lastgap = AVL_NEXT(t, seg);

	/*
	 * remove the segment from the seg tree
	 */
	avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}
/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
	struct seg *seg = as->a_seglast;

	ASSERT(AS_LOCK_HELD(as));

	if (seg != NULL && seg->s_base <= addr &&
	    addr < seg->s_base + seg->s_size)
		return (seg);

	seg = avl_find(&as->a_segtree, &addr, NULL);
	return (seg);
}
/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
	mutex_enter(&as->a_contents);
	while (AS_ISCLAIMGAP(as))
		cv_wait(&as->a_cv, &as->a_contents);
	AS_SETCLAIMGAP(as);
	mutex_exit(&as->a_contents);
}
/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}
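
/*
 * Illustrative sketch (not part of the original file): callers that pick a
 * virtual address and then map it typically bracket the hole search and the
 * map with the range lock so two threads cannot claim the same hole:
 *
 *	as_rangelock(as);
 *	if (as_gap(as, len, &base, &len, AH_LO, NULL) == 0)
 *		error = as_map(as, base, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 *
 * (base, len and crargs above are hypothetical locals of the caller.)
 */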
/*
 * compare segments (or just an address) by segment address range
 */
static int
as_segcompar(const void *x, const void *y)
{
	struct seg *a = (struct seg *)x;
	struct seg *b = (struct seg *)y;

	if (a->s_base < b->s_base)
		return (-1);
	if (a->s_base >= b->s_base + b->s_size)
		return (1);
	return (0);
}
void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}
static void
as_destructor(void *buf, void *cdrarg)
{
	struct as *as = buf;

	avl_destroy(&as->a_segtree);
	mutex_destroy(&as->a_contents);
	cv_destroy(&as->a_cv);
	rw_destroy(&as->a_lock);
}
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
	struct as *as;

	as = kmem_cache_alloc(as_cache, KM_SLEEP);

	as->a_seglast = NULL;
	gethrestime(&as->a_updatetime);
	as->a_objectdir = NULL;
	as->a_userlimit = (caddr_t)USERLIMIT;
	as->a_lastgap = NULL;
	as->a_lastgaphl = NULL;
	as->a_callbacks = NULL;

	AS_LOCK_ENTER(as, RW_WRITER);
	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
	AS_LOCK_EXIT(as);

	return (as);
}
/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	boolean_t free_started = B_FALSE;

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, RW_WRITER);

	if (!free_started) {
		free_started = B_TRUE;
		hat_free_start(hat);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
retry:
		err = segop_unmap(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				AS_LOCK_EXIT(as);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	AS_LOCK_EXIT(as);

	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}
int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t purgesize = 0;
	int error;

	AS_LOCK_ENTER(as, RW_WRITER);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, RW_WRITER);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas);
			as_free(newas);
			AS_LOCK_EXIT(as);
			return (-1);
		}
		if ((error = segop_dup(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			seg_free(newseg);
			AS_LOCK_EXIT(newas);
			as_free(newas);
			AS_LOCK_EXIT(as);
			return (error);
		}
		newas->a_size += seg->s_size;
	}
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

	AS_LOCK_EXIT(newas);
	AS_LOCK_EXIT(as);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}
/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;
	faultcode_t res = 0;
	caddr_t addrsav;
	struct seg *segsav;
	int as_lock_held;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting for a
	 * pagefault.  This is to avoid deadlock while debugging a process
	 * via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * same length must be used when we softlock and softunlock.  We
	 * don't support softunlocking lengths less than the original length
	 * when there is largepage support.  See seg_dev.c for more
	 * comments.
	 */
	switch (type) {
	case F_SOFTLOCK:
		CPU_STATS_ADD_K(vm, softlock, 1);
		break;
	case F_PROT:
		CPU_STATS_ADD_K(vm, prot_fault, 1);
		break;
	case F_INVAL:
		CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
		if (as == &kas)
			CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
		break;
	default:
		break;
	}

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, RW_READER);

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			if (lwp != NULL)
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		res = segop_fault(hat, seg, raddr, ssize, type, rw);
		if (res != 0)
			break;
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * an address that is outside the seg's range.)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) segop_fault(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
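
/*
 * Illustrative sketch (not part of the original file): kernel code that
 * needs user pages to stay resident across an access typically soft-locks
 * the range and later soft-unlocks the same range with the same length:
 *
 *	if (as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_WRITE) == 0) {
 *		... access the pages ...
 *		(void) as_fault(as->a_hat, as, addr, len, F_SOFTUNLOCK,
 *		    S_WRITE);
 *	}
 *
 * As noted above, the unlock must use the same length that was used for
 * the lock.
 */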
/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	faultcode_t res = 0;
	klwp_t *lwp = ttolwp(curthread);

retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting
	 * for a pagefault.  This is to avoid deadlock while debugging
	 * a process via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		if (lwp != NULL)
			lwp->lwp_nostop--;
		return (FC_NOMAP);
	}

	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		res = segop_faulta(seg, raddr);
		if (res != 0)
			break;
	}
	AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;
	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * it means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * we only want to lock as a writer when necessary.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = segop_setprot(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as);
	return (error);
}
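
/*
 * Illustrative sketch (not part of the original file): mprotect()-style
 * callers simply pass the user range and the desired protections, e.g. to
 * make one page of a user mapping read-only:
 *
 *	error = as_setprot(as, addr, PAGESIZE, PROT_READ | PROT_USER);
 *
 * A non-zero return (for example ENOMEM when the range contains a hole)
 * indicates failure.
 */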
/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/*
	 * This is ugly as sin...
	 * Normally, we only acquire the address space readers lock.
	 * However, if the address space has watchpoints present,
	 * we must acquire the writer lock on the address space for
	 * the benefit of as_clearwatchprot() and as_setwatchprot().
	 */
	if (avl_numnodes(&as->a_wpage) != 0)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		error = segop_checkprot(seg, raddr, ssize, prot);
		if (error != 0)
			break;
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order, following s_next.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base;	/* raddr was in a gap */

		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		rsize = 0;
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			rsize = ssize;

retry:
		err = segop_unmap(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			AS_LOCK_EXIT(as);
			goto top;
		} else if (err) {
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			return (-1);
		}

		as->a_size -= ssize;
		if (rsize)
			as->a_resvsize -= rsize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as);
	return (0);
}
static int
as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	struct seg *seg;
	size_t pgsz;
	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);

	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL)
			return (ENOMEM);
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	eaddr = addr + size;
	save_szcvec = szcvec;
	szcvec >>= 1;
	szc = 0;
	nszc = 0;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(a < eaddr);
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL)
				return (ENOMEM);
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = 1;
			if (do_off)
				vn_a->offset += segsize;
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec | 1; /* add 8K pages */
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			segsize = a - addr;
			seg = seg_alloc(as, addr, segsize);
			if (seg == NULL)
				return (ENOMEM);
			error = (*crfp)(seg, vn_a);
			if (error != 0) {
				seg_free(seg);
				return (error);
			}
			as->a_size += segsize;
			as->a_resvsize += segsize;
			*segcreated = 1;
			if (do_off)
				vn_a->offset += segsize;
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}
static int
as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
	    type, 0);
	int error;
	struct seg *seg;
	struct vattr va;
	u_offset_t eoff;
	size_t save_size = 0;
	extern size_t textrepl_size_thresh;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp != NULL);
	ASSERT(vn_a->amp == NULL);

again:
	if (szcvec <= 1) {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL)
			return (ENOMEM);
		error = (*crfp)(seg, vn_a);
		if (error != 0) {
			seg_free(seg);
		} else {
			as->a_size += size;
			as->a_resvsize += size;
		}
		return (error);
	}

	va.va_mask = AT_SIZE;
	if (fop_getattr(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
		szcvec = 0;
		goto again;
	}
	eoff = vn_a->offset & PAGEMASK;
	if (eoff >= va.va_size) {
		szcvec = 0;
		goto again;
	}
	eoff += size;
	if (btopr(va.va_size) < btopr(eoff)) {
		save_size = size;
		size = va.va_size - (vn_a->offset & PAGEMASK);
		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
		    type, 0);
		if (szcvec <= 1) {
			size = save_size;
			goto again;
		}
	}

	if (size > textrepl_size_thresh) {
		vn_a->flags |= _MAP_TEXTREPL;
	}
	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
	    segcreated);
	if (error != 0)
		return (error);
	if (save_size) {
		addr += size;
		size = save_size - size;
		szcvec = 0;
		goto again;
	}
	return (0);
}
/*
 * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
 */
static int
as_map_ansegs(struct as *as, caddr_t addr, size_t size,
    int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
{
	uint_t szcvec;
	uchar_t type;

	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
	if (vn_a->type == MAP_SHARED) {
		type = MAPPGSZC_SHM;
	} else if (vn_a->type == MAP_PRIVATE) {
		if (vn_a->szc == AS_MAP_HEAP) {
			type = MAPPGSZC_HEAP;
		} else if (vn_a->szc == AS_MAP_STACK) {
			type = MAPPGSZC_STACK;
		} else {
			type = MAPPGSZC_PRIVM;
		}
	}
	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
	    (vn_a->flags & MAP_TEXT), type, 0);
	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(vn_a->vp == NULL);

	return (as_map_segvn_segs(as, addr, size, szcvec,
	    crfp, vn_a, segcreated));
}
int
as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
{
	AS_LOCK_ENTER(as, RW_WRITER);
	return (as_map_locked(as, addr, size, crfp, argsp));
}
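
/*
 * Illustrative sketch (not part of the original file): a typical mmap()-path
 * caller fills in a struct segvn_crargs and lets segvn_create build the
 * segment (crargs below is a hypothetical local of the caller):
 *
 *	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 *
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *
 * as_map takes the address space lock as writer and as_map_locked drops it
 * before returning, so the caller must not already hold a_lock.
 */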
int
as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
    void *argsp)
{
	struct seg *seg = NULL;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error;
	int unmap = 0;
	/*
	 * The use of a_proc is preferred to handle the case where curproc is
	 * a door_call server and is allocating memory in the client's (a_proc)
	 * address space.
	 * When creating a shared memory segment a_proc will be NULL so we
	 * fallback to curproc in that case.
	 */
	struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
	struct segvn_crargs crargs;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * check for wrap around
	 */
	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
		AS_LOCK_EXIT(as);

		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (unmap)
				(void) as_unmap(as, addr, size);
			return (error);
		}
	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
		crargs = *(struct segvn_crargs *)argsp;
		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
		if (error != 0) {
			AS_LOCK_EXIT(as);
			if (unmap)
				(void) as_unmap(as, addr, size);
			return (error);
		}
	} else {
		seg = seg_alloc(as, addr, size);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			return (ENOMEM);
		}

		error = (*crfp)(seg, argsp);
		if (error != 0) {
			seg_free(seg);
			AS_LOCK_EXIT(as);
			return (error);
		}
		/*
		 * Add size now so as_unmap will work if as_ctl fails.
		 */
		as->a_size += rsize;
		as->a_resvsize += rsize;
	}

	as_setwatch(as);

	/*
	 * If the address space is locked,
	 * establish memory locks for the new segment.
	 */
	mutex_enter(&as->a_contents);
	if (AS_ISPGLCK(as)) {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
		if (error != 0)
			(void) as_unmap(as, addr, size);
	} else {
		mutex_exit(&as->a_contents);
		AS_LOCK_EXIT(as);
	}
	return (error);
}
/*
 * Delete all segments in the address space marked with S_PURGE.
 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
 * These segments are deleted as a first step before calls to as_gap(), so
 * that they don't affect mmap() or shmat().
 */
void
as_purge(struct as *as)
{
	struct seg *seg;
	struct seg *next_seg;

	/*
	 * the setting of NEEDSPURGE is protected by as_rangelock(), so
	 * no need to grab a_contents mutex for this check
	 */
	if ((as->a_flags & AS_NEEDSPURGE) == 0)
		return;

	AS_LOCK_ENTER(as, RW_WRITER);
	next_seg = NULL;
	seg = AS_SEGFIRST(as);
	while (seg != NULL) {
		next_seg = AS_SEGNEXT(as, seg);
		if (seg->s_flags & S_PURGE)
			(void) segop_unmap(seg, seg->s_base, seg->s_size);
		seg = next_seg;
	}
	AS_LOCK_EXIT(as);

	mutex_enter(&as->a_contents);
	as->a_flags &= ~AS_NEEDSPURGE;
	mutex_exit(&as->a_contents);
}
/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
 * the hole that is within range, and 0 is returned.  On failure, -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, RW_READER);
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as);
			return (0);
		} else {
			AS_LOCK_EXIT(as);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we reset
	 * sseg to denote the highest-addressed segment.
	 */
	forward = (flags & AH_DIR) == AH_LO;
	if (forward) {
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);
	} else {

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
			lseg = hseg;
			hseg = NULL;
		} else {
			lseg = AS_SEGPREV(as, hseg);
		}
	}

	for (;;) {
		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)
			goto cont;
		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */
		if (lo < lobound)
			lo = lobound;
		if (hi > hibound)
			hi = hibound;
		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.  If the hole is too small, no need
		 * to do the further checks since they will fail.
		 */
		*basep = lo;
		*lenp = hi - lo;
		if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
		    minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
		    ((flags & AH_CONTAIN) == 0 ||
		    (*basep <= addr && *basep + *lenp > addr))) {
			if (!forward)
				as->a_lastgap = hseg;
			if (hseg != NULL)
				as->a_lastgaphl = hseg;
			else
				as->a_lastgaphl = lseg;
			AS_LOCK_EXIT(as);
			return (0);
		}
	cont:
		/*
		 * Move to the next hole.
		 */
		if (forward) {
			lseg = hseg;
			if (lseg == NULL)
				break;
			hseg = AS_SEGNEXT(as, hseg);
		} else {
			hseg = lseg;
			if (hseg == NULL)
				break;
			lseg = AS_SEGPREV(as, lseg);
		}
	}
	if (fast_path && (align != 0 || save_redzone != 0)) {
		fast_path = 0;
		minlen = save_minlen;
		redzone = save_redzone;
		goto retry;
	}
	*basep = save_base;
	*lenp = save_len;
	AS_LOCK_EXIT(as);
	return (-1);
}
/*
 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, base and len are set to reflect the part of
 * the hole that is within range, and 0 is returned, otherwise,
 * -1 is returned.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
    caddr_t addr)
{
	return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
}
/*
 * Return the next range within [base, base + len) that is backed
 * with "real memory".  Skip holes and non-seg_vn segments.
 * We're lazy and only return one segment at a time.
 */
int
as_memory(struct as *as, caddr_t *basep, size_t *lenp)
{
	extern const struct seg_ops segspt_shmops;	/* needs a header file */
	struct seg *seg;
	caddr_t addr, eaddr;
	caddr_t segend;

	AS_LOCK_ENTER(as, RW_READER);

	addr = *basep;
	eaddr = addr + *lenp;

	seg = as_findseg(as, addr, 0);
	if (seg != NULL)
		addr = MAX(seg->s_base, addr);

	for (;;) {
		if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
			AS_LOCK_EXIT(as);
			return (EFAULT);
		}

		if (seg->s_ops == &segvn_ops) {
			segend = seg->s_base + seg->s_size;
			break;
		}

		/*
		 * We do ISM by looking into the private data
		 * to determine the real size of the segment.
		 */
		if (seg->s_ops == &segspt_shmops) {
			segend = seg->s_base + spt_realsize(seg);
			if (addr < segend)
				break;
		}

		seg = AS_SEGNEXT(as, seg);

		if (seg != NULL)
			addr = seg->s_base;
	}

	*basep = addr;

	if (segend > eaddr)
		*lenp = eaddr - addr;
	else
		*lenp = segend - addr;

	AS_LOCK_EXIT(as);
	return (0);
}
/*
 * Determine whether data from the mappings in interval [addr, addr + size)
 * are in the primary memory (core) cache.
 */
int
as_incore(struct as *as, caddr_t addr,
    size_t size, char *vec, size_t *sizep)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;		/* rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t isize;		/* iteration size */
	int error = 0;		/* result, assume success */

	*sizep = 0;
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (-1);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = -1;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
		*sizep += isize = segop_incore(seg, raddr, ssize, vec);
		if (isize != ssize) {
			error = -1;
			break;
		}
		vec += btopr(ssize);
	}
	AS_LOCK_EXIT(as);
	return (error);
}
static void
as_segunlock(struct seg *seg, caddr_t addr, int attr,
    ulong_t *bitmap, size_t position, size_t npages)
{
	caddr_t range_start;
	size_t pos1 = position;
	size_t pos2;
	size_t size;
	size_t end_pos = npages + position;

	while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
		size = ptob((pos2 - pos1));
		range_start = (caddr_t)((uintptr_t)addr +
		    ptob(pos1 - position));

		(void) segop_lockop(seg, range_start, size, attr, MC_UNLOCK,
		    (ulong_t *)NULL, (size_t)0);
		pos1 = pos2;
	}
}
static void
as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
    caddr_t raddr, size_t rsize)
{
	struct seg *seg = as_segat(as, raddr);
	size_t ssize;

	while (rsize != 0) {
		if (raddr >= seg->s_base + seg->s_size)
			seg = AS_SEGNEXT(as, seg);

		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));

		rsize -= ssize;
		raddr += ssize;
	}
}
/*
 * Cache control operations over the interval [addr, addr + size) in
 * address space "as".
 */
int
as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
    uintptr_t arg, ulong_t *lock_map, size_t pos)
{
	struct seg *seg;	/* working segment */
	caddr_t raddr;		/* rounded down addr */
	caddr_t initraddr;	/* saved initial rounded down addr */
	size_t rsize;		/* rounded up size */
	size_t initrsize;	/* saved initial rounded up size */
	size_t ssize;		/* size of seg */
	int error = 0;		/* result */
	size_t mlock_size;	/* size of bitmap */
	ulong_t *mlock_map;	/* pointer to bitmap used */
				/* to represent the locked */
				/* pages. */

retry:
	if (error == IE_RETRY)
		AS_LOCK_ENTER(as, RW_WRITER);
	else
		AS_LOCK_ENTER(as, RW_READER);

	/*
	 * If these are address space lock/unlock operations, loop over
	 * all segments in the address space, as appropriate.
	 */
	if (func == MC_LOCKAS) {
		size_t npages = 0;
		size_t rlen = 0;	/* rounded as length */
		ulong_t idx = 0;

		if (arg & MCL_FUTURE) {
			mutex_enter(&as->a_contents);
			AS_SETPGLCK(as);
			mutex_exit(&as->a_contents);
		}
		if ((arg & MCL_CURRENT) == 0) {
			AS_LOCK_EXIT(as);
			return (0);
		}

		seg = AS_SEGFIRST(as);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			return (0);
		}

		do {
			raddr = (caddr_t)((uintptr_t)seg->s_base &
			    (uintptr_t)PAGEMASK);
			rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
			    PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
		} while ((seg = AS_SEGNEXT(as, seg)) != NULL);

		mlock_size = BT_BITOUL(btopr(rlen));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
			AS_LOCK_EXIT(as);
			return (EAGAIN);
		}

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = segop_lockop(seg, seg->s_base,
			    seg->s_size, attr, MC_LOCK, mlock_map, pos);
			if (error != 0)
				break;
			pos += seg_pages(seg);
		}

		if (error) {
			for (seg = AS_SEGFIRST(as); seg != NULL;
			    seg = AS_SEGNEXT(as, seg)) {

				raddr = (caddr_t)((uintptr_t)seg->s_base &
				    (uintptr_t)PAGEMASK);
				npages = seg_pages(seg);
				as_segunlock(seg, raddr, attr, mlock_map,
				    idx, npages);
				idx += npages;
			}
		}

		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
		AS_LOCK_EXIT(as);
		goto lockerr;
	} else if (func == MC_UNLOCKAS) {
		mutex_enter(&as->a_contents);
		AS_CLRPGLCK(as);
		mutex_exit(&as->a_contents);

		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
			error = segop_lockop(seg, seg->s_base,
			    seg->s_size, attr, MC_UNLOCK, NULL, 0);
			if (error != 0)
				break;
		}

		AS_LOCK_EXIT(as);
		goto lockerr;
	}

	/*
	 * Normalize addresses and sizes.
	 */
	initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	/*
	 * Get initial segment.
	 */
	if ((seg = as_segat(as, raddr)) == NULL) {
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	if (func == MC_LOCK) {
		mlock_size = BT_BITOUL(btopr(rsize));
		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
			AS_LOCK_EXIT(as);
			return (EAGAIN);
		}
	}

	/*
	 * Loop over all segments.  If a hole in the address range is
	 * discovered, then fail.  For each segment, perform the appropriate
	 * control operation.
	 */
	while (rsize != 0) {

		/*
		 * Make sure there's no hole, calculate the portion
		 * of the next segment to be operated over.
		 */
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				if (func == MC_LOCK) {
					as_unlockerr(as, attr, mlock_map,
					    initraddr, initrsize - rsize);
					kmem_free(mlock_map,
					    mlock_size * sizeof (ulong_t));
				}
				AS_LOCK_EXIT(as);
				return (ENOMEM);
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		/*
		 * Dispatch on specific function.
		 */
		switch (func) {

		/*
		 * Synchronize cached data from mappings with backing
		 * objects.
		 */
		case MC_SYNC:
			if (error = segop_sync(seg, raddr, ssize,
			    attr, (uint_t)arg)) {
				AS_LOCK_EXIT(as);
				return (error);
			}
			break;

		/*
		 * Lock pages in memory.
		 */
		case MC_LOCK:
			if (error = segop_lockop(seg, raddr, ssize,
			    attr, func, mlock_map, pos)) {
				as_unlockerr(as, attr, mlock_map, initraddr,
				    initrsize - rsize + ssize);
				kmem_free(mlock_map, mlock_size *
				    sizeof (ulong_t));
				AS_LOCK_EXIT(as);
				goto lockerr;
			}
			break;

		/*
		 * Unlock mapped pages.
		 */
		case MC_UNLOCK:
			(void) segop_lockop(seg, raddr, ssize, attr, func,
			    (ulong_t *)NULL, (size_t)0);
			break;

		/*
		 * Store VM advise for mapped pages in segment layer.
		 */
		case MC_ADVISE:
			error = segop_advise(seg, raddr, ssize, (uint_t)arg);

			/*
			 * Check for regular errors and special retry error
			 */
			if (error) {
				if (error == IE_RETRY) {
					/*
					 * Need to acquire writers lock, so
					 * have to drop readers lock and start
					 * all over again
					 */
					AS_LOCK_EXIT(as);
					goto retry;
				} else if (error == IE_REATTACH) {
					/*
					 * Find segment for current address
					 * because current segment just got
					 * split or concatenated
					 */
					seg = as_segat(as, raddr);
					if (seg == NULL) {
						AS_LOCK_EXIT(as);
						return (ENOMEM);
					}
				} else {
					/*
					 * Regular error
					 */
					AS_LOCK_EXIT(as);
					return (error);
				}
			}
			break;

		case MC_INHERIT_ZERO:
			error = segop_inherit(seg, raddr, ssize, SEGP_INH_ZERO);
			if (error != 0) {
				AS_LOCK_EXIT(as);
				return (error);
			}
			break;

		default:
			panic("as_ctl: bad operation %d", func);
			/*NOTREACHED*/
		}

		rsize -= ssize;
		raddr += ssize;
	}

	if (func == MC_LOCK)
		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
	AS_LOCK_EXIT(as);
	return (0);
lockerr:

	/*
	 * If the lower levels returned EDEADLK for a segment lockop,
	 * it means that we should retry the operation.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (error == EDEADLK) {
		delay(deadlk_wait);
		error = 0;
		goto retry;
	}
	return (error);
}
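
/*
 * Illustrative sketch (not part of the original file): memcntl()-style
 * callers drive MC_LOCK/MC_UNLOCK through as_ctl; this file itself issues
 * the same call when a new mapping is created in a page-locked address
 * space:
 *
 *	error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
 *	...
 *	(void) as_ctl(as, addr, size, MC_UNLOCK, 0, 0, NULL, 0);
 *
 * MC_LOCKAS/MC_UNLOCKAS operate on every segment and take the mlockall
 * flags (MCL_CURRENT/MCL_FUTURE) in arg instead of an address range.
 */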
int
fc_decode(faultcode_t fault_err)
{
	int error = 0;

	switch (FC_CODE(fault_err)) {
	case FC_OBJERR:
		error = FC_ERRNO(fault_err);
		break;
	case FC_PROT:
		error = EACCES;
		break;
	default:
		error = EFAULT;
		break;
	}
	return (error);
}

/*
 * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
 * lists from each segment and copy them to one contiguous shadow list (plist)
 * as expected by the caller.  Save pointers to per segment shadow lists at
 * the tail of plist so that they can be used during as_pageunlock().
 */
static int
as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
    caddr_t addr, size_t size, enum seg_rw rw)
{
	caddr_t sv_addr = addr;
	size_t sv_size = size;
	struct seg *sv_seg = seg;
	ulong_t segcnt = 1;
	ulong_t cnt;
	size_t ssize;
	pgcnt_t npages = btop(size);
	page_t **plist;
	page_t **pl;
	caddr_t eaddr;
	int error;
	faultcode_t fault_err = 0;
	pgcnt_t pl_off;
	extern const struct seg_ops segspt_shmops;

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));

	/*
	 * Count the number of segments covered by the range we are about to
	 * lock. The segment count is used to size the shadow list we return
	 * back to the caller.
	 */
	for (; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || addr != seg->s_base) {
				AS_LOCK_EXIT(as);
				return (EFAULT);
			}
			/*
			 * Do a quick check if subsequent segments
			 * will most likely support pagelock.
			 */
			if (seg->s_ops == &segvn_ops) {
				vnode_t *vp;

				if (segop_getvp(seg, addr, &vp) != 0 ||
				    vp != NULL) {
					AS_LOCK_EXIT(as);
					goto slow;
				}
			} else if (seg->s_ops != &segspt_shmops) {
				AS_LOCK_EXIT(as);
				goto slow;
			}
			segcnt++;
		}
		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = size;
		}
	}
	ASSERT(segcnt > 1);

	plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);

	addr = sv_addr;
	size = sv_size;
	seg = sv_seg;

	for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
			ASSERT(cnt < segcnt);
		}
		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = size;
		}
		pl = &plist[npages + cnt];
		error = segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGELOCK, rw);
		if (error) {
			break;
		}
		ASSERT(plist[npages + cnt] != NULL);
		ASSERT(pl_off + btop(ssize) <= npages);
		bcopy(plist[npages + cnt], &plist[pl_off],
		    btop(ssize) * sizeof (page_t *));
		pl_off += btop(ssize);
	}

	if (size == 0) {
		AS_LOCK_EXIT(as);
		ASSERT(cnt == segcnt - 1);
		*ppp = plist;
		return (0);
	}

	/*
	 * one of pagelock calls failed. The error type is in error variable.
	 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
	 * type is either EFAULT or ENOTSUP. Otherwise just return the error
	 * back to the caller.
	 */

	eaddr = addr;
	seg = sv_seg;

	for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
			ASSERT(cnt < segcnt);
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}

	AS_LOCK_EXIT(as);

	kmem_free(plist, (npages + segcnt) * sizeof (page_t *));

	if (error != ENOTSUP && error != EFAULT) {
		return (error);
	}

slow:
	/*
	 * If we are here because pagelock failed due to the need to cow fault
	 * in the pages we want to lock F_SOFTLOCK will do this job and in
	 * next as_pagelock() call for this address range pagelock will
	 * hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	*ppp = NULL;

	return (0);
}
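
/*
 * Illustrative layout note, not part of the original source: for a request of
 * `npages' pages spanning `segcnt' segments, the shadow list built above is
 * laid out as
 *
 *	plist[0 .. npages - 1]			flattened page_t pointers
 *						returned to the caller
 *	plist[npages .. npages + segcnt - 1]	per-segment shadow list
 *						pointers, consumed later by
 *						as_pageunlock_segs()
 */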

/*
 * lock pages in a given address space. Return shadow list. If
 * the list is NULL, the MMU mapping is also locked.
 */
int
as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
    size_t size, enum seg_rw rw)
{
	size_t rsize;
	caddr_t raddr;
	faultcode_t fault_err;
	struct seg *seg;
	int err;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * if the request crosses two segments let
	 * as_fault handle it.
	 */
	AS_LOCK_ENTER(as, RW_READER);

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}
	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize > seg->s_base + seg->s_size) {
		return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
	}
	if (raddr + rsize <= raddr) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}

	/*
	 * try to lock pages and pass back shadow list
	 */
	err = segop_pagelock(seg, raddr, rsize, ppp, L_PAGELOCK, rw);

	AS_LOCK_EXIT(as);

	if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
		return (err);
	}

	/*
	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
	 * to no pagelock support for this segment or pages need to be cow
	 * faulted in. If fault is needed F_SOFTLOCK will do this job for
	 * this as_pagelock() call and in the next as_pagelock() call for the
	 * same address range pagelock call will hopefully succeed.
	 */
	fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	*ppp = NULL;

	return (0);
}

/*
 * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
 * lists from the end of plist and call pageunlock interface for each segment.
 * Drop as lock and free plist.
 */
static void
as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
    struct page **plist, enum seg_rw rw)
{
	ulong_t cnt;
	caddr_t eaddr = addr + size;
	pgcnt_t npages = btop(size);
	size_t ssize;
	page_t **pl;

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(plist != NULL);

	for (cnt = 0; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) segop_pagelock(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}

	AS_LOCK_EXIT(as);

	cnt++;
	kmem_free(plist, (npages + cnt) * sizeof (page_t *));
}

/*
 * unlock pages in a given address range
 */
void
as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
    enum seg_rw rw)
{
	struct seg *seg;
	size_t rsize;
	caddr_t raddr;

	/*
	 * if the shadow list is NULL, as_pagelock was
	 * falling back to as_fault
	 */
	if (pp == NULL) {
		(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
		return;
	}

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, raddr);
	ASSERT(seg != NULL);

	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
	if (raddr + rsize <= seg->s_base + seg->s_size) {
		(void) segop_pagelock(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
	} else {
		as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
		return;
	}
	AS_LOCK_EXIT(as);
}
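
/*
 * Illustrative usage sketch, not part of the original source: a physio-style
 * caller is assumed to pair as_pagelock() and as_pageunlock() around the
 * transfer; do_transfer() below is a hypothetical placeholder:
 *
 *	struct page **pplist;
 *	int err;
 *
 *	err = as_pagelock(as, &pplist, uaddr, len, S_WRITE);
 *	if (err != 0)
 *		return (err);
 *	err = do_transfer(uaddr, len, pplist);
 *	as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *	return (err);
 */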

int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    boolean_t wait)
{
	struct seg *seg;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;
	size_t pgsz = page_get_pagesize(szc);

setpgsz_top:
	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
		return (EINVAL);
	}

	raddr = addr;
	rsize = size;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

retry:
		error = segop_setpagesize(seg, raddr, ssize, szc);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			AS_LOCK_EXIT(as);
			goto setpgsz_top;
		}

		if (error == ENOTSUP) {
			error = EINVAL;
			break;
		}

		if (wait && (error == EAGAIN)) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	This is not relevant for as_setpagesize()
			 *	because we cannot change the page size for
			 *	driver memory. The attempt to do so will
			 *	fail with a different error than EAGAIN so
			 *	there's no need to trigger as callbacks like
			 *	as_unmap, as_setprot or as_free would do.
			 */
			mutex_enter(&as->a_contents);
			if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0) {
					cv_broadcast(&as->a_cv);
				}
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as)) {
					cv_wait(&as->a_cv, &as->a_contents);
				}
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0. We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			mutex_exit(&as->a_contents);
			goto setpgsz_top;
		} else if (error != 0) {
			break;
		}
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
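
/*
 * Illustrative note, not part of the original source: both addr and size must
 * be aligned to the page size implied by szc.  Assuming szc selects a 2MB
 * large page:
 *
 *	size_t pgsz = page_get_pagesize(szc);		0x200000
 *	IS_P2ALIGNED((caddr_t)0x80000000, pgsz)		nonzero, accepted
 *	IS_P2ALIGNED((caddr_t)0x80001000, pgsz)		zero, fails with EINVAL
 */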

/*
 * as_iset3_default_lpsize() just calls segop_setpagesize() on all segments
 * in its chunk where s_szc is less than the szc we want to set.
 */
static int
as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    int *retry)
{
	struct seg *seg;
	size_t ssize;
	int error;

	ASSERT(AS_WRITE_HELD(as));

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		panic("as_iset3_default_lpsize: no seg");
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				panic("as_iset3_default_lpsize: as changed");
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

		if (szc > seg->s_szc) {
			error = segop_setpagesize(seg, raddr, ssize, szc);
			/* Only retry on EINVAL segments that have no vnode. */
			if (error == EINVAL) {
				vnode_t *vp = NULL;

				if ((segop_gettype(seg, raddr) & MAP_SHARED) &&
				    (segop_getvp(seg, raddr, &vp) != 0 ||
				    vp == NULL)) {
					*retry = 1;
				} else {
					*retry = 0;
				}
			}
			if (error) {
				return (error);
			}
		}
	}
	return (0);
}

/*
 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
 * pagesize on each segment in its range, but if any fails with EINVAL,
 * then it reduces the pagesizes to the next size in the bitmap and
 * retries as_iset3_default_lpsize(). The reason why the code retries
 * smaller allowed sizes on EINVAL is because (a) the anon offset may not
 * match the bigger sizes, and (b) it's hard to get this offset (to begin
 * with) to pass to map_pgszcvec().
 */
static int
as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    uint_t szcvec)
{
	int error;
	int retry;

	ASSERT(AS_WRITE_HELD(as));

	for (;;) {
		error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
		if (error == EINVAL && retry) {
			szcvec &= ~(1 << szc);
			if (szcvec <= 1) {
				return (EINVAL);
			}
			szc = highbit(szcvec) - 1;
		} else {
			return (error);
		}
	}
}
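
/*
 * Illustrative example, not part of the original source: suppose szcvec is
 * 0x16 (size codes 1, 2 and 4 allowed) and the attempt at szc = 4 fails with
 * a retryable EINVAL.  The loop above then steps down through:
 *
 *	szcvec &= ~(1 << 4)  ->  0x06,  szc = highbit(0x06) - 1 = 2
 *	szcvec &= ~(1 << 2)  ->  0x02,  szc = highbit(0x02) - 1 = 1
 *
 * and finally returns EINVAL once szcvec drops to 1 or below.
 */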

/*
 * as_iset1_default_lpsize() breaks its chunk into areas where existing
 * segments have a smaller szc than we want to set. For each such area,
 * it calls as_iset2_default_lpsize().
 */
static int
as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    uint_t szcvec)
{
	struct seg *seg;
	size_t ssize;
	caddr_t setaddr = raddr;
	size_t setsize = 0;
	int set;
	int error;

	ASSERT(AS_WRITE_HELD(as));

	seg = as_segat(as, raddr);
	if (seg == NULL) {
		panic("as_iset1_default_lpsize: no seg");
	}
	if (seg->s_szc < szc) {
		set = 1;
	} else {
		set = 0;
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				panic("as_iset1_default_lpsize: as changed");
			}
			if (seg->s_szc >= szc && set) {
				ASSERT(setsize != 0);
				error = as_iset2_default_lpsize(as,
				    setaddr, setsize, szc, szcvec);
				if (error) {
					return (error);
				}
				set = 0;
			} else if (seg->s_szc < szc && !set) {
				setaddr = raddr;
				setsize = 0;
				set = 1;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}
	}
	error = 0;
	if (set) {
		ASSERT(setsize != 0);
		error = as_iset2_default_lpsize(as, setaddr, setsize,
		    szc, szcvec);
	}
	return (error);
}

/*
 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
 * chunk to as_iset1_default_lpsize().
 */
static int
as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
    int type)
{
	int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
	    flags, rtype, 1);
	uint_t szc;
	uint_t nszc;
	int error;
	caddr_t a;
	caddr_t eaddr;
	size_t segsize;
	size_t pgsz;
	uint_t save_szcvec;

	ASSERT(AS_WRITE_HELD(as));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));

	szcvec &= ~1;
	if (szcvec <= 1) {	/* skip if base page size */
		return (0);
	}

	/* Get the pagesize of the first larger page size. */
	szc = lowbit(szcvec) - 1;
	pgsz = page_get_pagesize(szc);
	eaddr = addr + size;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
	eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

	save_szcvec = szcvec;
	szcvec >>= (szc + 1);
	nszc = szc;
	while (szcvec) {
		if ((szcvec & 0x1) == 0) {
			nszc++;
			szcvec >>= 1;
			continue;
		}
		nszc++;
		pgsz = page_get_pagesize(nszc);
		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		if (a != addr) {
			ASSERT(szc > 0);
			ASSERT(a < eaddr);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szc = nszc;
		szcvec >>= 1;
	}

	ASSERT(addr < eaddr);
	szcvec = save_szcvec;
	while (szcvec) {
		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		ASSERT(a >= addr);
		if (a != addr) {
			ASSERT(szc > 0);
			segsize = a - addr;
			error = as_iset1_default_lpsize(as, addr, segsize, szc,
			    save_szcvec);
			if (error) {
				return (error);
			}
			addr = a;
		}
		szcvec &= ~(1 << szc);
		if (szcvec) {
			szc = highbit(szcvec) - 1;
			pgsz = page_get_pagesize(szc);
		}
	}
	ASSERT(addr == eaddr);

	return (0);
}
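
/*
 * Illustrative example, not part of the original source, with hypothetical
 * sizes: for a range whose ends are only aligned to a smaller large page
 * size, the two loops above carve it into an ascending run of chunks, the
 * fully aligned middle, then a descending run, e.g.
 *
 *	| 64K-aligned head | 2MB-aligned middle | 64K-aligned tail |
 *	  as_iset1(szc64K)   as_iset1(szc2MB)     as_iset1(szc64K)
 *
 * so each chunk is handed the largest size code whose alignment it satisfies.
 */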

/*
 * Set the default large page size for the range. Called via memcntl with
 * page size set to 0. as_set_default_lpsize breaks the range down into
 * chunks with the same type/flags, ignores non-segvn segments, and passes
 * each chunk to as_iset_default_lpsize().
 */
int
as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg;
	caddr_t raddr;
	size_t rsize;
	size_t ssize;
	int rtype, rflags;
	int stype, sflags;
	int error;
	caddr_t setaddr;
	size_t setsize;
	int segvn;

	if (size == 0)
		return (0);

	AS_LOCK_ENTER(as, RW_WRITER);
again:
	error = 0;

	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr) {		/* check for wraparound */
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}
	if (seg->s_ops == &segvn_ops) {
		rtype = segop_gettype(seg, addr);
		rflags = rtype & (MAP_TEXT | MAP_INITDATA);
		rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
		segvn = 1;
	} else {
		segvn = 0;
	}
	setaddr = raddr;
	setsize = 0;

	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
		if (raddr >= (seg->s_base + seg->s_size)) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
			if (seg->s_ops == &segvn_ops) {
				stype = segop_gettype(seg, raddr);
				sflags = stype & (MAP_TEXT | MAP_INITDATA);
				stype &= (MAP_SHARED | MAP_PRIVATE);
				if (segvn && (rflags != sflags ||
				    rtype != stype)) {
					/*
					 * The next segment is also segvn but
					 * has different flags and/or type.
					 */
					ASSERT(setsize != 0);
					error = as_iset_default_lpsize(as,
					    setaddr, setsize, rflags, rtype);
					if (error) {
						break;
					}
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
				} else if (!segvn) {
					rflags = sflags;
					rtype = stype;
					setaddr = raddr;
					setsize = 0;
					segvn = 1;
				}
			} else if (segvn) {
				/* The next segment is not segvn. */
				ASSERT(setsize != 0);
				error = as_iset_default_lpsize(as,
				    setaddr, setsize, rflags, rtype);
				if (error) {
					break;
				}
				segvn = 0;
			}
		}
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}
	}
	if (error == 0 && segvn) {
		/* The last chunk when rsize == 0. */
		ASSERT(setsize != 0);
		error = as_iset_default_lpsize(as, setaddr, setsize,
		    rflags, rtype);
	}

	if (error == IE_RETRY) {
		goto again;
	} else if (error == IE_NOMEM) {
		error = EAGAIN;
	} else if (error == ENOTSUP) {
		error = EINVAL;
	} else if (error == EAGAIN) {
		mutex_enter(&as->a_contents);
		if (!AS_ISNOUNMAPWAIT(as)) {
			if (AS_ISUNMAPWAIT(as) == 0) {
				cv_broadcast(&as->a_cv);
			}
			AS_SETUNMAPWAIT(as);
			AS_LOCK_EXIT(as);
			while (AS_ISUNMAPWAIT(as)) {
				cv_wait(&as->a_cv, &as->a_contents);
			}
			mutex_exit(&as->a_contents);
			AS_LOCK_ENTER(as, RW_WRITER);
		} else {
			/*
			 * We may have raced with
			 * segvn_reclaim()/segspt_reclaim(). In this case
			 * clean nounmapwait flag and retry since softlockcnt
			 * in this segment may be already 0. We don't drop as
			 * writer lock so our number of retries without
			 * sleeping should be very small. See segvn_reclaim()
			 * for more comments.
			 */
			AS_CLRNOUNMAPWAIT(as);
			mutex_exit(&as->a_contents);
		}
		goto again;
	}

	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
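
/*
 * Illustrative call-graph summary, not part of the original source, of the
 * default large page size path driven by memcntl() with page size 0:
 *
 *	as_set_default_lpsize()		split range by segvn type/flags
 *	  as_iset_default_lpsize()	split chunk by map_pgszcvec() alignment
 *	    as_iset1_default_lpsize()	split by existing s_szc
 *	      as_iset2_default_lpsize()	retry smaller sizes on EINVAL
 *	        as_iset3_default_lpsize()	segop_setpagesize() per segment
 */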

/*
 * Setup all of the uninitialized watched pages that we can.
 */
void
as_setwatch(struct as *as)
{
	struct watched_page *pwp;
	struct seg *seg;
	caddr_t vaddr;
	uint_t prot;
	int err, retrycnt;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
		retrycnt = 0;
	retry:
		vaddr = pwp->wp_vaddr;
		if (pwp->wp_oprot != 0 ||	/* already set up */
		    (seg = as_segat(as, vaddr)) == NULL ||
		    segop_getprot(seg, vaddr, 0, &prot) != 0)
			continue;

		pwp->wp_oprot = prot;
		if (pwp->wp_read)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (pwp->wp_write)
			prot &= ~PROT_WRITE;
		if (pwp->wp_exec)
			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
			err = segop_setprot(seg, vaddr, PAGESIZE, prot);
			if (err == IE_RETRY) {
				pwp->wp_oprot = 0;
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_prot = prot;
	}
}

/*
 * Clear all of the watched pages in the address space.
 */
void
as_clearwatch(struct as *as)
{
	struct watched_page *pwp;
	struct seg *seg;
	caddr_t vaddr;
	uint_t prot;
	int err, retrycnt;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
		retrycnt = 0;
	retry:
		vaddr = pwp->wp_vaddr;
		if (pwp->wp_oprot == 0 ||	/* not set up */
		    (seg = as_segat(as, vaddr)) == NULL)
			continue;

		if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
			err = segop_setprot(seg, vaddr, PAGESIZE, prot);
			if (err == IE_RETRY) {
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_oprot = 0;
		pwp->wp_prot = 0;
	}
}

/*
 * Force a new setup for all the watched pages in the range.
 */
static void
as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct watched_page *pwp;
	struct watched_page tpw;
	caddr_t eaddr = addr + size;
	caddr_t vaddr;
	struct seg *seg;
	int err, retrycnt;
	uint_t wprot;
	avl_index_t where;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	ASSERT(AS_WRITE_HELD(as));

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
		retrycnt = 0;
		vaddr = pwp->wp_vaddr;

		wprot = prot;
		if (pwp->wp_read)
			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (pwp->wp_write)
			wprot &= ~PROT_WRITE;
		if (pwp->wp_exec)
			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
		if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
		retry:
			seg = as_segat(as, vaddr);
			if (seg == NULL) {
				panic("as_setwatchprot: no seg");
				/*NOTREACHED*/
			}
			err = segop_setprot(seg, vaddr, PAGESIZE, wprot);
			if (err == IE_RETRY) {
				ASSERT(retrycnt == 0);
				retrycnt++;
				goto retry;
			}
		}
		pwp->wp_oprot = prot;
		pwp->wp_prot = wprot;

		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
}

/*
 * Clear all of the watched pages in the range.
 */
static void
as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
{
	caddr_t eaddr = addr + size;
	struct watched_page *pwp;
	struct watched_page tpw;
	uint_t prot;
	struct seg *seg;
	int err, retrycnt;
	avl_index_t where;

	if (avl_numnodes(&as->a_wpage) == 0)
		return;

	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

	ASSERT(AS_WRITE_HELD(as));

	while (pwp != NULL && pwp->wp_vaddr < eaddr) {

		if ((prot = pwp->wp_oprot) != 0) {
			retrycnt = 0;

			if (prot != pwp->wp_prot) {
			retry:
				seg = as_segat(as, pwp->wp_vaddr);
				if (seg == NULL)
					continue;
				err = segop_setprot(seg, pwp->wp_vaddr,
				    PAGESIZE, prot);
				if (err == IE_RETRY) {
					ASSERT(retrycnt == 0);
					retrycnt++;
					goto retry;
				}
			}
			pwp->wp_oprot = 0;
			pwp->wp_prot = 0;
		}

		pwp = AVL_NEXT(&as->a_wpage, pwp);
	}
}

/*
 * Deliver the given siginfo to every process sharing this address space.
 */
void
as_signal_proc(struct as *as, k_siginfo_t *siginfo)
{
	struct proc *p;

	mutex_enter(&pidlock);
	for (p = practive; p; p = p->p_next) {
		if (p->p_as == as) {
			mutex_enter(&p->p_lock);
			if (p->p_as == as)
				sigaddq(p, NULL, siginfo, KM_NOSLEEP);
			mutex_exit(&p->p_lock);
		}
	}
	mutex_exit(&pidlock);
}

/*
 * return memory object ID
 */
int
as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
{
	struct seg *seg;
	int sts;

	AS_LOCK_ENTER(as, RW_READER);
	seg = as_segat(as, addr);
	if (seg == NULL) {
		AS_LOCK_EXIT(as);
		return (EFAULT);
	}

	sts = segop_getmemid(seg, addr, memidp);

	AS_LOCK_EXIT(as);
	return (sts);
}