/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * VM - shared or copy-on-write from a vnode/anonymous memory.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/dumphdr.h>
#include <vm/seg_vn.h>
#include <sys/project.h>
#include <sys/shm_impl.h>
/*
 * Private seg op routines.
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		    caddr_t addr, size_t len, enum fault_type type,
		    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		    uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
static int	segvn_capable(struct seg *seg, segcapability_t capable);
struct seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
	segvn_capable,
};
/*
 * Common zfod structures, provided as a shorthand for others to use.
 */
static segvn_crargs_t zfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
	PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
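/*
 * These argsp pointers are what callers typically hand to as_map()
 * together with segvn_create, e.g. as_map(as, addr, len, segvn_create,
 * zfod_argsp) for an ordinary zero-fill-on-demand user mapping.
 */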
#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */
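/*
 * vpgtob(n) converts a page count into the size in bytes of the
 * corresponding array of struct vpage entries; it is used below for
 * allocations such as kmem_alloc(vpgtob(seg_pages(seg)), KM_SLEEP).
 */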
size_t	segvn_pglock_comb_thrshld = (1UL << 16);	/* 64K */
size_t	segvn_pglock_comb_balign = (1UL << 16);		/* 64K */
uint_t	segvn_pglock_comb_bshift;
size_t	segvn_pglock_comb_palign;
static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    u_offset_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int);
static void	segvn_vpage(struct seg *);
static size_t	segvn_count_swap_by_vpages(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);
static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);

static int sameprot(struct seg *, caddr_t, size_t);

static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
    size_t, void *, u_offset_t);

static struct kmem_cache *segvn_cache;
static struct kmem_cache **segvn_szc_cache;
#ifdef VM_STATS
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t	fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */
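/*
 * The counters above are only bumped via VM_STAT_ADD() (e.g.
 * VM_STAT_ADD(segvnvmstats.demoterange[0]) in segvn_unmap() below) and
 * are compiled in only when VM_STATS is defined.
 */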
#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	    \
	if ((pgsz) > PAGESIZE) {					    \
		lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);	    \
		ASSERT(lpgaddr >= (seg)->s_base);			    \
		lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +	    \
		    (len)), pgsz);					    \
		ASSERT(lpgeaddr > lpgaddr);				    \
		ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);	    \
	} else {							    \
		lpgeaddr = lpgaddr = (addr);				    \
	}								    \
}
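/*
 * Worked example: with pgsz = 4M, addr = 0x40123000 and len = 0x2000,
 * CALC_LPG_REGION() yields lpgaddr = 0x40000000 and lpgeaddr =
 * 0x40400000, i.e. the smallest large-page aligned region that
 * contains [addr, addr + len).
 */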
/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *svd = buf;

	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
	svd->svn_trnext = svd->svn_trprev = NULL;
	return (0);
}

/*ARGSUSED1*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
	struct segvn_data *svd = buf;

	rw_destroy(&svd->lock);
	mutex_destroy(&svd->segfree_syncmtx);
}

/*ARGSUSED*/
static int
svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	bzero(buf, sizeof (svntr_t));
	return (0);
}
/*
 * Patching this variable to non-zero allows the system to run with
 * stacks marked as "not executable".  It's a bit of a kludge, but is
 * provided as a tweakable for platforms that export those ABIs
 * (e.g. sparc V8) that have executable stacks enabled by default.
 * There are also some restrictions for platforms that don't actually
 * implement 'noexec' protections.
 *
 * Once enabled, the system is (therefore) unable to provide a fully
 * ABI-compliant execution environment, though practically speaking,
 * most everything works.  The exceptions are generally some interpreters
 * and debuggers that create executable code on the stack and jump
 * into it (without explicitly mprotecting the address range to include
 * PROT_EXEC).
 *
 * One important class of applications that are disabled are those
 * that have been transformed into malicious agents using one of the
 * numerous "buffer overflow" attacks.  See 4007890.
 */
int noexec_user_stack = 0;
int noexec_user_stack_log = 1;

int segvn_lpg_disable = 0;
uint_t segvn_maxpgszc = 0;
ulong_t segvn_vmpss_clrszc_cnt;
ulong_t segvn_vmpss_clrszc_err;
ulong_t segvn_fltvnpages_clrszc_cnt;
ulong_t segvn_fltvnpages_clrszc_err;
ulong_t segvn_setpgsz_align_err;
ulong_t segvn_setpgsz_anon_align_err;
ulong_t segvn_setpgsz_getattr_err;
ulong_t segvn_setpgsz_eof_err;
ulong_t segvn_faultvnmpss_align_err1;
ulong_t segvn_faultvnmpss_align_err2;
ulong_t segvn_faultvnmpss_align_err3;
ulong_t segvn_faultvnmpss_align_err4;
ulong_t segvn_faultvnmpss_align_err5;
ulong_t	segvn_vmpss_pageio_deadlk_err;

int segvn_use_regions = 1;
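/*
 * segvn_use_regions is cleared in segvn_init() if the HAT does not
 * advertise HAT_SHARED_REGIONS support (see the hat_supported() check
 * there).
 */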
/*
 * Segvn supports text replication optimization for NUMA platforms. Text
 * replicas are represented by anon maps (amp). There's one amp per text file
 * region per lgroup. A process chooses the amp for each of its text mappings
 * based on the lgroup assignment of its main thread (t_tid = 1). All
 * processes that want a replica on a particular lgroup for the same text file
 * mapping share the same amp. amp's are looked up in svntr_hashtab hash table
 * with vp,off,size,szc used as a key. Text replication segments are read only
 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by
 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode
 * pages. Replication amp is assigned to a segment when it gets its first
 * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread
 * rechecks periodically if the process still maps an amp local to the main
 * thread. If not, the async thread forces the process to remap to an amp in
 * the new home lgroup of the main thread. Current text replication
 * implementation only provides the benefit to workloads that do most of their
 * work in the main thread of a process or all the threads of a process run in
 * the same lgroup. To extend text replication benefit to different types of
 * multithreaded workloads further work would be needed in the hat layer to
 * allow the same virtual address in the same hat to simultaneously map
 * different physical addresses (i.e. page table replication would be needed
 * for x86).
 *
 * amp pages are used instead of vnode pages as long as segment has a very
 * simple life cycle.  It's created via segvn_create(), handles S_EXEC
 * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
 * happens such as protection is changed, real COW fault happens, pagesize is
 * changed, MC_LOCK is requested or segment is partially unmapped we turn off
 * text replication by converting the segment back to vnode only segment
 * (unmap segment's address range and set svd->amp to NULL).
 *
 * The original file can be changed after amp is inserted into
 * svntr_hashtab. Processes that are launched after the file is already
 * changed can't use the replicas created prior to the file change. To
 * implement this functionality hash entries are timestamped. Replicas can
 * only be used if current file modification time is the same as the timestamp
 * saved when hash entry was created. However just timestamps alone are not
 * sufficient to detect file modification via mmap(MAP_SHARED) mappings. We
 * deal with file changes via MAP_SHARED mappings differently. When writable
 * MAP_SHARED mappings are created to vnodes marked as executable we mark all
 * existing replicas for this vnode as not usable for future text
 * mappings. And we don't create new replicas for files that currently have
 * potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is
 * true).
 */
#define	SEGVN_TEXTREPL_MAXBYTES_FACTOR	(20)

size_t	segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;

static ulong_t			svntr_hashtab_sz = 512;
static svntr_bucket_t		*svntr_hashtab = NULL;
static struct kmem_cache	*svntr_cache;
static svntr_stats_t		*segvn_textrepl_stats;
static ksema_t			segvn_trasync_sem;

int	segvn_disable_textrepl = 1;
size_t	textrepl_size_thresh = (size_t)-1;
size_t	segvn_textrepl_bytes = 0;
size_t	segvn_textrepl_max_bytes = 0;
clock_t	segvn_update_textrepl_interval = 0;
int	segvn_update_tr_time = 10;
int	segvn_disable_textrepl_update = 0;

static void segvn_textrepl(struct seg *);
static void segvn_textunrepl(struct seg *, int);
static void segvn_inval_trcache(vnode_t *);
static void segvn_trasync_thread(void);
static void segvn_trupdate_wakeup(void *);
static void segvn_trupdate(void);
static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
    ulong_t);
/*
 * Initialize segvn data structures
 */
void
segvn_init(void)
{
	uint_t maxszc;
	uint_t szc;
	size_t pgsz;

	segvn_cache = kmem_cache_create("segvn_cache",
	    sizeof (struct segvn_data), 0,
	    segvn_cache_constructor, segvn_cache_destructor, NULL,
	    NULL, NULL, 0);

	if (segvn_lpg_disable == 0) {
		szc = maxszc = page_num_pagesizes() - 1;
		if (maxszc == 0) {
			segvn_lpg_disable = 1;
		}
		if (page_get_pagesize(0) != PAGESIZE) {
			panic("segvn_init: bad szc 0");
			/*NOTREACHED*/
		}
		while (szc != 0) {
			pgsz = page_get_pagesize(szc);
			if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
				panic("segvn_init: bad szc %d", szc);
				/*NOTREACHED*/
			}
			szc--;
		}
		if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
			segvn_maxpgszc = maxszc;
	}

	if (segvn_maxpgszc) {
		segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
		    (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
		    KM_SLEEP);
	}

	for (szc = 1; szc <= segvn_maxpgszc; szc++) {
		char	str[32];

		(void) sprintf(str, "segvn_szc_cache%d", szc);
		segvn_szc_cache[szc] = kmem_cache_create(str,
		    page_get_pagecnt(szc) * sizeof (page_t *), 0,
		    NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
	}

	if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
		segvn_use_regions = 0;

	/*
	 * For now shared regions and text replication segvn support
	 * are mutually exclusive. This is acceptable because
	 * currently significant benefit from text replication was
	 * only observed on AMD64 NUMA platforms (due to relatively
	 * small L2$ size) and currently we don't support shared
	 * regions on x86.
	 */
	if (segvn_use_regions && !segvn_disable_textrepl) {
		segvn_disable_textrepl = 1;
	}

	if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
	    !segvn_disable_textrepl) {
		ulong_t i;
		size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);

		svntr_cache = kmem_cache_create("svntr_cache",
		    sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
		    NULL, NULL, NULL, 0);
		svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
		for (i = 0; i < svntr_hashtab_sz; i++) {
			mutex_init(&svntr_hashtab[i].tr_lock, NULL,
			    MUTEX_DEFAULT, NULL);
		}
		segvn_textrepl_max_bytes = ptob(physmem) /
		    segvn_textrepl_max_bytes_factor;
		segvn_textrepl_stats = kmem_zalloc(NCPU *
		    sizeof (svntr_stats_t), KM_SLEEP);
		sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
		(void) thread_create(NULL, 0, segvn_trasync_thread,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
	}

	if (!ISP2(segvn_pglock_comb_balign) ||
	    segvn_pglock_comb_balign < PAGESIZE) {
		segvn_pglock_comb_balign = 1UL << 16; /* 64K */
	}
	segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
	segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
}
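/*
 * With the default 64K combining alignment, highbit(1UL << 16) is 17,
 * so segvn_pglock_comb_bshift ends up as 16 and segvn_pglock_comb_palign
 * as the 64K boundary expressed in pages.
 */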
#define	SEGVN_PAGEIO	((void *)0x1)
#define	SEGVN_NOPAGEIO	((void *)0x2)
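/*
 * SEGVN_PAGEIO and SEGVN_NOPAGEIO are the two values cached in
 * vp->v_mpssdata by segvn_setvnode_mpss() below: a dummy VOP_PAGEIO()
 * call probes whether the vnode's filesystem supports pageio, and the
 * answer is recorded once per vnode life so it never has to be probed
 * again.
 */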
static void
segvn_setvnode_mpss(vnode_t *vp)
{
	int err;

	ASSERT(vp->v_mpssdata == NULL ||
	    vp->v_mpssdata == SEGVN_PAGEIO ||
	    vp->v_mpssdata == SEGVN_NOPAGEIO);

	if (vp->v_mpssdata == NULL) {
		if (vn_vmpss_usepageio(vp)) {
			err = VOP_PAGEIO(vp, (page_t *)NULL,
			    (u_offset_t)0, 0, 0, CRED(), NULL);
		} else {
			err = ENOSYS;
		}
		/*
		 * set v_mpssdata just once per vnode life
		 * so that it never changes.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_mpssdata == NULL) {
			if (err == EINVAL) {
				vp->v_mpssdata = SEGVN_PAGEIO;
			} else {
				vp->v_mpssdata = SEGVN_NOPAGEIO;
			}
		}
		mutex_exit(&vp->v_lock);
	}
}
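/*
 * Create a new segment backed by a vnode and/or anonymous memory.
 * argsp points at a struct segvn_crargs describing the mapping; the
 * caller must hold the address space write-locked (asserted below).
 */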
int
segvn_create(struct seg *seg, void *argsp)
{
	struct segvn_crargs *a = (struct segvn_crargs *)argsp;
	struct segvn_data *svd;
	size_t swresv = 0;
	struct cred *cred;
	struct anon_map *amp;
	int error = 0;
	size_t pgsz;
	lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;
	int use_rgn = 0;
	int trok = 0;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));

	if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
		panic("segvn_create type");
		/*NOTREACHED*/
	}

	/*
	 * Check arguments.  If a shared anon structure is given then
	 * it is illegal to also specify a vp.
	 */
	if (a->amp != NULL && a->vp != NULL) {
		panic("segvn_create anon_map");
		/*NOTREACHED*/
	}

	if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
	    a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
	    segvn_use_regions) {
		use_rgn = 1;
	}

	/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
	if (a->type == MAP_SHARED)
		a->flags &= ~MAP_NORESERVE;

	if (a->szc != 0) {
		if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
		    (a->amp != NULL && a->type == MAP_PRIVATE) ||
		    (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
			a->szc = 0;
		} else {
			if (a->szc > segvn_maxpgszc)
				a->szc = segvn_maxpgszc;
			pgsz = page_get_pagesize(a->szc);
			if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
			    !IS_P2ALIGNED(seg->s_size, pgsz)) {
				a->szc = 0;
			} else if (a->vp != NULL) {
				if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
					/*
					 * hat_page_demote() is not supported
					 * on these pages.
					 */
					a->szc = 0;
				} else if (map_addr_vacalign_check(seg->s_base,
				    a->offset & PAGEMASK)) {
					a->szc = 0;
				}
			} else if (a->amp != NULL) {
				pgcnt_t anum = btopr(a->offset);
				pgcnt_t pgcnt = page_get_pagecnt(a->szc);
				if (!IS_P2ALIGNED(anum, pgcnt)) {
					a->szc = 0;
				}
			}
		}
	}

	/*
	 * If segment may need private pages, reserve them now.
	 */
	if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
		if (anon_resv_zone(seg->s_size,
		    seg->s_as->a_proc->p_zone) == 0)
			return (EAGAIN);
		swresv = seg->s_size;
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, swresv, 1);
	}

	/*
	 * Reserve any mapping structures that may be required.
	 *
	 * Don't do it for segments that may use regions. It's currently a
	 * noop in the hat implementations anyway.
	 */
	if (!use_rgn) {
		hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
	}

	crhold(cred = CRED());

	/* Inform the vnode of the new mapping */
	if (a->vp != NULL) {
		error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
		    seg->s_as, seg->s_base, seg->s_size, a->prot,
		    a->maxprot, a->type, cred, NULL);
		if (error) {
			if (swresv != 0) {
				anon_unresv_zone(swresv,
				    seg->s_as->a_proc->p_zone);
				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
				    "anon proc:%p %lu %u", seg, swresv, 0);
			}
			crfree(cred);
			if (!use_rgn) {
				hat_unload(seg->s_as->a_hat, seg->s_base,
				    seg->s_size, HAT_UNLOAD_UNMAP);
			}
			return (error);
		}
		/*
		 * svntr_hashtab will be NULL if we support shared regions.
		 */
		trok = ((a->flags & MAP_TEXT) &&
		    (seg->s_size > textrepl_size_thresh ||
		    (a->flags & _MAP_TEXTREPL)) &&
		    lgrp_optimizations() && svntr_hashtab != NULL &&
		    a->type == MAP_PRIVATE && swresv == 0 &&
		    !(a->flags & MAP_NORESERVE) &&
		    seg->s_as != &kas && a->vp->v_type == VREG);

		ASSERT(!trok || !use_rgn);
	}

	/*
	 * MAP_NORESERVE mappings don't count towards the VSZ of a process
	 * until we fault the pages in.
	 */
	if ((a->vp == NULL || a->vp->v_type != VREG) &&
	    a->flags & MAP_NORESERVE) {
		seg->s_as->a_resvsize -= seg->s_size;
	}

	/*
	 * If more than one segment in the address space, and they're adjacent
	 * virtually, try to concatenate them.  Don't concatenate if an
	 * explicit anon_map structure was supplied (e.g., SystemV shared
	 * memory) or if we'll use text replication for this segment.
	 */
	if (a->amp == NULL && !use_rgn && !trok) {
		struct seg *pseg, *nseg;
		struct segvn_data *psvd, *nsvd;
		lgrp_mem_policy_t ppolicy, npolicy;
		uint_t	lgrp_mem_policy_flags = 0;
		extern lgrp_mem_policy_t lgrp_mem_default_policy;

		/*
		 * Memory policy flags (lgrp_mem_policy_flags) is valid when
		 * extending stack/heap segments.
		 */
		if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
		    !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
			lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
		} else {
			/*
			 * Get policy when not extending it from another segment
			 */
			mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
		}

		/*
		 * First, try to concatenate the previous and new segments
		 */
		pseg = AS_SEGPREV(seg->s_as, seg);
		if (pseg != NULL &&
		    pseg->s_base + pseg->s_size == seg->s_base &&
		    pseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from previous segment.
			 * When extension is specified (e.g. for heap) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			psvd = (struct segvn_data *)pseg->s_data;
			ppolicy = psvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_UP) {
				if (ppolicy != lgrp_mem_default_policy) {
					mpolicy = ppolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    pseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == ppolicy &&
			    (pseg->s_size + seg->s_size <=
			    segvn_comb_thrshld || psvd->amp == NULL) &&
			    segvn_extend_prev(pseg, seg, a, swresv) == 0) {
				/*
				 * success! now try to concatenate
				 * with following seg
				 */
				crfree(cred);
				nseg = AS_SEGNEXT(pseg->s_as, pseg);
				if (nseg != NULL && nseg != pseg &&
				    nseg->s_ops == &segvn_ops &&
				    pseg->s_base + pseg->s_size ==
				    nseg->s_base)
					(void) segvn_concat(pseg, nseg, 0);
				ASSERT(pseg->s_szc == 0 ||
				    (a->szc == pseg->s_szc &&
				    IS_P2ALIGNED(pseg->s_base, pgsz) &&
				    IS_P2ALIGNED(pseg->s_size, pgsz)));
				return (0);
			}
		}

		/*
		 * Failed, so try to concatenate with following seg
		 */
		nseg = AS_SEGNEXT(seg->s_as, seg);
		if (nseg != NULL &&
		    seg->s_base + seg->s_size == nseg->s_base &&
		    nseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from next segment.
			 * When extension is specified (e.g. for stack) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			nsvd = (struct segvn_data *)nseg->s_data;
			npolicy = nsvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_DOWN) {
				if (npolicy != lgrp_mem_default_policy) {
					mpolicy = npolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    nseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == npolicy &&
			    segvn_extend_next(seg, nseg, a, swresv) == 0) {
				crfree(cred);
				ASSERT(nseg->s_szc == 0 ||
				    (a->szc == nseg->s_szc &&
				    IS_P2ALIGNED(nseg->s_base, pgsz) &&
				    IS_P2ALIGNED(nseg->s_size, pgsz)));
				return (0);
			}
		}
	}

	if (a->vp != NULL) {
		if (a->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, a->vp);
	}
	svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	seg->s_ops = &segvn_ops;
	seg->s_data = (void *)svd;
	svd->seg = seg;
	svd->vp = a->vp;
	svd->cred = cred;

	/*
	 * Anonymous mappings have no backing file so the offset is meaningless.
	 */
	svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
	svd->prot = a->prot;
	svd->maxprot = a->maxprot;
	svd->pageprot = 0;
	svd->type = a->type;
	svd->pageadvice = 0;
	svd->advice = MADV_NORMAL;
	svd->pageswap = 0;
	svd->flags = (ushort_t)a->flags;
	svd->softlockcnt = 0;
	svd->softlockcnt_sbase = 0;
	svd->softlockcnt_send = 0;
	svd->rcookie = HAT_INVALID_REGION_COOKIE;

	if (a->szc != 0 && a->vp != NULL) {
		segvn_setvnode_mpss(a->vp);
	}
	if (svd->type == MAP_SHARED && svd->vp != NULL &&
	    (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
		segvn_inval_trcache(svd->vp);
	}

	amp = a->amp;
	if ((svd->amp = amp) == NULL) {
		svd->anon_index = 0;
		if (svd->type == MAP_SHARED) {
			svd->swresv = 0;
			/*
			 * Shared mappings to a vp need no other setup.
			 * If we have a shared mapping to an anon_map object
			 * which hasn't been allocated yet, allocate the
			 * struct now so that it will be properly shared
			 * by remembering the swap reservation there.
			 */
			if (a->vp == NULL) {
				svd->amp = anonmap_alloc(seg->s_size, swresv,
				    ANON_SLEEP);
				svd->amp->a_szc = seg->s_szc;
			}
		} else {
			/*
			 * Private mapping (with or without a vp).
			 * Allocate anon_map when needed.
			 */
			svd->swresv = swresv;
		}
	} else {
		pgcnt_t anon_num;

		/*
		 * Mapping to an existing anon_map structure without a vp.
		 * For now we will insure that the segment size isn't larger
		 * than the size - offset gives us.  Later on we may wish to
		 * have the anon array dynamically allocated itself so that
		 * we don't always have to allocate all the anon pointer slots.
		 * This of course involves adding extra code to check that we
		 * aren't trying to use an anon pointer slot beyond the end
		 * of the currently allocated anon array.
		 */
		if ((amp->size - a->offset) < seg->s_size) {
			panic("segvn_create anon_map size");
			/*NOTREACHED*/
		}

		anon_num = btopr(a->offset);

		if (a->type == MAP_SHARED) {
			/*
			 * SHARED mapping to a given anon_map.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			if (a->szc > amp->a_szc) {
				amp->a_szc = a->szc;
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
			svd->anon_index = anon_num;
			svd->swresv = 0;
		} else {
			/*
			 * PRIVATE mapping to a given anon_map.
			 * Make sure that all the needed anon
			 * structures are created (so that we will
			 * share the underlying pages if nothing
			 * is written by this mapping) and then
			 * duplicate the anon array as is done
			 * when a privately mapped segment is dup'ed.
			 */
			struct anon *ap;
			caddr_t addr;
			caddr_t eaddr;
			ulong_t anon_idx;
			int hat_flag = HAT_LOAD;

			if (svd->flags & MAP_TEXT) {
				hat_flag |= HAT_LOAD_TEXT;
			}

			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
			svd->amp->a_szc = seg->s_szc;
			svd->anon_index = 0;
			svd->swresv = swresv;

			/*
			 * Prevent 2 threads from allocating anon
			 * slots simultaneously.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			eaddr = seg->s_base + seg->s_size;

			for (anon_idx = anon_num, addr = seg->s_base;
			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
				page_t *pp;

				if ((ap = anon_get_ptr(amp->ahp,
				    anon_idx)) != NULL)
					continue;

				/*
				 * Allocate the anon struct now.
				 * Might as well load up translation
				 * to the page while we're at it...
				 */
				pp = anon_zero(seg, addr, &ap, cred);
				if (ap == NULL || pp == NULL) {
					panic("segvn_create anon_zero");
					/*NOTREACHED*/
				}

				/*
				 * Re-acquire the anon_map lock and
				 * initialize the anon array entry.
				 */
				ASSERT(anon_get_ptr(amp->ahp,
				    anon_idx) == NULL);
				(void) anon_set_ptr(amp->ahp, anon_idx, ap,
				    ANON_SLEEP);

				ASSERT(seg->s_szc == 0);
				ASSERT(!IS_VMODSORT(pp->p_vnode));

				ASSERT(use_rgn == 0);
				hat_memload(seg->s_as->a_hat, addr, pp,
				    svd->prot & ~PROT_WRITE, hat_flag);

				page_unlock(pp);
			}
			ASSERT(seg->s_szc == 0);
			anon_dup(amp->ahp, anon_num, svd->amp->ahp,
			    0, seg->s_size);
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Set default memory allocation policy for segment
	 *
	 * Always set policy for private memory at least for initialization
	 * even if this is a shared memory segment
	 */
	(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);

	if (svd->type == MAP_SHARED)
		(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
		    svd->vp, svd->offset, seg->s_size);

	if (use_rgn) {
		ASSERT(svd->amp == NULL);
		svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
		    seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
		    (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
		    HAT_REGION_TEXT);
	}

	ASSERT(!trok || !(svd->prot & PROT_WRITE));
	svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;

	return (0);
}
/*
 * Concatenate two existing segments, if possible.
 * Return 0 on success, -1 if two segments are not compatible
 * or -2 on memory allocation failure.
 * If amp_cat == 1 then try and concat segments with anon maps
 */
static int
segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
{
	struct segvn_data *svd1 = seg1->s_data;
	struct segvn_data *svd2 = seg2->s_data;
	struct anon_map *amp1 = svd1->amp;
	struct anon_map *amp2 = svd2->amp;
	struct vpage *vpage1 = svd1->vpage;
	struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
	size_t size, nvpsize;
	pgcnt_t npages1, npages2;

	ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
	ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
	ASSERT(seg1->s_ops == seg2->s_ops);

	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) ||
	    HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
		return (-1);
	}

	/* both segments exist, try to merge them */
#define	incompat(x)	(svd1->x != svd2->x)
	if (incompat(vp) || incompat(maxprot) ||
	    (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
	    (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
	    incompat(type) || incompat(cred) || incompat(flags) ||
	    seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
	    (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0)
		return (-1);
#undef	incompat

	/*
	 * vp == NULL implies zfod, offset doesn't matter
	 */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != svd2->offset) {
		return (-1);
	}

	/*
	 * Don't concatenate if either segment uses text replication.
	 */
	if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	/*
	 * Fail early if we're not supposed to concatenate
	 * segments with non NULL amp.
	 */
	if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
		return (-1);
	}

	if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
		if (amp1 != amp2) {
			return (-1);
		}
		if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
		    svd2->anon_index) {
			return (-1);
		}
		ASSERT(amp1 == NULL || amp1->refcnt >= 2);
	}

	/*
	 * If either seg has vpages, create a new merged vpage array.
	 */
	if (vpage1 != NULL || vpage2 != NULL) {
		struct vpage *vp, *evp;

		npages1 = seg_pages(seg1);
		npages2 = seg_pages(seg2);
		nvpsize = vpgtob(npages1 + npages2);

		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
			return (-2);
		}

		if (vpage1 != NULL) {
			bcopy(vpage1, nvpage, vpgtob(npages1));
		} else {
			evp = nvpage + npages1;
			for (vp = nvpage; vp < evp; vp++) {
				VPP_SETPROT(vp, svd1->prot);
				VPP_SETADVICE(vp, svd1->advice);
			}
		}

		if (vpage2 != NULL) {
			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
		} else {
			evp = nvpage + npages1 + npages2;
			for (vp = nvpage + npages1; vp < evp; vp++) {
				VPP_SETPROT(vp, svd2->prot);
				VPP_SETADVICE(vp, svd2->advice);
			}
		}

		if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) {
			ASSERT(svd1->swresv == seg1->s_size);
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			evp = nvpage + npages1;
			for (vp = nvpage; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}

		if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) {
			ASSERT(svd2->swresv == seg2->s_size);
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			vp = nvpage + npages1;
			evp = vp + npages2;
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT((vpage1 != NULL || vpage2 != NULL) ||
	    (svd1->pageswap == 0 && svd2->pageswap == 0));

	/*
	 * If either segment has private pages, create a new merged anon
	 * array. If mergeing shared anon segments just decrement anon map's
	 * refcnt.
	 */
	if (amp1 != NULL && svd1->type == MAP_SHARED) {
		ASSERT(amp1 == amp2 && svd1->vp == NULL);
		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		ASSERT(amp1->refcnt >= 2);
		amp1->refcnt--;
		ANON_LOCK_EXIT(&amp1->a_rwlock);
		svd2->amp = NULL;
	} else if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize;

		ASSERT(svd1->type == MAP_PRIVATE);

		asize = seg1->s_size + seg2->s_size;
		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
			 */
			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
			ASSERT(amp1->refcnt == 1);
			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp1->a_rwlock);
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp2 != NULL) {
			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
			ASSERT(amp2->refcnt == 1);
			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
			    nahp, btop(seg1->s_size), btop(seg2->s_size),
			    ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				if (amp1 != NULL) {
					ANON_LOCK_EXIT(&amp1->a_rwlock);
				}
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp1 != NULL) {
			namp = amp1;
			anon_release(amp1->ahp, btop(amp1->size));
		}
		if (amp2 != NULL) {
			if (namp == NULL) {
				ASSERT(amp1 == NULL);
				namp = amp2;
				anon_release(amp2->ahp, btop(amp2->size));
			} else {
				amp2->refcnt--;
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				anonmap_free(amp2);
			}
			svd2->amp = NULL; /* needed for seg_free */
		}
		namp->ahp = nahp;
		namp->size = asize;
		svd1->amp = namp;
		svd1->anon_index = 0;
		ANON_LOCK_EXIT(&namp->a_rwlock);
	}
	/*
	 * Now free the old vpage structures.
	 */
	if (nvpage != NULL) {
		if (vpage1 != NULL) {
			kmem_free(vpage1, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			kmem_free(vpage2, vpgtob(npages2));
		}
		if (svd2->pageprot) {
			svd1->pageprot = 1;
		}
		if (svd2->pageadvice) {
			svd1->pageadvice = 1;
		}
		if (svd2->pageswap) {
			svd1->pageswap = 1;
		}
		svd1->vpage = nvpage;
	}

	/* all looks ok, merge segments */
	svd1->swresv += svd2->swresv;
	svd2->swresv = 0;	/* so seg_free doesn't release swap space */
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	return (0);
}
/*
 * Extend the previous segment (seg1) to include the
 * new segment (seg2 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_prev(seg1, seg2, a, swresv)
	struct seg *seg1, *seg2;
	struct segvn_crargs *a;
	size_t swresv;
{
	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
	size_t size;
	struct anon_map *amp1;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));

	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
		return (-1);
	}

	/* second segment is new, try to extend first */
	/* XXX - should also check cred */
	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
	    svd1->type != a->type || svd1->flags != a->flags ||
	    seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
		return (-1);

	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
		return (-1);

	if (svd1->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	amp1 = svd1->amp;
	if (amp1) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 *
		 * Can't grow a MAP_SHARED segment with an anonmap because
		 * there may be existing anon slots where we want to extend
		 * the segment and we wouldn't know what to do with them
		 * (e.g., for tmpfs right thing is to just leave them there,
		 * for /dev/zero they should be cleared out).
		 */
		if (svd1->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		if (amp1->refcnt > 1) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		amp1->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp1->a_rwlock);
	}
	if (svd1->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL)
			return (-1);
		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
		svd1->vpage = new_vpage;

		vp = new_vpage + seg_pages(seg1);
		evp = vp + seg_pages(seg2);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
		if (svd1->pageswap && swresv) {
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(swresv == seg2->s_size);
			vp = new_vpage + seg_pages(seg1);
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT(svd1->vpage != NULL || svd1->pageswap == 0);
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	svd1->swresv += swresv;
	if (svd1->pageprot && (a->prot & PROT_WRITE) &&
	    svd1->type == MAP_SHARED && svd1->vp != NULL &&
	    (svd1->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
		segvn_inval_trcache(svd1->vp);
	}
	return (0);
}
/*
 * Extend the next segment (seg2) to include the
 * new segment (seg1 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_next(
	struct seg *seg1,
	struct seg *seg2,
	struct segvn_crargs *a,
	size_t swresv)
{
	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
	size_t size;
	struct anon_map *amp2;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));

	if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
		return (-1);
	}

	/* first segment is new, try to extend second */
	/* XXX - should also check cred */
	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
	    svd2->type != a->type || svd2->flags != a->flags ||
	    seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
		return (-1);

	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd2->vp != NULL &&
	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
		return (-1);

	if (svd2->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	amp2 = svd2->amp;
	if (amp2) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 *
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 */
		if (svd2->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
		if (amp2->refcnt > 1) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
		    btop(seg2->s_size), btop(seg1->s_size),
		    ANON_NOSLEEP | ANON_GROWDOWN);
		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		amp2->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp2->a_rwlock);
	}
	if (svd2->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL) {
			/* Not merging segments so adjust anon_index back */
			if (amp2)
				svd2->anon_index += seg_pages(seg1);
			return (-1);
		}
		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
		    vpgtob(seg_pages(seg2)));
		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
		svd2->vpage = new_vpage;

		vp = new_vpage;
		evp = vp + seg_pages(seg1);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
		if (svd2->pageswap && swresv) {
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			ASSERT(swresv == seg1->s_size);
			vp = new_vpage;
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT(svd2->vpage != NULL || svd2->pageswap == 0);
	size = seg1->s_size;
	seg_free(seg1);
	seg2->s_size += size;
	seg2->s_base -= size;
	svd2->offset -= size;
	svd2->swresv += swresv;
	if (svd2->pageprot && (a->prot & PROT_WRITE) &&
	    svd2->type == MAP_SHARED && svd2->vp != NULL &&
	    (svd2->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
		segvn_inval_trcache(svd2->vp);
	}
	return (0);
}
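/*
 * Duplicate a segment into a child address space during fork.  Swap is
 * re-reserved up front for segments that hold a reservation, and COW
 * sharing is broken for softlocked anon pages so the parent's softlocks
 * cannot be lost to a copy-on-write fault (see the comments in the body
 * below).
 */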
static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *newsvd;
	pgcnt_t npages = seg_pages(seg);
	int error = 0;
	size_t len;
	struct anon_map *amp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(newseg->s_as->a_proc->p_parent == curproc);

	/*
	 * If segment has anon reserved, reserve more for the new seg.
	 * For a MAP_NORESERVE segment swresv will be a count of all the
	 * allocated anon slots; thus we reserve for the child as many slots
	 * as the parent has allocated.  This semantic prevents the child or
	 * parent from dieing during a copy-on-write fault caused by trying
	 * to write a shared pre-existing anon page.
	 */
	if ((len = svd->swresv) != 0) {
		if (anon_resv(svd->swresv) == 0)
			return (ENOMEM);

		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	newseg->s_ops = &segvn_ops;
	newseg->s_data = (void *)newsvd;
	newseg->s_szc = seg->s_szc;

	newsvd->seg = newseg;
	if ((newsvd->vp = svd->vp) != NULL) {
		VN_HOLD(svd->vp);
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, svd->vp);
	}
	newsvd->offset = svd->offset;
	newsvd->prot = svd->prot;
	newsvd->maxprot = svd->maxprot;
	newsvd->pageprot = svd->pageprot;
	newsvd->type = svd->type;
	newsvd->cred = svd->cred;
	crhold(newsvd->cred);
	newsvd->advice = svd->advice;
	newsvd->pageadvice = svd->pageadvice;
	newsvd->swresv = svd->swresv;
	newsvd->pageswap = svd->pageswap;
	newsvd->flags = svd->flags;
	newsvd->softlockcnt = 0;
	newsvd->softlockcnt_sbase = 0;
	newsvd->softlockcnt_send = 0;
	newsvd->policy_info = svd->policy_info;
	newsvd->rcookie = HAT_INVALID_REGION_COOKIE;

	if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
		/*
		 * Not attaching to a shared anon object.
		 */
		ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
		    svd->tr_state == SEGVN_TR_OFF);
		if (svd->tr_state == SEGVN_TR_ON) {
			ASSERT(newsvd->vp != NULL && amp != NULL);
			newsvd->tr_state = SEGVN_TR_INIT;
		} else {
			newsvd->tr_state = svd->tr_state;
		}
		newsvd->amp = NULL;
		newsvd->anon_index = 0;
	} else {
		/* regions for now are only used on pure vnode segments */
		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		newsvd->tr_state = SEGVN_TR_OFF;
		if (svd->type == MAP_SHARED) {
			newsvd->amp = amp;
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			newsvd->anon_index = svd->anon_index;
		} else {
			/*
			 * Allocate and initialize new anon_map structure.
			 */
			newsvd->amp = anonmap_alloc(newseg->s_size, 0,
			    ANON_SLEEP);
			newsvd->amp->a_szc = newseg->s_szc;
			newsvd->anon_index = 0;

			/*
			 * We don't have to acquire the anon_map lock
			 * for the new segment (since it belongs to an
			 * address space that is still not associated
			 * with any process), or the segment in the old
			 * address space (since all threads in it
			 * are stopped while duplicating the address space).
			 *
			 * The goal of the following code is to make sure that
			 * softlocked pages do not end up as copy on write
			 * pages.  This would cause problems where one
			 * thread writes to a page that is COW and a different
			 * thread in the same process has softlocked it.  The
			 * softlock lock would move away from this process
			 * because the write would cause this process to get
			 * a copy (without the softlock).
			 *
			 * The strategy here is to just break the
			 * sharing on pages that could possibly be
			 * softlocked.
			 */
			if (svd->softlockcnt) {
				struct anon *ap, *newap;
				caddr_t addr;
				uint_t vpprot;
				uint_t prot;
				page_t *anon_pl[1 + 1], *pp;
				pgcnt_t i;
				ulong_t old_idx = svd->anon_index;
				ulong_t new_idx = 0;

				/*
				 * The softlock count might be non zero
				 * because some pages are still stuck in the
				 * cache for lazy reclaim. Flush the cache
				 * now. This should drop the count to zero.
				 * [or there is really I/O going on to these
				 * pages]. Note, we have the writers lock so
				 * nothing gets inserted during the flush.
				 */
				segvn_purge(seg);

				i = btopr(seg->s_size);
				addr = seg->s_base;
				/*
				 * XXX break cow sharing using PAGESIZE
				 * pages. They will be relocated into larger
				 * pages at fault time.
				 */
				while (i-- > 0) {
					if (ap = anon_get_ptr(amp->ahp,
					    old_idx)) {
						error = anon_getpage(&ap,
						    &vpprot, anon_pl, PAGESIZE,
						    seg, addr, S_READ,
						    svd->cred);
						if (error) {
							newsvd->vpage = NULL;
							goto out;
						}
						/*
						 * prot need not be computed
						 * below 'cause anon_private is
						 * going to ignore it anyway
						 * as child doesn't inherit
						 * pagelock from parent.
						 */
						prot = svd->pageprot ?
						    VPP_PROT(
						    &svd->vpage[
						    seg_page(seg, addr)])
						    : svd->prot;
						pp = anon_private(&newap,
						    newseg, addr, prot,
						    anon_pl[0], 0,
						    newsvd->cred);
						if (pp == NULL) {
							/* no mem abort */
							newsvd->vpage = NULL;
							error = ENOMEM;
							goto out;
						}
						(void) anon_set_ptr(
						    newsvd->amp->ahp, new_idx,
						    newap, ANON_SLEEP);
						page_unlock(pp);
					}
					addr += PAGESIZE;
					old_idx++;
					new_idx++;
				}
			} else {	/* common case */
				if (seg->s_szc != 0) {
					/*
					 * If at least one of anon slots of a
					 * large page exists then make sure
					 * all anon slots of a large page
					 * exist to avoid partial cow sharing
					 * of a large page in the future.
					 */
					anon_dup_fill_holes(amp->ahp,
					    svd->anon_index, newsvd->amp->ahp,
					    0, seg->s_size, seg->s_szc,
					    svd->vp != NULL);
				} else {
					anon_dup(amp->ahp, svd->anon_index,
					    newsvd->amp->ahp, 0, seg->s_size);
				}

				hat_clrattr(seg->s_as->a_hat, seg->s_base,
				    seg->s_size, PROT_WRITE);
			}
		}
	}
	/*
	 * If necessary, create a vpage structure for the new segment.
	 * Do not copy any page lock indications.
	 */
	if (svd->vpage != NULL) {
		pgcnt_t i;
		struct vpage *ovp = svd->vpage;
		struct vpage *nvp;

		nvp = newsvd->vpage =
		    kmem_alloc(vpgtob(npages), KM_SLEEP);
		for (i = 0; i < npages; i++) {
			*nvp = *ovp++;
			VPP_CLRPPLOCK(nvp++);
		}
	} else
		newsvd->vpage = NULL;

	/* Inform the vnode of the new mapping */
	if (newsvd->vp != NULL) {
		error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
		    newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
		    newsvd->maxprot, newsvd->type, newsvd->cred, NULL);
	}
out:
	if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
		ASSERT(newsvd->amp == NULL);
		ASSERT(newsvd->tr_state == SEGVN_TR_OFF);
		newsvd->rcookie = svd->rcookie;
		hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
	}
	return (error);
}
/*
 * callback function to invoke free_vp_pages() for only those pages actually
 * processed by the HAT when a shared region is destroyed.
 */
extern int free_pages;

static void
segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
    size_t r_size, void *r_obj, u_offset_t r_objoff)
{
	u_offset_t off;
	size_t len;
	vnode_t *vp = (vnode_t *)r_obj;

	ASSERT(eaddr > saddr);
	ASSERT(saddr >= r_saddr);
	ASSERT(saddr < r_saddr + r_size);
	ASSERT(eaddr > r_saddr);
	ASSERT(eaddr <= r_saddr + r_size);
	ASSERT(vp != NULL);

	if (!free_pages) {
		return;
	}

	len = eaddr - saddr;
	off = (saddr - r_saddr) + r_objoff;
	free_vp_pages(vp, off, len);
}
/*
 * callback function used by segvn_unmap to invoke free_vp_pages() for only
 * those pages actually processed by the HAT
 */
static void
segvn_hat_unload_callback(hat_callback_t *cb)
{
	struct seg *seg = cb->hcb_data;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	size_t len;
	u_offset_t off;

	ASSERT(svd->vp != NULL);
	ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
	ASSERT(cb->hcb_start_addr >= seg->s_base);

	len = cb->hcb_end_addr - cb->hcb_start_addr;
	off = cb->hcb_start_addr - seg->s_base;
	free_vp_pages(svd->vp, svd->offset + off, len);
}
/*
 * This function determines the number of bytes of swap reserved by
 * a segment for which per-page accounting is present. It is used to
 * calculate the correct value of a segvn_data's swresv.
 */
static size_t
segvn_count_swap_by_vpages(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vp, *evp;
	size_t nswappages = 0;

	ASSERT(svd->pageswap);
	ASSERT(svd->vpage != NULL);

	evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];

	for (vp = svd->vpage; vp < evp; vp++) {
		if (VPP_ISSWAPRES(vp))
			nswappages++;
	}

	return (nswappages << PAGESHIFT);
}
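/*
 * Unmap [addr, addr + len) from a segvn segment.  Depending on the range
 * this either frees the whole segment, trims it at the beginning or the
 * end, or splits it into two segments when the unmapped region falls in
 * the middle (see the per-case blocks below).
 */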
1794 segvn_unmap(struct seg
*seg
, caddr_t addr
, size_t len
)
1796 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
1797 struct segvn_data
*nsvd
;
1799 struct anon_map
*amp
;
1800 pgcnt_t opages
; /* old segment size in pages */
1801 pgcnt_t npages
; /* new segment size in pages */
1802 pgcnt_t dpages
; /* pages being deleted (unmapped) */
1803 hat_callback_t callback
; /* used for free_vp_pages() */
1804 hat_callback_t
*cbp
= NULL
;
1811 * We don't need any segment level locks for "segvn" data
1812 * since the address space is "write" locked.
1814 ASSERT(seg
->s_as
&& AS_WRITE_HELD(seg
->s_as
, &seg
->s_as
->a_lock
));
1817 * Fail the unmap if pages are SOFTLOCKed through this mapping.
1818 * softlockcnt is protected from change by the as write lock.
1821 if (svd
->softlockcnt
> 0) {
1822 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
1825 * If this is shared segment non 0 softlockcnt
1826 * means locked pages are still in use.
1828 if (svd
->type
== MAP_SHARED
) {
1833 * since we do have the writers lock nobody can fill
1834 * the cache during the purge. The flush either succeeds
1835 * or we still have pending I/Os.
1846 * Check for bad sizes
1848 if (addr
< seg
->s_base
|| addr
+ len
> seg
->s_base
+ seg
->s_size
||
1849 (len
& PAGEOFFSET
) || ((uintptr_t)addr
& PAGEOFFSET
)) {
1850 panic("segvn_unmap");
1854 if (seg
->s_szc
!= 0) {
1855 size_t pgsz
= page_get_pagesize(seg
->s_szc
);
1857 if (!IS_P2ALIGNED(addr
, pgsz
) || !IS_P2ALIGNED(len
, pgsz
)) {
1858 ASSERT(seg
->s_base
!= addr
|| seg
->s_size
!= len
);
1859 if (HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
)) {
1860 ASSERT(svd
->amp
== NULL
);
1861 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
1862 hat_leave_region(seg
->s_as
->a_hat
,
1863 svd
->rcookie
, HAT_REGION_TEXT
);
1864 svd
->rcookie
= HAT_INVALID_REGION_COOKIE
;
1866 * could pass a flag to segvn_demote_range()
1867 * below to tell it not to do any unloads but
1868 * this case is rare enough to not bother for
1871 } else if (svd
->tr_state
== SEGVN_TR_INIT
) {
1872 svd
->tr_state
= SEGVN_TR_OFF
;
1873 } else if (svd
->tr_state
== SEGVN_TR_ON
) {
1874 ASSERT(svd
->amp
!= NULL
);
1875 segvn_textunrepl(seg
, 1);
1876 ASSERT(svd
->amp
== NULL
);
1877 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
1879 VM_STAT_ADD(segvnvmstats
.demoterange
[0]);
1880 err
= segvn_demote_range(seg
, addr
, len
, SDR_END
, 0);
1888 /* Inform the vnode of the unmapping. */
1892 error
= VOP_DELMAP(svd
->vp
,
1893 (offset_t
)svd
->offset
+ (uintptr_t)(addr
- seg
->s_base
),
1894 seg
->s_as
, addr
, len
, svd
->prot
, svd
->maxprot
,
1895 svd
->type
, svd
->cred
, NULL
);
1897 if (error
== EAGAIN
)
1902 * Remove any page locks set through this mapping.
1903 * If text replication is not off no page locks could have been
1904 * established via this mapping.
1906 if (svd
->tr_state
== SEGVN_TR_OFF
) {
1907 (void) segvn_lockop(seg
, addr
, len
, 0, MC_UNLOCK
, NULL
, 0);
1910 if (HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
)) {
1911 ASSERT(svd
->amp
== NULL
);
1912 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
1913 ASSERT(svd
->type
== MAP_PRIVATE
);
1914 hat_leave_region(seg
->s_as
->a_hat
, svd
->rcookie
,
1916 svd
->rcookie
= HAT_INVALID_REGION_COOKIE
;
1917 } else if (svd
->tr_state
== SEGVN_TR_ON
) {
1918 ASSERT(svd
->amp
!= NULL
);
1919 ASSERT(svd
->pageprot
== 0 && !(svd
->prot
& PROT_WRITE
));
1920 segvn_textunrepl(seg
, 1);
1921 ASSERT(svd
->amp
== NULL
&& svd
->tr_state
== SEGVN_TR_OFF
);
1923 if (svd
->tr_state
!= SEGVN_TR_OFF
) {
1924 ASSERT(svd
->tr_state
== SEGVN_TR_INIT
);
1925 svd
->tr_state
= SEGVN_TR_OFF
;
1928 * Unload any hardware translations in the range to be taken
1929 * out. Use a callback to invoke free_vp_pages() effectively.
1931 if (svd
->vp
!= NULL
&& free_pages
!= 0) {
1932 callback
.hcb_data
= seg
;
1933 callback
.hcb_function
= segvn_hat_unload_callback
;
1936 hat_unload_callback(seg
->s_as
->a_hat
, addr
, len
,
1937 HAT_UNLOAD_UNMAP
, cbp
);
1939 if (svd
->type
== MAP_SHARED
&& svd
->vp
!= NULL
&&
1940 (svd
->vp
->v_flag
& VVMEXEC
) &&
1941 ((svd
->prot
& PROT_WRITE
) || svd
->pageprot
)) {
1942 segvn_inval_trcache(svd
->vp
);
1947 * Check for entire segment
1949 if (addr
== seg
->s_base
&& len
== seg
->s_size
) {
1954 opages
= seg_pages(seg
);
1956 npages
= opages
- dpages
;
1958 ASSERT(amp
== NULL
|| amp
->a_szc
>= seg
->s_szc
);
1961 * Check for beginning of segment
1963 if (addr
== seg
->s_base
) {
1964 if (svd
->vpage
!= NULL
) {
1966 struct vpage
*ovpage
;
1968 ovpage
= svd
->vpage
; /* keep pointer to vpage */
1970 nbytes
= vpgtob(npages
);
1971 svd
->vpage
= kmem_alloc(nbytes
, KM_SLEEP
);
1972 bcopy(&ovpage
[dpages
], svd
->vpage
, nbytes
);
1974 /* free up old vpage */
1975 kmem_free(ovpage
, vpgtob(opages
));
1978 ANON_LOCK_ENTER(&
->a_rwlock
, RW_WRITER
);
1979 if (amp
->refcnt
== 1 || svd
->type
== MAP_PRIVATE
) {
1981 * Shared anon map is no longer in use. Before
1982 * freeing its pages purge all entries from
1983 * pcache that belong to this amp.
1985 if (svd
->type
== MAP_SHARED
) {
1986 ASSERT(amp
->refcnt
== 1);
1987 ASSERT(svd
->softlockcnt
== 0);
1991 * Free up now unused parts of anon_map array.
1993 if (amp
->a_szc
== seg
->s_szc
) {
1994 if (seg
->s_szc
!= 0) {
1995 anon_free_pages(amp
->ahp
,
1996 svd
->anon_index
, len
,
2004 ASSERT(svd
->type
== MAP_SHARED
);
2005 ASSERT(amp
->a_szc
> seg
->s_szc
);
2006 anon_shmap_free_pages(amp
,
2007 svd
->anon_index
, len
);
2011 * Unreserve swap space for the
2012 * unmapped chunk of this segment in
2013 * case it's MAP_SHARED
2015 if (svd
->type
== MAP_SHARED
) {
2016 anon_unresv_zone(len
,
2017 seg
->s_as
->a_proc
->p_zone
);
2021 ANON_LOCK_EXIT(&
->a_rwlock
);
2022 svd
->anon_index
+= dpages
;
2024 if (svd
->vp
!= NULL
)
2031 if (svd
->flags
& MAP_NORESERVE
) {
2033 oswresv
= svd
->swresv
;
2035 svd
->swresv
= ptob(anon_pages(amp
->ahp
,
2036 svd
->anon_index
, npages
));
2037 anon_unresv_zone(oswresv
- svd
->swresv
,
2038 seg
->s_as
->a_proc
->p_zone
);
2039 if (SEG_IS_PARTIAL_RESV(seg
))
2040 seg
->s_as
->a_resvsize
-= oswresv
-
2045 if (svd
->pageswap
) {
2046 oswresv
= svd
->swresv
;
2048 segvn_count_swap_by_vpages(seg
);
2049 ASSERT(oswresv
>= svd
->swresv
);
2050 unlen
= oswresv
- svd
->swresv
;
2053 ASSERT(svd
->swresv
== seg
->s_size
);
2056 anon_unresv_zone(unlen
,
2057 seg
->s_as
->a_proc
->p_zone
);
2059 TRACE_3(TR_FAC_VM
, TR_ANON_PROC
, "anon proc:%p %lu %u",
2067 * Check for end of segment
2069 if (addr
+ len
== seg
->s_base
+ seg
->s_size
) {
2070 if (svd
->vpage
!= NULL
) {
2072 struct vpage
*ovpage
;
2074 ovpage
= svd
->vpage
; /* keep pointer to vpage */
2076 nbytes
= vpgtob(npages
);
2077 svd
->vpage
= kmem_alloc(nbytes
, KM_SLEEP
);
2078 bcopy(ovpage
, svd
->vpage
, nbytes
);
2080 /* free up old vpage */
2081 kmem_free(ovpage
, vpgtob(opages
));
2085 ANON_LOCK_ENTER(&
->a_rwlock
, RW_WRITER
);
2086 if (amp
->refcnt
== 1 || svd
->type
== MAP_PRIVATE
) {
2088 * Free up now unused parts of anon_map array.
2090 ulong_t an_idx
= svd
->anon_index
+ npages
;
2093 * Shared anon map is no longer in use. Before
2094 * freeing its pages purge all entries from
2095 * pcache that belong to this amp.
2097 if (svd
->type
== MAP_SHARED
) {
2098 ASSERT(amp
->refcnt
== 1);
2099 ASSERT(svd
->softlockcnt
== 0);
2103 if (amp
->a_szc
== seg
->s_szc
) {
2104 if (seg
->s_szc
!= 0) {
2105 anon_free_pages(amp
->ahp
,
2109 anon_free(amp
->ahp
, an_idx
,
2113 ASSERT(svd
->type
== MAP_SHARED
);
2114 ASSERT(amp
->a_szc
> seg
->s_szc
);
2115 anon_shmap_free_pages(amp
,
2120 * Unreserve swap space for the
2121 * unmapped chunk of this segment in
2122 * case it's MAP_SHARED
2124 if (svd
->type
== MAP_SHARED
) {
2125 anon_unresv_zone(len
,
2126 seg
->s_as
->a_proc
->p_zone
);
2130 ANON_LOCK_EXIT(&
->a_rwlock
);
2136 if (svd
->flags
& MAP_NORESERVE
) {
2138 oswresv
= svd
->swresv
;
2139 svd
->swresv
= ptob(anon_pages(amp
->ahp
,
2140 svd
->anon_index
, npages
));
2141 anon_unresv_zone(oswresv
- svd
->swresv
,
2142 seg
->s_as
->a_proc
->p_zone
);
2143 if (SEG_IS_PARTIAL_RESV(seg
))
2144 seg
->s_as
->a_resvsize
-= oswresv
-
2149 if (svd
->pageswap
) {
2150 oswresv
= svd
->swresv
;
2152 segvn_count_swap_by_vpages(seg
);
2153 ASSERT(oswresv
>= svd
->swresv
);
2154 unlen
= oswresv
- svd
->swresv
;
2157 ASSERT(svd
->swresv
== seg
->s_size
);
2160 anon_unresv_zone(unlen
,
2161 seg
->s_as
->a_proc
->p_zone
);
2163 TRACE_3(TR_FAC_VM
, TR_ANON_PROC
,
2164 "anon proc:%p %lu %u", seg
, len
, 0);
	/*
	 * The section to go is in the middle of the segment,
	 * have to make it into two segments.  nseg is made for
	 * the high end while seg is cut down at the low end.
	 */
	nbase = addr + len;				/* new seg base */
	nsize = (seg->s_base + seg->s_size) - nbase;	/* new seg size */
	seg->s_size = addr - seg->s_base;		/* shrink old seg */
	nseg = seg_alloc(seg->s_as, nbase, nsize);
	if (nseg == NULL) {
		panic("segvn_unmap seg_alloc");
	}
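	/*
	 * Added commentary (not part of the original driver): the seg/nseg
	 * surgery below is what a partial munmap(2) of the interior of a
	 * mapping exercises.  A minimal userland sketch, assuming only
	 * standard mmap/munmap behavior; identifiers are illustrative:
	 *
	 *	#include <sys/mman.h>
	 *	#include <unistd.h>
	 *
	 *	int
	 *	main(void)
	 *	{
	 *		size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
	 *		char *p = mmap(NULL, 4 * pgsz, PROT_READ | PROT_WRITE,
	 *		    MAP_PRIVATE | MAP_ANON, -1, 0);
	 *
	 *		if (p == MAP_FAILED)
	 *			return (1);
	 *		if (munmap(p + pgsz, 2 * pgsz) != 0)
	 *			return (1);
	 *		p[0] = 'a';
	 *		p[3 * pgsz] = 'b';
	 *		return (0);
	 *	}
	 *
	 * After the munmap the first and last pages remain mapped: the
	 * original segment is shrunk at the low end and a new segment (nseg)
	 * is allocated for the high end, which is what the code below does.
	 */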
	nseg->s_ops = seg->s_ops;
	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
	nseg->s_data = (void *)nsvd;
	nseg->s_szc = seg->s_szc;
	*nsvd = *svd;
	nsvd->seg = nseg;
	nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
	nsvd->swresv = 0;
	nsvd->softlockcnt = 0;
	nsvd->softlockcnt_sbase = 0;
	nsvd->softlockcnt_send = 0;
	ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
	if (svd->vp != NULL) {
		VN_HOLD(nsvd->vp);
		if (nsvd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, nsvd->vp);
	}
	crhold(svd->cred);

	if (svd->vpage == NULL) {
		nsvd->vpage = NULL;
	} else {
		/* need to split vpage into two arrays */
		size_t nbytes;
		struct vpage *ovpage;

		ovpage = svd->vpage;		/* keep pointer to vpage */

		npages = seg_pages(seg);	/* seg has shrunk */
		nbytes = vpgtob(npages);
		svd->vpage = kmem_alloc(nbytes, KM_SLEEP);

		bcopy(ovpage, svd->vpage, nbytes);

		npages = seg_pages(nseg);
		nbytes = vpgtob(npages);
		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);

		bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);

		/* free up old vpage */
		kmem_free(ovpage, vpgtob(opages));
	}
	if (amp == NULL) {
		nsvd->amp = NULL;
		nsvd->anon_index = 0;
	} else {
		/*
		 * Need to create a new anon map for the new segment.
		 * We'll also allocate a new smaller array for the old
		 * smaller segment to save space.
		 */
		opages = btop((uintptr_t)(addr - seg->s_base));
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
			/*
			 * Free up now unused parts of anon_map array.
			 */
			ulong_t an_idx = svd->anon_index + opages;

			/*
			 * Shared anon map is no longer in use. Before
			 * freeing its pages purge all entries from
			 * pcache that belong to this amp.
			 */
			if (svd->type == MAP_SHARED) {
				ASSERT(amp->refcnt == 1);
				ASSERT(svd->softlockcnt == 0);
				anonmap_purge(amp);
			}

			if (amp->a_szc == seg->s_szc) {
				if (seg->s_szc != 0) {
					anon_free_pages(amp->ahp, an_idx, len,
					    seg->s_szc);
				} else {
					anon_free(amp->ahp, an_idx, len);
				}
			} else {
				ASSERT(svd->type == MAP_SHARED);
				ASSERT(amp->a_szc > seg->s_szc);
				anon_shmap_free_pages(amp, an_idx, len);
			}

			/*
			 * Unreserve swap space for the
			 * unmapped chunk of this segment in
			 * case it's MAP_SHARED
			 */
			if (svd->type == MAP_SHARED) {
				anon_unresv_zone(len,
				    seg->s_as->a_proc->p_zone);
			}
		}
		nsvd->anon_index = svd->anon_index +
		    btop((uintptr_t)(nseg->s_base - seg->s_base));
		if (svd->type == MAP_SHARED) {
			amp->refcnt++;
			nsvd->amp = amp;
		} else {
			struct anon_map *namp;
			struct anon_hdr *nahp;

			ASSERT(svd->type == MAP_PRIVATE);
			nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
			namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
			namp->a_szc = seg->s_szc;
			(void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
			    0, btop(seg->s_size), ANON_SLEEP);
			(void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
			    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
			anon_release(amp->ahp, btop(amp->size));
			svd->anon_index = 0;
			nsvd->anon_index = 0;
			amp->ahp = nahp;
			amp->size = seg->s_size;
			nsvd->amp = namp;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
	}
	if (svd->swresv) {
		if (svd->flags & MAP_NORESERVE) {
			oswresv = svd->swresv;
			svd->swresv = ptob(anon_pages(amp->ahp,
			    svd->anon_index, btop(seg->s_size)));
			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
			    nsvd->anon_index, btop(nseg->s_size)));
			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
			anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv),
			    seg->s_as->a_proc->p_zone);
			if (SEG_IS_PARTIAL_RESV(seg))
				seg->s_as->a_resvsize -= oswresv -
				    (svd->swresv + nsvd->swresv);
		} else {
			if (svd->pageswap) {
				oswresv = svd->swresv;
				svd->swresv = segvn_count_swap_by_vpages(seg);
				nsvd->swresv = segvn_count_swap_by_vpages(nseg);
				ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
				unlen = oswresv - (svd->swresv + nsvd->swresv);
			} else {
				if (seg->s_size + nseg->s_size + len !=
				    svd->swresv) {
					panic("segvn_unmap: cannot split "
					    "swap reservation");
					/*NOTREACHED*/
				}
				svd->swresv = seg->s_size;
				nsvd->swresv = nseg->s_size;
				unlen = len;
			}
			anon_unresv_zone(unlen,
			    seg->s_as->a_proc->p_zone);
		}
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
	}

	return (0);			/* I'm glad that's all over with! */
}
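/*
 * Added commentary: a worked example of the reservation split performed at
 * the end of segvn_unmap() above, for the fully reserved (non-pageswap,
 * non-MAP_NORESERVE) case.  Suppose a 16-page segment with swresv covering
 * all 16 pages has 4 pages unmapped from its middle, leaving a 6-page low
 * segment and a 6-page high segment.  The assertion 6 + 6 + 4 == 16 holds,
 * svd->swresv and nsvd->swresv are set to the two new segment sizes, and
 * exactly len == 4 pages worth of swap is unreserved against the zone.
 */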
static void
segvn_free(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	pgcnt_t npages = seg_pages(seg);
	struct anon_map *amp;
	size_t len;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(svd->tr_state == SEGVN_TR_OFF);

	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);

	/*
	 * Be sure to unlock pages. XXX Why do things get free'ed instead
	 * of cleaned up?
	 */
	(void) segvn_lockop(seg, seg->s_base, seg->s_size,
	    0, MC_UNLOCK, NULL, 0);

	/*
	 * Deallocate the vpage and anon pointers if necessary and possible.
	 */
	if (svd->vpage != NULL) {
		kmem_free(svd->vpage, vpgtob(npages));
		svd->vpage = NULL;
	}

	if ((amp = svd->amp) != NULL) {
		/*
		 * If there are no more references to this anon_map
		 * structure, then deallocate the structure after freeing
		 * up all the anon slot pointers that we can.
		 */
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		ASSERT(amp->a_szc >= seg->s_szc);
		if (--amp->refcnt == 0) {
			if (svd->type == MAP_PRIVATE) {
				/*
				 * Private - we only need to anon_free
				 * the part that this segment refers to.
				 */
				if (seg->s_szc != 0) {
					anon_free_pages(amp->ahp,
					    svd->anon_index, seg->s_size,
					    seg->s_szc);
				} else {
					anon_free(amp->ahp, svd->anon_index,
					    seg->s_size);
				}
			} else {
				/*
				 * Shared anon map is no longer in use. Before
				 * freeing its pages purge all entries from
				 * pcache that belong to this amp.
				 */
				ASSERT(svd->softlockcnt == 0);
				anonmap_purge(amp);

				/*
				 * Shared - anon_free the entire
				 * anon_map's worth of stuff and
				 * release any swap reservation.
				 */
				if (amp->a_szc != 0) {
					anon_shmap_free_pages(amp, 0,
					    amp->size);
				} else {
					anon_free(amp->ahp, 0, amp->size);
				}
				if ((len = amp->swresv) != 0) {
					anon_unresv_zone(len,
					    seg->s_as->a_proc->p_zone);
					TRACE_3(TR_FAC_VM, TR_ANON_PROC,
					    "anon proc:%p %lu %u", seg, len, 0);
				}
			}
			svd->amp = NULL;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			anonmap_free(amp);
		} else if (svd->type == MAP_PRIVATE) {
			/*
			 * We had a private mapping which still has
			 * a held anon_map so just free up all the
			 * anon slot pointers that we were using.
			 */
			if (seg->s_szc != 0) {
				anon_free_pages(amp->ahp, svd->anon_index,
				    seg->s_size, seg->s_szc);
			} else {
				anon_free(amp->ahp, svd->anon_index,
				    seg->s_size);
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Release swap reservation.
	 */
	if ((len = svd->swresv) != 0) {
		anon_unresv_zone(svd->swresv,
		    seg->s_as->a_proc->p_zone);
		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
		    seg, len, 0);
		if (SEG_IS_PARTIAL_RESV(seg))
			seg->s_as->a_resvsize -= svd->swresv;
		svd->swresv = 0;
	}

	/*
	 * Release claim on vnode, credentials, and finally free the
	 * private data.
	 */
	if (svd->vp != NULL) {
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_fini(NULL, svd->vp);
		VN_RELE(svd->vp);
		svd->vp = NULL;
	}
	crfree(svd->cred);
	svd->pageprot = 0;
	svd->pageadvice = 0;
	svd->pageswap = 0;
	svd->cred = NULL;

	/*
	 * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's
	 * still working with this segment without holding as lock (in case
	 * it's called by pcache async thread).
	 */
	ASSERT(svd->softlockcnt == 0);
	mutex_enter(&svd->segfree_syncmtx);
	mutex_exit(&svd->segfree_syncmtx);

	seg->s_data = NULL;
	kmem_cache_free(segvn_cache, svd);
}
/*
 * Do a F_SOFTUNLOCK call over the range requested.  The range must have
 * already been F_SOFTLOCK'ed.
 * Caller must always match addr and len of a softunlock with a previous
 * softlock with exactly the same addr and len.
 */
static void
segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	page_t *pp;
	caddr_t adr;
	struct vnode *vp;
	u_offset_t offset;
	ulong_t anon_index;
	struct anon_map *amp;
	struct anon *ap = NULL;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));

	if ((amp = svd->amp) != NULL)
		anon_index = svd->anon_index + seg_page(seg, addr);

	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie);
	} else {
		hat_unlock(seg->s_as->a_hat, addr, len);
	}
	for (adr = addr; adr < addr + len; adr += PAGESIZE) {
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			if ((ap = anon_get_ptr(amp->ahp, anon_index++))
			    != NULL) {
				swap_xlate(ap, &vp, &offset);
			} else {
				vp = svd->vp;
				offset = svd->offset +
				    (uintptr_t)(adr - seg->s_base);
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			vp = svd->vp;
			offset = svd->offset +
			    (uintptr_t)(adr - seg->s_base);
		}

		/*
		 * Use page_find() instead of page_lookup() to
		 * find the page since we know that it is locked.
		 */
		pp = page_find(vp, offset);
		if (pp == NULL) {
			panic(
			    "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
			    (void *)adr, (void *)ap, (void *)vp, offset);
		}

		if (rw == S_WRITE) {
			hat_setrefmod(pp);
			if (seg->s_as->a_vbits)
				hat_setstat(seg->s_as, adr, PAGESIZE,
				    P_REF | P_MOD);
		} else if (rw != S_OTHER) {
			hat_setref(pp);
			if (seg->s_as->a_vbits)
				hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
		}
		TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
		    "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
		page_unlock(pp);
	}
	ASSERT(svd->softlockcnt >= btop(len));
	if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) {
		/*
		 * All SOFTLOCKS are gone. Wakeup any waiting
		 * unmappers so they can try again to unmap.
		 * Check for waiters first without the mutex
		 * held so we don't always grab the mutex on
		 * softunlocks.
		 */
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
}
#define	PAGE_HANDLED	((page_t *)-1)

/*
 * Release all the pages in the NULL terminated ppp list
 * which haven't already been converted to PAGE_HANDLED.
 */
static void
segvn_pagelist_rele(page_t **ppp)
{
	for (; *ppp != NULL; ppp++) {
		if (*ppp != PAGE_HANDLED)
			page_unlock(*ppp);
	}
}

static int stealcow = 1;

/*
 * Workaround for viking chip bug.  See bug id 1220902.
 * To fix this down in pagefault() would require importing so
 * much as and segvn code as to be unmaintainable.
 */
int enable_mbit_wa = 0;
/*
 * Handles all the dirty work of getting the right
 * anonymous pages and loading up the translations.
 * This routine is called only from segvn_fault()
 * when looping over the range of addresses requested.
 *
 * The basic algorithm here is:
 *	If this is an anon_zero case
 *		Call anon_zero to allocate page
 *		Load up translation
 *		Return
 *	endif
 *	If this is an anon page
 *		Use anon_getpage to get the page
 *	else
 *		Find page in pl[] list passed in
 *	endif
 *	If not a copy-on-write case
 *		Load up the translation to the page
 *		return
 *	endif
 *	Call anon_private to handle cow
 *	Load up (writable) translation to new page
 */
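/*
 * Added commentary: the copy-on-write leg of the algorithm above is what a
 * write to a MAP_PRIVATE file mapping exercises.  A minimal userland sketch,
 * assuming an existing readable file "f" of at least one page; identifiers
 * are illustrative:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
 *		int fd = open("f", O_RDONLY);
 *		char *p;
 *
 *		if (fd < 0)
 *			return (1);
 *		p = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
 *		    MAP_PRIVATE, fd, 0);
 *		if (p == MAP_FAILED)
 *			return (1);
 *		p[0] = 'x';
 *		return (0);
 *	}
 *
 * The write fault finds the object page in the pl[] list and, because the
 * segment is private and the object page is write protected, takes the
 * "Call anon_private to handle cow" path and loads a writable translation
 * to the new anonymous copy; the file itself is never modified.
 */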
static faultcode_t
segvn_faultpage(
	struct hat *hat,		/* the hat to use for mapping */
	struct seg *seg,		/* seg_vn of interest */
	caddr_t addr,			/* address in as */
	u_offset_t off,			/* offset in vp */
	struct vpage *vpage,		/* pointer to vpage for vp, off */
	page_t *pl[],			/* object source page pointer */
	uint_t vpprot,			/* access allowed to object pages */
	enum fault_type type,		/* type of fault */
	enum seg_rw rw,			/* type of access at fault */
	int brkcow)			/* we may need to break cow */
{
2650 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
2652 uint_t pageflags
= 0;
2653 page_t
*anon_pl
[1 + 1];
2654 page_t
*opp
= NULL
; /* original page */
2661 struct anon
*ap
, *oldap
;
2662 struct anon_map
*amp
;
2663 int hat_flag
= (type
== F_SOFTLOCK
) ? HAT_LOAD_LOCK
: HAT_LOAD
;
2665 anon_sync_obj_t cookie
;
2667 if (svd
->flags
& MAP_TEXT
) {
2668 hat_flag
|= HAT_LOAD_TEXT
;
2671 ASSERT(SEGVN_READ_HELD(seg
->s_as
, &svd
->lock
));
2672 ASSERT(seg
->s_szc
== 0);
2673 ASSERT(svd
->tr_state
!= SEGVN_TR_INIT
);
2676 * Initialize protection value for this page.
2677 * If we have per page protection values check it now.
2679 if (svd
->pageprot
) {
2684 protchk
= PROT_READ
;
2687 protchk
= PROT_WRITE
;
2690 protchk
= PROT_EXEC
;
2694 protchk
= PROT_READ
| PROT_WRITE
| PROT_EXEC
;
2698 prot
= VPP_PROT(vpage
);
2699 if ((prot
& protchk
) == 0)
2700 return (FC_PROT
); /* illegal access type */
2705 if (type
== F_SOFTLOCK
) {
2706 atomic_add_long((ulong_t
*)&svd
->softlockcnt
, 1);
2710 * Always acquire the anon array lock to prevent 2 threads from
2711 * allocating separate anon slots for the same "addr".
2714 if ((amp
= svd
->amp
) != NULL
) {
2715 ASSERT(RW_READ_HELD(&
->a_rwlock
));
2716 anon_index
= svd
->anon_index
+ seg_page(seg
, addr
);
2717 anon_array_enter(amp
, anon_index
, &cookie
);
2721 if (svd
->vp
== NULL
&& amp
!= NULL
) {
2722 if ((ap
= anon_get_ptr(amp
->ahp
, anon_index
)) == NULL
) {
2724 * Allocate a (normally) writable anonymous page of
2725 * zeroes. If no advance reservations, reserve now.
2727 if (svd
->flags
& MAP_NORESERVE
) {
2728 if (anon_resv_zone(ptob(1),
2729 seg
->s_as
->a_proc
->p_zone
)) {
2730 atomic_add_long(&svd
->swresv
, ptob(1));
2731 atomic_add_long(&seg
->s_as
->a_resvsize
,
2738 if ((pp
= anon_zero(seg
, addr
, &ap
,
2739 svd
->cred
)) == NULL
) {
2741 goto out
; /* out of swap space */
2744 * Re-acquire the anon_map lock and
2745 * initialize the anon array entry.
2747 (void) anon_set_ptr(amp
->ahp
, anon_index
, ap
,
2750 ASSERT(pp
->p_szc
== 0);
2753 * Handle pages that have been marked for migration
2755 if (lgrp_optimizations())
2756 page_migrate(seg
, addr
, &pp
, 1);
2758 if (enable_mbit_wa
) {
2761 else if (!hat_ismod(pp
))
2762 prot
&= ~PROT_WRITE
;
			/*
			 * If AS_PAGLCK is set in a_flags (via memcntl(2)
			 * with MC_LOCKAS, MCL_FUTURE) and this is a
			 * MAP_NORESERVE segment, we may need to
			 * permanently lock the page as it is being faulted
			 * for the first time. The following text applies
			 * only to MAP_NORESERVE segments:
			 *
			 * As per memcntl(2), if this segment was created
			 * after MCL_FUTURE was applied (a "future"
			 * segment), its pages must be locked.  If this
			 * segment existed at MCL_FUTURE application (a
			 * "past" segment), the interface is unclear.
			 *
			 * We decide to lock only if vpage is present:
			 *
			 * - "future" segments will have a vpage array (see
			 *    as_map), and so will be locked as required
			 *
			 * - "past" segments may not have a vpage array,
			 *    depending on whether events (such as
			 *    mprotect) have occurred. Locking if vpage
			 *    exists will preserve legacy behavior.  Not
			 *    locking if vpage is absent, will not break
			 *    the interface or legacy behavior.  Note that
			 *    allocating vpage here if it's absent requires
			 *    upgrading the segvn reader lock, the cost of
			 *    which does not seem worthwhile.
			 *
			 * Usually testing and setting VPP_ISPPLOCK and
			 * VPP_SETPPLOCK requires holding the segvn lock as
			 * writer, but in this case all readers are
			 * serializing on the anon array lock.
			 */
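			/*
			 * Added commentary: a hedged userland sketch of the
			 * "future" segment case described above.
			 * mlockall(3C) with MCL_FUTURE is the usual wrapper
			 * around memcntl(2) MC_LOCKAS; identifiers are
			 * illustrative:
			 *
			 *	#include <sys/mman.h>
			 *	#include <unistd.h>
			 *
			 *	int
			 *	main(void)
			 *	{
			 *		size_t pgsz =
			 *		    (size_t)sysconf(_SC_PAGESIZE);
			 *		char *p;
			 *
			 *		if (mlockall(MCL_CURRENT |
			 *		    MCL_FUTURE) != 0)
			 *			return (1);
			 *		p = mmap(NULL, pgsz,
			 *		    PROT_READ | PROT_WRITE,
			 *		    MAP_PRIVATE | MAP_ANON |
			 *		    MAP_NORESERVE, -1, 0);
			 *		if (p == MAP_FAILED)
			 *			return (1);
			 *		p[0] = 'x';
			 *		return (0);
			 *	}
			 *
			 * The MAP_NORESERVE segment is created after
			 * MCL_FUTURE took effect, so the first fault on it
			 * goes through the locking code below
			 * (rctl_incr_locked_mem/page_pp_lock) when a vpage
			 * array is present.
			 */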
2798 if (AS_ISPGLCK(seg
->s_as
) && vpage
!= NULL
&&
2799 (svd
->flags
& MAP_NORESERVE
) &&
2800 !VPP_ISPPLOCK(vpage
)) {
2801 proc_t
*p
= seg
->s_as
->a_proc
;
2802 ASSERT(svd
->type
== MAP_PRIVATE
);
2803 mutex_enter(&p
->p_lock
);
2804 if (rctl_incr_locked_mem(p
, NULL
, PAGESIZE
,
2806 claim
= VPP_PROT(vpage
) & PROT_WRITE
;
2807 if (page_pp_lock(pp
, claim
, 0)) {
2808 VPP_SETPPLOCK(vpage
);
2810 rctl_decr_locked_mem(p
, NULL
,
2814 mutex_exit(&p
->p_lock
);
2817 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
2818 hat_memload(hat
, addr
, pp
, prot
, hat_flag
);
2820 if (!(hat_flag
& HAT_LOAD_LOCK
))
2823 anon_array_exit(&cookie
);
2829 * Obtain the page structure via anon_getpage() if it is
2830 * a private copy of an object (the result of a previous
2834 if ((ap
= anon_get_ptr(amp
->ahp
, anon_index
)) != NULL
) {
2835 err
= anon_getpage(&ap
, &vpprot
, anon_pl
, PAGESIZE
,
2836 seg
, addr
, rw
, svd
->cred
);
2840 if (svd
->type
== MAP_SHARED
) {
2842 * If this is a shared mapping to an
2843 * anon_map, then ignore the write
2844 * permissions returned by anon_getpage().
2845 * They apply to the private mappings
2848 vpprot
|= PROT_WRITE
;
2855 * Search the pl[] list passed in if it is from the
2856 * original object (i.e., not a private copy).
2860 * Find original page. We must be bringing it in
2861 * from the list in pl[].
2863 for (ppp
= pl
; (opp
= *ppp
) != NULL
; ppp
++) {
2864 if (opp
== PAGE_HANDLED
)
2866 ASSERT(opp
->p_vnode
== svd
->vp
); /* XXX */
2867 if (opp
->p_offset
== off
)
2871 panic("segvn_faultpage not found");
2874 *ppp
= PAGE_HANDLED
;
2878 ASSERT(PAGE_LOCKED(opp
));
2880 TRACE_3(TR_FAC_VM
, TR_SEGVN_FAULT
,
2881 "segvn_fault:pp %p vp %p offset %llx", opp
, NULL
, 0);
2884 * The fault is treated as a copy-on-write fault if a
2885 * write occurs on a private segment and the object
2886 * page (i.e., mapping) is write protected. We assume
2887 * that fatal protection checks have already been made.
2891 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
2892 cow
= !(vpprot
& PROT_WRITE
);
2893 } else if (svd
->tr_state
== SEGVN_TR_ON
) {
2895 * If we are doing text replication COW on first touch.
2897 ASSERT(amp
!= NULL
);
2898 ASSERT(svd
->vp
!= NULL
);
2899 ASSERT(rw
!= S_WRITE
);
2906 * If not a copy-on-write case load the translation
2912 * Handle pages that have been marked for migration
2914 if (lgrp_optimizations())
2915 page_migrate(seg
, addr
, &opp
, 1);
2917 if (IS_VMODSORT(opp
->p_vnode
) || enable_mbit_wa
) {
2920 else if (rw
!= S_OTHER
&& !hat_ismod(opp
))
2921 prot
&= ~PROT_WRITE
;
2924 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
||
2925 (!svd
->pageprot
&& svd
->prot
== (prot
& vpprot
)));
2926 ASSERT(amp
== NULL
||
2927 svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
2928 hat_memload_region(hat
, addr
, opp
, prot
& vpprot
, hat_flag
,
2931 if (!(hat_flag
& HAT_LOAD_LOCK
))
2935 anon_array_exit(&cookie
);
2940 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
2944 ASSERT(amp
!= NULL
&& anon_lock
);
2947 * Steal the page only if it isn't a private page
2948 * since stealing a private page is not worth the effort.
2950 if ((ap
= anon_get_ptr(amp
->ahp
, anon_index
)) == NULL
)
	/*
	 * Steal the original page if the following conditions are true:
	 *
	 * We are low on memory, the page is not private, page is not large,
	 * not shared, not modified, not `locked' or if we have it `locked'
	 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies
	 * that the page is not shared) and if it doesn't have any
	 * translations. page_struct_lock isn't needed to look at p_cowcnt
	 * and p_lckcnt because we first get exclusive lock on page.
	 */
2963 (void) hat_pagesync(opp
, HAT_SYNC_DONTZERO
| HAT_SYNC_STOPON_MOD
);
2965 if (stealcow
&& freemem
< minfree
&& steal
&& opp
->p_szc
== 0 &&
2966 page_tryupgrade(opp
) && !hat_ismod(opp
) &&
2967 ((opp
->p_lckcnt
== 0 && opp
->p_cowcnt
== 0) ||
2968 (opp
->p_lckcnt
== 0 && opp
->p_cowcnt
== 1 &&
2969 vpage
!= NULL
&& VPP_ISPPLOCK(vpage
)))) {
2971 * Check if this page has other translations
2972 * after unloading our translation.
2974 if (hat_page_is_mapped(opp
)) {
2975 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
2976 hat_unload(seg
->s_as
->a_hat
, addr
, PAGESIZE
,
2981 * hat_unload() might sync back someone else's recent
2982 * modification, so check again.
2984 if (!hat_ismod(opp
) && !hat_page_is_mapped(opp
))
2985 pageflags
|= STEAL_PAGE
;
2989 * If we have a vpage pointer, see if it indicates that we have
2990 * ``locked'' the page we map -- if so, tell anon_private to
2991 * transfer the locking resource to the new page.
2993 * See Statement at the beginning of segvn_lockop regarding
2994 * the way lockcnts/cowcnts are handled during COW.
2997 if (vpage
!= NULL
&& VPP_ISPPLOCK(vpage
))
2998 pageflags
|= LOCK_PAGE
;
3001 * Allocate a private page and perform the copy.
3002 * For MAP_NORESERVE reserve swap space now, unless this
3003 * is a cow fault on an existing anon page in which case
3004 * MAP_NORESERVE will have made advance reservations.
3006 if ((svd
->flags
& MAP_NORESERVE
) && (ap
== NULL
)) {
3007 if (anon_resv_zone(ptob(1), seg
->s_as
->a_proc
->p_zone
)) {
3008 atomic_add_long(&svd
->swresv
, ptob(1));
3009 atomic_add_long(&seg
->s_as
->a_resvsize
, ptob(1));
3017 pp
= anon_private(&ap
, seg
, addr
, prot
, opp
, pageflags
, svd
->cred
);
3019 err
= ENOMEM
; /* out of swap space */
3024 * If we copied away from an anonymous page, then
3025 * we are one step closer to freeing up an anon slot.
3027 * NOTE: The original anon slot must be released while
3028 * holding the "anon_map" lock. This is necessary to prevent
3029 * other threads from obtaining a pointer to the anon slot
3030 * which may be freed if its "refcnt" is 1.
3035 (void) anon_set_ptr(amp
->ahp
, anon_index
, ap
, ANON_SLEEP
);
3038 * Handle pages that have been marked for migration
3040 if (lgrp_optimizations())
3041 page_migrate(seg
, addr
, &pp
, 1);
3043 ASSERT(pp
->p_szc
== 0);
3045 ASSERT(!IS_VMODSORT(pp
->p_vnode
));
3046 if (enable_mbit_wa
) {
3049 else if (!hat_ismod(pp
))
3050 prot
&= ~PROT_WRITE
;
3053 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
3054 hat_memload(hat
, addr
, pp
, prot
, hat_flag
);
3056 if (!(hat_flag
& HAT_LOAD_LOCK
))
3060 anon_array_exit(&cookie
);
3064 anon_array_exit(&cookie
);
3066 if (type
== F_SOFTLOCK
) {
3067 atomic_add_long((ulong_t
*)&svd
->softlockcnt
, -1);
3069 return (FC_MAKE_ERR(err
));
/*
 * relocate a bunch of smaller targ pages into one large repl page. all targ
 * pages must be complete pages smaller than replacement pages.
 * it's assumed that no page's szc can change since they are all PAGESIZE or
 * complete large pages locked SHARED.
 */
static void
segvn_relocate_pages(page_t **targ, page_t *replacement)
{
3082 pgcnt_t repl_npgs
, curnpgs
;
3084 uint_t repl_szc
= replacement
->p_szc
;
3085 page_t
*first_repl
= replacement
;
3089 VM_STAT_ADD(segvnvmstats
.relocatepages
[0]);
3091 ASSERT(repl_szc
!= 0);
3092 npgs
= repl_npgs
= page_get_pagecnt(repl_szc
);
3098 ASSERT(replacement
!= NULL
);
3100 ASSERT(pp
->p_szc
< repl_szc
);
3101 ASSERT(PAGE_EXCL(pp
));
3102 ASSERT(!PP_ISFREE(pp
));
3103 curnpgs
= page_get_pagecnt(pp
->p_szc
);
3105 VM_STAT_ADD(segvnvmstats
.relocatepages
[1]);
3107 page_sub(&replacement
, repl
);
3108 ASSERT(PAGE_EXCL(repl
));
3109 ASSERT(!PP_ISFREE(repl
));
3110 ASSERT(repl
->p_szc
== repl_szc
);
3112 page_t
*repl_savepp
;
3114 VM_STAT_ADD(segvnvmstats
.relocatepages
[2]);
3115 repl_savepp
= replacement
;
3116 for (j
= 0; j
< curnpgs
; j
++) {
3118 page_sub(&replacement
, repl
);
3119 ASSERT(PAGE_EXCL(repl
));
3120 ASSERT(!PP_ISFREE(repl
));
3121 ASSERT(repl
->p_szc
== repl_szc
);
3122 ASSERT(page_pptonum(targ
[i
+ j
]) ==
3123 page_pptonum(targ
[i
]) + j
);
3126 ASSERT(IS_P2ALIGNED(page_pptonum(repl
), curnpgs
));
3128 err
= page_relocate(&pp
, &repl
, 0, 1, &nreloc
, NULL
);
3129 if (err
|| nreloc
!= curnpgs
) {
3130 panic("segvn_relocate_pages: "
3131 "page_relocate failed err=%d curnpgs=%ld "
3132 "nreloc=%ld", err
, curnpgs
, nreloc
);
3134 ASSERT(curnpgs
<= repl_npgs
);
3135 repl_npgs
-= curnpgs
;
3138 ASSERT(replacement
== NULL
);
3142 for (i
= 0; i
< repl_npgs
; i
++) {
3143 ASSERT(PAGE_EXCL(repl
));
3144 ASSERT(!PP_ISFREE(repl
));
3146 page_downgrade(targ
[i
]);
/*
 * Check if all pages in ppa array are complete smaller than szc pages and
 * their roots will still be aligned relative to their current size if the
 * entire ppa array is relocated into one szc page. If these conditions are
 * not met return 0.
 *
 * If all pages are properly aligned attempt to upgrade their locks
 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0.
 * upgrdfail was set to 0 by caller.
 *
 * Return 1 if all pages are aligned and locked exclusively.
 *
 * If all pages in ppa array happen to be physically contiguous to make one
 * szc page and all exclusive locks are successfully obtained promote the page
 * size to szc and set *pszc to szc. Return 1 with pages locked shared.
 */
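/*
 * Added commentary: the alignment tests used throughout this routine are
 * plain power-of-two arithmetic.  A minimal sketch of the check (an
 * illustrative helper, not part of the driver):
 *
 *	static int
 *	pfn_is_root_aligned(pfn_t pfn, pgcnt_t npgs)
 *	{
 *		return ((pfn & (npgs - 1)) == 0);
 *	}
 *
 * For example, with totnpgs == 8 a pfn of 0x1234 fails (0x1234 & 7 == 4)
 * while 0x1238 passes, which is what IS_P2ALIGNED(pfn, totnpgs) evaluates
 * in the loop below.
 */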
static int
segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc)
{
3172 pgcnt_t totnpgs
= page_get_pagecnt(szc
);
3183 VM_STAT_ADD(segvnvmstats
.fullszcpages
[0]);
3185 for (i
= 0; i
< totnpgs
; i
++) {
3187 ASSERT(PAGE_SHARED(pp
));
3188 ASSERT(!PP_ISFREE(pp
));
3189 pfn
= page_pptonum(pp
);
3191 if (!IS_P2ALIGNED(pfn
, totnpgs
)) {
3196 } else if (contig
&& pfn
!= first_pfn
+ i
) {
3199 if (pp
->p_szc
== 0) {
3201 VM_STAT_ADD(segvnvmstats
.fullszcpages
[1]);
3205 if ((curszc
= pp
->p_szc
) >= szc
) {
3206 VM_STAT_ADD(segvnvmstats
.fullszcpages
[2]);
3211 * p_szc changed means we don't have all pages
3212 * locked. return failure.
3214 VM_STAT_ADD(segvnvmstats
.fullszcpages
[3]);
3217 curnpgs
= page_get_pagecnt(curszc
);
3218 if (!IS_P2ALIGNED(pfn
, curnpgs
) ||
3219 !IS_P2ALIGNED(i
, curnpgs
)) {
3220 VM_STAT_ADD(segvnvmstats
.fullszcpages
[4]);
3226 VM_STAT_ADD(segvnvmstats
.fullszcpages
[5]);
3227 if (pp
->p_szc
!= curszc
) {
3228 VM_STAT_ADD(segvnvmstats
.fullszcpages
[6]);
3231 if (pfn
- 1 != page_pptonum(ppa
[i
- 1])) {
3232 panic("segvn_full_szcpages: "
3233 "large page not physically contiguous");
3235 if (P2PHASE(pfn
, curnpgs
) == curnpgs
- 1) {
3241 for (i
= 0; i
< totnpgs
; i
++) {
3242 ASSERT(ppa
[i
]->p_szc
< szc
);
3243 if (!page_tryupgrade(ppa
[i
])) {
3244 for (j
= 0; j
< i
; j
++) {
3245 page_downgrade(ppa
[j
]);
3247 *pszc
= ppa
[i
]->p_szc
;
3249 VM_STAT_ADD(segvnvmstats
.fullszcpages
[7]);
	/*
	 * When a page is put on a free cachelist its szc is set to 0. if file
	 * system reclaimed pages from cachelist targ pages will be physically
	 * contiguous with 0 p_szc. in this case just upgrade szc of targ
	 * pages without any relocations.
	 * To avoid any hat issues with previous small mappings
	 * hat_pageunload() the target pages first.
	 */
3263 VM_STAT_ADD(segvnvmstats
.fullszcpages
[8]);
3264 for (i
= 0; i
< totnpgs
; i
++) {
3265 (void) hat_pageunload(ppa
[i
], HAT_FORCE_PGUNLOAD
);
3267 for (i
= 0; i
< totnpgs
; i
++) {
3268 ppa
[i
]->p_szc
= szc
;
3270 for (i
= 0; i
< totnpgs
; i
++) {
3271 ASSERT(PAGE_EXCL(ppa
[i
]));
3272 page_downgrade(ppa
[i
]);
3278 VM_STAT_ADD(segvnvmstats
.fullszcpages
[9]);
/*
 * Create physically contiguous pages for [vp, off] - [vp, off +
 * page_size(szc)) range and for private segment return them in ppa array.
 * Pages are created either via IO or relocations.
 *
 * Return 1 on success and 0 on failure.
 *
 * If physically contiguous pages already exist for this range return 1 without
 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa
 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE().
 */
static int
segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off,
    uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc,
    int *downsize)
{
3300 page_t
*pplist
= *ppplist
;
3301 size_t pgsz
= page_get_pagesize(szc
);
3302 pgcnt_t pages
= btop(pgsz
);
3303 ulong_t start_off
= off
;
3304 u_offset_t eoff
= off
+ pgsz
;
3306 u_offset_t io_off
= off
;
3308 page_t
*io_pplist
= NULL
;
3309 page_t
*done_pplist
= NULL
;
3318 page_t
*targ_pplist
= NULL
;
3319 page_t
*repl_pplist
= NULL
;
3325 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[0]);
3328 ASSERT(pplist
->p_szc
== szc
);
	/*
	 * downsize will be set to 1 only if we fail to lock pages. this will
	 * allow subsequent faults to try to relocate the page again. If we
	 * fail due to misalignment don't downsize and let the caller map the
	 * whole region with small mappings to avoid more faults into the area
	 * where we can't get large pages anyway.
	 */
3339 while (off
< eoff
) {
3341 ASSERT(newpp
!= NULL
);
3342 ASSERT(PAGE_EXCL(newpp
));
3343 ASSERT(!PP_ISFREE(newpp
));
3345 * we pass NULL for nrelocp to page_lookup_create()
3346 * so that it doesn't relocate. We relocate here
3347 * later only after we make sure we can lock all
3348 * pages in the range we handle and they are all
3351 pp
= page_lookup_create(vp
, off
, SE_SHARED
, newpp
, NULL
, 0);
3353 ASSERT(!PP_ISFREE(pp
));
3354 ASSERT(pp
->p_vnode
== vp
);
3355 ASSERT(pp
->p_offset
== off
);
3357 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[1]);
3358 page_sub(&pplist
, pp
);
3359 ASSERT(PAGE_EXCL(pp
));
3360 ASSERT(page_iolock_assert(pp
));
3361 page_list_concat(&io_pplist
, &pp
);
3365 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[2]);
3366 pfn
= page_pptonum(pp
);
3368 if (pszc
>= szc
&& targ_pplist
== NULL
&& io_pplist
== NULL
&&
3369 IS_P2ALIGNED(pfn
, pages
)) {
3370 ASSERT(repl_pplist
== NULL
);
3371 ASSERT(done_pplist
== NULL
);
3372 ASSERT(pplist
== *ppplist
);
3374 page_free_replacement_page(pplist
);
3375 page_create_putback(pages
);
3377 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[3]);
3382 segvn_faultvnmpss_align_err1
++;
3385 ppages
= page_get_pagecnt(pszc
);
3386 if (!IS_P2ALIGNED(pfn
, ppages
)) {
3389 * sizing down to pszc won't help.
3392 segvn_faultvnmpss_align_err2
++;
3395 pfn
= page_pptonum(newpp
);
3396 if (!IS_P2ALIGNED(pfn
, ppages
)) {
3399 * sizing down to pszc won't help.
3402 segvn_faultvnmpss_align_err3
++;
3405 if (!PAGE_EXCL(pp
)) {
3406 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[4]);
3409 *ret_pszc
= pp
->p_szc
;
3413 if (io_pplist
!= NULL
) {
3414 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[5]);
3415 io_len
= off
- io_off
;
			/*
			 * Some file systems like NFS don't check EOF
			 * conditions in VOP_PAGEIO(). Check it here
			 * now that pages are locked SE_EXCL. Any file
			 * truncation will wait until the pages are
			 * unlocked so no need to worry that file will
			 * be truncated after we check its size here.
			 * XXX fix NFS to remove this check.
			 */
3425 va
.va_mask
= AT_SIZE
;
3426 if (VOP_GETATTR(vp
, &va
, ATTR_HINT
, svd
->cred
, NULL
)) {
3427 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[6]);
3428 page_unlock(targpp
);
3431 if (btopr(va
.va_size
) < btopr(io_off
+ io_len
)) {
3432 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[7]);
3435 page_unlock(targpp
);
3438 io_err
= VOP_PAGEIO(vp
, io_pplist
, io_off
, io_len
,
3439 B_READ
, svd
->cred
, NULL
);
3441 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[8]);
3442 page_unlock(targpp
);
3443 if (io_err
== EDEADLK
) {
3444 segvn_vmpss_pageio_deadlk_err
++;
3449 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[9]);
3450 while (io_pplist
!= NULL
) {
3452 page_sub(&io_pplist
, pp
);
3453 ASSERT(page_iolock_assert(pp
));
3455 pgidx
= (pp
->p_offset
- start_off
) >>
3457 ASSERT(pgidx
< pages
);
3459 page_list_concat(&done_pplist
, &pp
);
3463 ASSERT(PAGE_EXCL(pp
));
3464 ASSERT(pp
->p_szc
<= pszc
);
3465 if (pszc
!= 0 && !group_page_trylock(pp
, SE_EXCL
)) {
3466 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[10]);
3469 *ret_pszc
= pp
->p_szc
;
3472 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[11]);
3474 * page szc chould have changed before the entire group was
3475 * locked. reread page szc.
3478 ppages
= page_get_pagecnt(pszc
);
3480 /* link just the roots */
3481 page_list_concat(&targ_pplist
, &pp
);
3482 page_sub(&pplist
, newpp
);
3483 page_list_concat(&repl_pplist
, &newpp
);
3485 while (--ppages
!= 0) {
3487 page_sub(&pplist
, newpp
);
3492 if (io_pplist
!= NULL
) {
3493 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[12]);
3494 io_len
= eoff
- io_off
;
3495 va
.va_mask
= AT_SIZE
;
3496 if (VOP_GETATTR(vp
, &va
, ATTR_HINT
, svd
->cred
, NULL
) != 0) {
3497 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[13]);
3500 if (btopr(va
.va_size
) < btopr(io_off
+ io_len
)) {
3501 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[14]);
3506 io_err
= VOP_PAGEIO(vp
, io_pplist
, io_off
, io_len
,
3507 B_READ
, svd
->cred
, NULL
);
3509 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[15]);
3510 if (io_err
== EDEADLK
) {
3511 segvn_vmpss_pageio_deadlk_err
++;
3516 while (io_pplist
!= NULL
) {
3518 page_sub(&io_pplist
, pp
);
3519 ASSERT(page_iolock_assert(pp
));
3521 pgidx
= (pp
->p_offset
- start_off
) >> PAGESHIFT
;
3522 ASSERT(pgidx
< pages
);
3527 * we're now bound to succeed or panic.
3528 * remove pages from done_pplist. it's not needed anymore.
3530 while (done_pplist
!= NULL
) {
3532 page_sub(&done_pplist
, pp
);
3534 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[16]);
3535 ASSERT(pplist
== NULL
);
3537 while (targ_pplist
!= NULL
) {
3539 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[17]);
3540 ASSERT(repl_pplist
);
3542 page_sub(&targ_pplist
, pp
);
3543 pgidx
= (pp
->p_offset
- start_off
) >> PAGESHIFT
;
3544 newpp
= repl_pplist
;
3545 page_sub(&repl_pplist
, newpp
);
3547 pfn
= page_pptonum(pp
);
3549 ppages
= page_get_pagecnt(pszc
);
3550 ASSERT(IS_P2ALIGNED(pfn
, ppages
));
3551 pfn
= page_pptonum(newpp
);
3552 ASSERT(IS_P2ALIGNED(pfn
, ppages
));
3553 ASSERT(P2PHASE(pfn
, pages
) == pgidx
);
3556 ret
= page_relocate(&pp
, &newpp
, 0, 1, &nreloc
, NULL
);
3557 if (ret
!= 0 || nreloc
== 0) {
3558 panic("segvn_fill_vp_pages: "
3559 "page_relocate failed");
3562 while (nreloc
-- != 0) {
3563 ASSERT(PAGE_EXCL(pp
));
3564 ASSERT(pp
->p_vnode
== vp
);
3566 ((pp
->p_offset
- start_off
) >> PAGESHIFT
));
3572 if (svd
->type
== MAP_PRIVATE
) {
3573 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[18]);
3574 for (i
= 0; i
< pages
; i
++) {
3575 ASSERT(ppa
[i
] != NULL
);
3576 ASSERT(PAGE_EXCL(ppa
[i
]));
3577 ASSERT(ppa
[i
]->p_vnode
== vp
);
3578 ASSERT(ppa
[i
]->p_offset
==
3579 start_off
+ (i
<< PAGESHIFT
));
3580 page_downgrade(ppa
[i
]);
3584 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[19]);
3586 * the caller will still call VOP_GETPAGE() for shared segments
3587 * to check FS write permissions. For private segments we map
3588 * file read only anyway. so no VOP_GETPAGE is needed.
3590 for (i
= 0; i
< pages
; i
++) {
3591 ASSERT(ppa
[i
] != NULL
);
3592 ASSERT(PAGE_EXCL(ppa
[i
]));
3593 ASSERT(ppa
[i
]->p_vnode
== vp
);
3594 ASSERT(ppa
[i
]->p_offset
==
3595 start_off
+ (i
<< PAGESHIFT
));
3596 page_unlock(ppa
[i
]);
3604 * Do the cleanup. Unlock target pages we didn't relocate. They are
3605 * linked on targ_pplist by root pages. reassemble unused replacement
3606 * and io pages back to pplist.
3608 if (io_pplist
!= NULL
) {
3609 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[20]);
3612 ASSERT(pp
->p_vnode
== vp
);
3613 ASSERT(pp
->p_offset
== io_off
);
3614 ASSERT(page_iolock_assert(pp
));
3616 page_hashout(pp
, NULL
);
3618 } while ((pp
= pp
->p_next
) != io_pplist
);
3619 page_list_concat(&io_pplist
, &pplist
);
3623 while (targ_pplist
!= NULL
) {
3624 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[21]);
3626 ASSERT(PAGE_EXCL(pp
));
3627 page_sub(&targ_pplist
, pp
);
3630 ppages
= page_get_pagecnt(pszc
);
3631 ASSERT(IS_P2ALIGNED(page_pptonum(pp
), ppages
));
3634 group_page_unlock(pp
);
3640 ASSERT(PAGE_EXCL(pp
));
3641 ASSERT(pp
->p_szc
== szc
);
3642 page_sub(&repl_pplist
, pp
);
3644 ASSERT(IS_P2ALIGNED(page_pptonum(pp
), ppages
));
3646 /* relink replacement page */
3647 page_list_concat(&tmp_pplist
, &pp
);
3648 while (--ppages
!= 0) {
3649 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[22]);
3651 ASSERT(PAGE_EXCL(pp
));
3652 ASSERT(pp
->p_szc
== szc
);
3653 page_list_concat(&tmp_pplist
, &pp
);
3656 if (tmp_pplist
!= NULL
) {
3657 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[23]);
3658 page_list_concat(&tmp_pplist
, &pplist
);
3659 pplist
= tmp_pplist
;
3662 * at this point all pages are either on done_pplist or
3663 * pplist. They can't be all on done_pplist otherwise
3664 * we'd've been done.
3666 ASSERT(pplist
!= NULL
);
3668 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[24]);
3671 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[25]);
3672 ASSERT(pp
->p_szc
== szc
);
3673 ASSERT(PAGE_EXCL(pp
));
3674 ASSERT(pp
->p_vnode
!= vp
);
3676 } while ((pp
= pp
->p_next
) != pplist
);
3680 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[26]);
3681 ASSERT(pp
->p_szc
== szc
);
3682 ASSERT(PAGE_EXCL(pp
));
3683 ASSERT(pp
->p_vnode
== vp
);
3685 } while ((pp
= pp
->p_next
) != done_pplist
);
3687 while (pplist
!= NULL
) {
3688 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[27]);
3690 page_sub(&pplist
, pp
);
3694 while (done_pplist
!= NULL
) {
3695 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[28]);
3697 page_sub(&done_pplist
, pp
);
3703 ASSERT(pplist
== *ppplist
);
3705 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[29]);
3707 * don't downsize on io error.
3708 * see if vop_getpage succeeds.
3709 * pplist may still be used in this case
3714 VM_STAT_ADD(segvnvmstats
.fill_vp_pages
[30]);
3715 page_free_replacement_page(pplist
);
3716 page_create_putback(pages
);
int segvn_anypgsz = 0;

#define	SEGVN_RESTORE_SOFTLOCK_VP(type, pages) 			\
		if ((type) == F_SOFTLOCK) {			\
			atomic_add_long((ulong_t *)&(svd)->softlockcnt, \
			    -(pages));				\
		}

#define	SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot)	\
		if (IS_VMODSORT((ppa)[0]->p_vnode)) {		\
			if ((rw) == S_WRITE) {			\
				for (i = 0; i < (pages); i++) {	\
					ASSERT((ppa)[i]->p_vnode ==	\
					    (ppa)[0]->p_vnode);		\
					hat_setmod((ppa)[i]);		\
				}					\
			} else if ((rw) != S_OTHER &&		\
			    ((prot) & (vpprot) & PROT_WRITE)) {	\
				for (i = 0; i < (pages); i++) {	\
					ASSERT((ppa)[i]->p_vnode ==	\
					    (ppa)[0]->p_vnode);		\
					if (!hat_ismod((ppa)[i])) {	\
						prot &= ~PROT_WRITE;	\
						break;			\
					}				\
				}					\
			}						\
		}

#ifdef	VM_STATS

#define	SEGVN_VMSTAT_FLTVNPAGES(idx)				\
		VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]);

#else /* VM_STATS */

#define	SEGVN_VMSTAT_FLTVNPAGES(idx)

#endif
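/*
 * Added commentary: SEGVN_UPDATE_MODBITS mirrors the small-page logic in
 * segvn_faultpage() for VMODSORT vnodes.  On a write fault every constituent
 * page is marked modified up front; on a read fault of a writable mapping the
 * write permission is dropped whenever some constituent page is not yet
 * modified, so the first real write faults again and the mod bit is recorded
 * at that point.
 */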
static faultcode_t
segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
    caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
    caddr_t eaddr, int brkcow)
{
3766 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
3767 struct anon_map
*amp
= svd
->amp
;
3768 uchar_t segtype
= svd
->type
;
3769 uint_t szc
= seg
->s_szc
;
3770 size_t pgsz
= page_get_pagesize(szc
);
3771 size_t maxpgsz
= pgsz
;
3772 pgcnt_t pages
= btop(pgsz
);
3773 pgcnt_t maxpages
= pages
;
3774 size_t ppasize
= (pages
+ 1) * sizeof (page_t
*);
3775 caddr_t a
= lpgaddr
;
3776 caddr_t maxlpgeaddr
= lpgeaddr
;
3777 u_offset_t off
= svd
->offset
+ (uintptr_t)(a
- seg
->s_base
);
3778 ulong_t aindx
= svd
->anon_index
+ seg_page(seg
, a
);
3779 struct vpage
*vpage
= (svd
->vpage
!= NULL
) ?
3780 &svd
->vpage
[seg_page(seg
, a
)] : NULL
;
3781 vnode_t
*vp
= svd
->vp
;
3786 faultcode_t err
= 0;
3788 int vop_size_err
= 0;
3789 uint_t protchk
, prot
, vpprot
;
3791 int hat_flag
= (type
== F_SOFTLOCK
) ? HAT_LOAD_LOCK
: HAT_LOAD
;
3792 anon_sync_obj_t an_cookie
;
3794 int alloc_failed
= 0;
3802 int segvn_anypgsz_vnode
= 0; /* for now map vnode with 2 page sizes */
3803 int tron
= (svd
->tr_state
== SEGVN_TR_ON
);
3807 ASSERT(brkcow
== 0 || amp
!= NULL
);
3808 ASSERT(tron
== 0 || amp
!= NULL
);
3809 ASSERT(enable_mbit_wa
== 0); /* no mbit simulations with large pages */
3810 ASSERT(!(svd
->flags
& MAP_NORESERVE
));
3811 ASSERT(type
!= F_SOFTUNLOCK
);
3812 ASSERT(IS_P2ALIGNED(a
, maxpgsz
));
3813 ASSERT(amp
== NULL
|| IS_P2ALIGNED(aindx
, maxpages
));
3814 ASSERT(SEGVN_LOCK_HELD(seg
->s_as
, &svd
->lock
));
3815 ASSERT(seg
->s_szc
< NBBY
* sizeof (int));
3816 ASSERT(type
!= F_SOFTLOCK
|| lpgeaddr
- a
== maxpgsz
);
3817 ASSERT(svd
->tr_state
!= SEGVN_TR_INIT
);
3819 VM_STAT_COND_ADD(type
== F_SOFTLOCK
, segvnvmstats
.fltvnpages
[0]);
3820 VM_STAT_COND_ADD(type
!= F_SOFTLOCK
, segvnvmstats
.fltvnpages
[1]);
3822 if (svd
->flags
& MAP_TEXT
) {
3823 hat_flag
|= HAT_LOAD_TEXT
;
3826 if (svd
->pageprot
) {
3829 protchk
= PROT_READ
;
3832 protchk
= PROT_WRITE
;
3835 protchk
= PROT_EXEC
;
3839 protchk
= PROT_READ
| PROT_WRITE
| PROT_EXEC
;
3844 /* caller has already done segment level protection check. */
3847 if (seg
->s_as
->a_hat
!= hat
) {
3851 if (rw
== S_WRITE
&& segtype
== MAP_PRIVATE
) {
3852 SEGVN_VMSTAT_FLTVNPAGES(2);
3858 ppa
= kmem_alloc(ppasize
, KM_SLEEP
);
3860 VM_STAT_COND_ADD(amp
!= NULL
, segvnvmstats
.fltvnpages
[3]);
3864 for (; a
< lpgeaddr
; a
+= pgsz
, off
+= pgsz
, aindx
+= pages
) {
3866 while (szc
< seg
->s_szc
) {
3869 tszc
= segvn_anypgsz_vnode
? szc
+ 1 :
3871 ppgsz
= page_get_pagesize(tszc
);
3872 if (!IS_P2ALIGNED(a
, ppgsz
) ||
3873 ((alloc_failed
>> tszc
) & 0x1)) {
3876 SEGVN_VMSTAT_FLTVNPAGES(4);
3880 e
= P2ROUNDUP((uintptr_t)eaddr
, pgsz
);
3881 lpgeaddr
= (caddr_t
)e
;
3886 if (IS_P2ALIGNED(a
, maxpgsz
) && amp
!= NULL
) {
3887 ASSERT(IS_P2ALIGNED(aindx
, maxpages
));
3888 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
3889 anon_array_enter(amp
, aindx
, &an_cookie
);
3890 if (anon_get_ptr(amp
->ahp
, aindx
) != NULL
) {
3891 SEGVN_VMSTAT_FLTVNPAGES(5);
3892 ASSERT(anon_pages(amp
->ahp
, aindx
,
3893 maxpages
) == maxpages
);
3894 anon_array_exit(&an_cookie
);
3895 ANON_LOCK_EXIT(&
->a_rwlock
);
3896 err
= segvn_fault_anonpages(hat
, seg
,
3897 a
, a
+ maxpgsz
, type
, rw
,
3899 MIN(a
+ maxpgsz
, eaddr
), brkcow
);
3901 SEGVN_VMSTAT_FLTVNPAGES(6);
3904 if (szc
< seg
->s_szc
) {
3908 lpgeaddr
= maxlpgeaddr
;
3912 ASSERT(anon_pages(amp
->ahp
, aindx
,
3914 SEGVN_VMSTAT_FLTVNPAGES(7);
3915 anon_array_exit(&an_cookie
);
3916 ANON_LOCK_EXIT(&
->a_rwlock
);
3919 ASSERT(!brkcow
|| IS_P2ALIGNED(a
, maxpgsz
));
3920 ASSERT(!tron
|| IS_P2ALIGNED(a
, maxpgsz
));
3922 if (svd
->pageprot
!= 0 && IS_P2ALIGNED(a
, maxpgsz
)) {
3923 ASSERT(vpage
!= NULL
);
3924 prot
= VPP_PROT(vpage
);
3925 ASSERT(sameprot(seg
, a
, maxpgsz
));
3926 if ((prot
& protchk
) == 0) {
3927 SEGVN_VMSTAT_FLTVNPAGES(8);
3932 if (type
== F_SOFTLOCK
) {
3933 atomic_add_long((ulong_t
*)&svd
->softlockcnt
,
3940 if (!brkcow
&& !tron
&& szc
&&
3941 !page_exists_physcontig(vp
, off
, szc
,
3942 segtype
== MAP_PRIVATE
? ppa
: NULL
)) {
3943 SEGVN_VMSTAT_FLTVNPAGES(9);
3944 if (page_alloc_pages(vp
, seg
, a
, &pplist
, NULL
,
3945 szc
, 0, 0) && type
!= F_SOFTLOCK
) {
3946 SEGVN_VMSTAT_FLTVNPAGES(10);
3949 alloc_failed
|= (1 << szc
);
3952 if (pplist
!= NULL
&&
3953 vp
->v_mpssdata
== SEGVN_PAGEIO
) {
3955 SEGVN_VMSTAT_FLTVNPAGES(11);
3956 physcontig
= segvn_fill_vp_pages(svd
,
3957 vp
, off
, szc
, ppa
, &pplist
,
3959 ASSERT(!physcontig
|| pplist
== NULL
);
3960 if (!physcontig
&& downsize
&&
3961 type
!= F_SOFTLOCK
) {
3962 ASSERT(pplist
== NULL
);
3963 SEGVN_VMSTAT_FLTVNPAGES(12);
3967 ASSERT(!physcontig
||
3968 segtype
== MAP_PRIVATE
||
3970 if (physcontig
&& ppa
[0] == NULL
) {
3974 } else if (!brkcow
&& !tron
&& szc
&& ppa
[0] != NULL
) {
3975 SEGVN_VMSTAT_FLTVNPAGES(13);
3976 ASSERT(segtype
== MAP_PRIVATE
);
3981 SEGVN_VMSTAT_FLTVNPAGES(14);
3983 ierr
= VOP_GETPAGE(vp
, (offset_t
)off
, pgsz
,
3984 &vpprot
, ppa
, pgsz
, seg
, a
, arw
,
3988 for (i
= 0; i
< pages
; i
++) {
3989 ASSERT(PAGE_LOCKED(ppa
[i
]));
3990 ASSERT(!PP_ISFREE(ppa
[i
]));
3991 ASSERT(ppa
[i
]->p_vnode
== vp
);
3992 ASSERT(ppa
[i
]->p_offset
==
3993 off
+ (i
<< PAGESHIFT
));
3997 if (segtype
== MAP_PRIVATE
) {
3998 SEGVN_VMSTAT_FLTVNPAGES(15);
3999 vpprot
&= ~PROT_WRITE
;
4002 ASSERT(segtype
== MAP_PRIVATE
);
4003 SEGVN_VMSTAT_FLTVNPAGES(16);
4004 vpprot
= PROT_ALL
& ~PROT_WRITE
;
4009 SEGVN_VMSTAT_FLTVNPAGES(17);
4010 if (pplist
!= NULL
) {
4011 SEGVN_VMSTAT_FLTVNPAGES(18);
4012 page_free_replacement_page(pplist
);
4013 page_create_putback(pages
);
4015 SEGVN_RESTORE_SOFTLOCK_VP(type
, pages
);
4016 if (a
+ pgsz
<= eaddr
) {
4017 SEGVN_VMSTAT_FLTVNPAGES(19);
4018 err
= FC_MAKE_ERR(ierr
);
4021 va
.va_mask
= AT_SIZE
;
4022 if (VOP_GETATTR(vp
, &va
, 0, svd
->cred
, NULL
)) {
4023 SEGVN_VMSTAT_FLTVNPAGES(20);
4024 err
= FC_MAKE_ERR(EIO
);
4027 if (btopr(va
.va_size
) >= btopr(off
+ pgsz
)) {
4028 SEGVN_VMSTAT_FLTVNPAGES(21);
4029 err
= FC_MAKE_ERR(ierr
);
4032 if (btopr(va
.va_size
) <
4033 btopr(off
+ (eaddr
- a
))) {
4034 SEGVN_VMSTAT_FLTVNPAGES(22);
4035 err
= FC_MAKE_ERR(ierr
);
4038 if (brkcow
|| tron
|| type
== F_SOFTLOCK
) {
4039 /* can't reduce map area */
4040 SEGVN_VMSTAT_FLTVNPAGES(23);
4044 SEGVN_VMSTAT_FLTVNPAGES(24);
4052 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
4053 anon_array_enter(amp
, aindx
, &an_cookie
);
4056 anon_get_ptr(amp
->ahp
, aindx
) != NULL
) {
4057 ulong_t taindx
= P2ALIGN(aindx
, maxpages
);
4059 SEGVN_VMSTAT_FLTVNPAGES(25);
4060 ASSERT(anon_pages(amp
->ahp
, taindx
,
4061 maxpages
) == maxpages
);
4062 for (i
= 0; i
< pages
; i
++) {
4063 page_unlock(ppa
[i
]);
4065 anon_array_exit(&an_cookie
);
4066 ANON_LOCK_EXIT(&
->a_rwlock
);
4067 if (pplist
!= NULL
) {
4068 page_free_replacement_page(pplist
);
4069 page_create_putback(pages
);
4071 SEGVN_RESTORE_SOFTLOCK_VP(type
, pages
);
4072 if (szc
< seg
->s_szc
) {
4073 SEGVN_VMSTAT_FLTVNPAGES(26);
4075 * For private segments SOFTLOCK
4076 * either always breaks cow (any rw
4077 * type except S_READ_NOCOW) or
4078 * address space is locked as writer
4079 * (S_READ_NOCOW case) and anon slots
4080 * can't show up on second check.
4081 * Therefore if we are here for
4082 * SOFTLOCK case it must be a cow
4083 * break but cow break never reduces
4084 * szc. text replication (tron) in
4085 * this case works as cow break.
4086 * Thus the assert below.
4088 ASSERT(!brkcow
&& !tron
&&
4089 type
!= F_SOFTLOCK
);
4094 ASSERT(IS_P2ALIGNED(a
, maxpgsz
));
4099 ulong_t taindx
= P2ALIGN(aindx
, maxpages
);
4100 ASSERT(!anon_pages(amp
->ahp
, taindx
, maxpages
));
4104 if (brkcow
|| tron
) {
4105 ASSERT(amp
!= NULL
);
4106 ASSERT(pplist
== NULL
);
4107 ASSERT(szc
== seg
->s_szc
);
4108 ASSERT(IS_P2ALIGNED(a
, maxpgsz
));
4109 ASSERT(IS_P2ALIGNED(aindx
, maxpages
));
4110 SEGVN_VMSTAT_FLTVNPAGES(27);
4111 ierr
= anon_map_privatepages(amp
, aindx
, szc
,
4112 seg
, a
, prot
, ppa
, vpage
, segvn_anypgsz
,
4113 tron
? PG_LOCAL
: 0, svd
->cred
);
4115 SEGVN_VMSTAT_FLTVNPAGES(28);
4116 anon_array_exit(&an_cookie
);
4117 ANON_LOCK_EXIT(&
->a_rwlock
);
4118 SEGVN_RESTORE_SOFTLOCK_VP(type
, pages
);
4119 err
= FC_MAKE_ERR(ierr
);
4123 ASSERT(!IS_VMODSORT(ppa
[0]->p_vnode
));
4125 * p_szc can't be changed for locked
4128 ASSERT(svd
->rcookie
==
4129 HAT_INVALID_REGION_COOKIE
);
4130 hat_memload_array(hat
, a
, pgsz
, ppa
, prot
,
4133 if (!(hat_flag
& HAT_LOAD_LOCK
)) {
4134 SEGVN_VMSTAT_FLTVNPAGES(29);
4135 for (i
= 0; i
< pages
; i
++) {
4136 page_unlock(ppa
[i
]);
4139 anon_array_exit(&an_cookie
);
4140 ANON_LOCK_EXIT(&
->a_rwlock
);
4144 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
||
4145 (!svd
->pageprot
&& svd
->prot
== (prot
& vpprot
)));
4147 pfn
= page_pptonum(ppa
[0]);
			/*
			 * hat_page_demote() needs an SE_EXCL lock on one of
			 * constituent page_t's and it decreases root's p_szc
			 * last. This means if root's p_szc is equal szc and
			 * all its constituent pages are locked
			 * hat_page_demote() that could have changed p_szc to
			 * szc is already done and no new hat_page_demote()
			 * can start for this large page.
			 *
			 * we need to make sure same mapping size is used for
			 * the same address range if there's a possibility the
			 * address is already mapped because hat layer panics
			 * when translation is loaded for the range already
			 * mapped with a different page size. We achieve it
			 * by always using largest page size possible subject
			 * to the constraints of page size, segment page size
			 * and page alignment. Since mappings are invalidated
			 * when those constraints change and make it
			 * impossible to use previously used mapping size no
			 * mapping size conflicts should happen.
			 */
4173 if ((pszc
= ppa
[0]->p_szc
) == szc
&&
4174 IS_P2ALIGNED(pfn
, pages
)) {
4176 SEGVN_VMSTAT_FLTVNPAGES(30);
4178 for (i
= 0; i
< pages
; i
++) {
4179 ASSERT(PAGE_LOCKED(ppa
[i
]));
4180 ASSERT(!PP_ISFREE(ppa
[i
]));
4181 ASSERT(page_pptonum(ppa
[i
]) ==
4183 ASSERT(ppa
[i
]->p_szc
== szc
);
4184 ASSERT(ppa
[i
]->p_vnode
== vp
);
4185 ASSERT(ppa
[i
]->p_offset
==
4186 off
+ (i
<< PAGESHIFT
));
4190 * All pages are of szc we need and they are
4191 * all locked so they can't change szc. load
4194 * if page got promoted since last check
4195 * we don't need pplist.
4197 if (pplist
!= NULL
) {
4198 page_free_replacement_page(pplist
);
4199 page_create_putback(pages
);
4201 if (PP_ISMIGRATE(ppa
[0])) {
4202 page_migrate(seg
, a
, ppa
, pages
);
4204 SEGVN_UPDATE_MODBITS(ppa
, pages
, rw
,
4207 hat_memload_array_region(hat
, a
, pgsz
,
4208 ppa
, prot
& vpprot
, hat_flag
,
4212 * avoid large xhat mappings to FS
4213 * pages so that hat_page_demote()
4214 * doesn't need to check for xhat
4216 * Don't use regions with xhats.
4218 for (i
= 0; i
< pages
; i
++) {
4220 a
+ (i
<< PAGESHIFT
),
4221 ppa
[i
], prot
& vpprot
,
4226 if (!(hat_flag
& HAT_LOAD_LOCK
)) {
4227 for (i
= 0; i
< pages
; i
++) {
4228 page_unlock(ppa
[i
]);
4232 anon_array_exit(&an_cookie
);
4233 ANON_LOCK_EXIT(&
->a_rwlock
);
4239 * See if upsize is possible.
4241 if (pszc
> szc
&& szc
< seg
->s_szc
&&
4242 (segvn_anypgsz_vnode
|| pszc
>= seg
->s_szc
)) {
4244 uint_t pszc1
= MIN(pszc
, seg
->s_szc
);
4245 ppgsz
= page_get_pagesize(pszc1
);
4246 ppages
= btop(ppgsz
);
4247 aphase
= btop(P2PHASE((uintptr_t)a
, ppgsz
));
4249 ASSERT(type
!= F_SOFTLOCK
);
4251 SEGVN_VMSTAT_FLTVNPAGES(31);
4252 if (aphase
!= P2PHASE(pfn
, ppages
)) {
4253 segvn_faultvnmpss_align_err4
++;
4255 SEGVN_VMSTAT_FLTVNPAGES(32);
4256 if (pplist
!= NULL
) {
4257 page_t
*pl
= pplist
;
4258 page_free_replacement_page(pl
);
4259 page_create_putback(pages
);
4261 for (i
= 0; i
< pages
; i
++) {
4262 page_unlock(ppa
[i
]);
4265 anon_array_exit(&an_cookie
);
4266 ANON_LOCK_EXIT(&
->a_rwlock
);
4275 * check if we should use smallest mapping size.
4278 if (szc
== 0 || xhat
||
4280 !IS_P2ALIGNED(pfn
, pages
)) ||
4282 !segvn_full_szcpages(ppa
, szc
, &upgrdfail
,
4285 if (upgrdfail
&& type
!= F_SOFTLOCK
) {
4287 * segvn_full_szcpages failed to lock
4288 * all pages EXCL. Size down.
4292 SEGVN_VMSTAT_FLTVNPAGES(33);
4294 if (pplist
!= NULL
) {
4295 page_t
*pl
= pplist
;
4296 page_free_replacement_page(pl
);
4297 page_create_putback(pages
);
4300 for (i
= 0; i
< pages
; i
++) {
4301 page_unlock(ppa
[i
]);
4304 anon_array_exit(&an_cookie
);
4305 ANON_LOCK_EXIT(&
->a_rwlock
);
4310 if (szc
!= 0 && !xhat
&& !upgrdfail
) {
4311 segvn_faultvnmpss_align_err5
++;
4313 SEGVN_VMSTAT_FLTVNPAGES(34);
4314 if (pplist
!= NULL
) {
4315 page_free_replacement_page(pplist
);
4316 page_create_putback(pages
);
4318 SEGVN_UPDATE_MODBITS(ppa
, pages
, rw
,
4320 if (upgrdfail
&& segvn_anypgsz_vnode
) {
4322 hat_memload_array_region(hat
, a
, pgsz
,
4323 ppa
, prot
& vpprot
, hat_flag
,
4326 for (i
= 0; i
< pages
; i
++) {
4327 hat_memload_region(hat
,
4328 a
+ (i
<< PAGESHIFT
),
4329 ppa
[i
], prot
& vpprot
,
4330 hat_flag
, svd
->rcookie
);
4333 if (!(hat_flag
& HAT_LOAD_LOCK
)) {
4334 for (i
= 0; i
< pages
; i
++) {
4335 page_unlock(ppa
[i
]);
4339 anon_array_exit(&an_cookie
);
4340 ANON_LOCK_EXIT(&
->a_rwlock
);
4347 * segvn_full_szcpages() upgraded pages szc.
4349 ASSERT(pszc
== ppa
[0]->p_szc
);
4350 ASSERT(IS_P2ALIGNED(pfn
, pages
));
4356 SEGVN_VMSTAT_FLTVNPAGES(35);
4358 * p_szc of ppa[0] can change since we haven't
4359 * locked all constituent pages. Call
4360 * page_lock_szc() to prevent szc changes.
4361 * This should be a rare case that happens when
4362 * multiple segments use a different page size
4363 * to map the same file offsets.
4365 szcmtx
= page_szc_lock(ppa
[0]);
4366 pszc
= ppa
[0]->p_szc
;
4367 ASSERT(szcmtx
!= NULL
|| pszc
== 0);
4368 ASSERT(ppa
[0]->p_szc
<= pszc
);
4370 SEGVN_VMSTAT_FLTVNPAGES(36);
4371 if (szcmtx
!= NULL
) {
4376 if (pplist
!= NULL
) {
4378 * page got promoted since last check.
4379 * we don't need preaalocated large
4382 SEGVN_VMSTAT_FLTVNPAGES(37);
4383 page_free_replacement_page(pplist
);
4384 page_create_putback(pages
);
4386 SEGVN_UPDATE_MODBITS(ppa
, pages
, rw
,
4388 hat_memload_array_region(hat
, a
, pgsz
, ppa
,
4389 prot
& vpprot
, hat_flag
, svd
->rcookie
);
4391 if (!(hat_flag
& HAT_LOAD_LOCK
)) {
4392 for (i
= 0; i
< pages
; i
++) {
4393 page_unlock(ppa
[i
]);
4397 anon_array_exit(&an_cookie
);
4398 ANON_LOCK_EXIT(&
->a_rwlock
);
4404 * if page got demoted since last check
4405 * we could have not allocated larger page.
4408 if (pplist
== NULL
&&
4409 page_alloc_pages(vp
, seg
, a
, &pplist
, NULL
,
4410 szc
, 0, 0) && type
!= F_SOFTLOCK
) {
4411 SEGVN_VMSTAT_FLTVNPAGES(38);
4412 for (i
= 0; i
< pages
; i
++) {
4413 page_unlock(ppa
[i
]);
4416 anon_array_exit(&an_cookie
);
4417 ANON_LOCK_EXIT(&
->a_rwlock
);
4420 alloc_failed
|= (1 << szc
);
4424 SEGVN_VMSTAT_FLTVNPAGES(39);
4426 if (pplist
!= NULL
) {
4427 segvn_relocate_pages(ppa
, pplist
);
4430 ASSERT(type
== F_SOFTLOCK
);
4431 SEGVN_VMSTAT_FLTVNPAGES(40);
4435 SEGVN_UPDATE_MODBITS(ppa
, pages
, rw
, prot
, vpprot
);
4437 if (pplist
== NULL
&& segvn_anypgsz_vnode
== 0) {
4438 ASSERT(type
== F_SOFTLOCK
);
4439 for (i
= 0; i
< pages
; i
++) {
4440 ASSERT(ppa
[i
]->p_szc
< szc
);
4441 hat_memload_region(hat
,
4442 a
+ (i
<< PAGESHIFT
),
4443 ppa
[i
], prot
& vpprot
, hat_flag
,
4447 ASSERT(pplist
!= NULL
|| type
== F_SOFTLOCK
);
4448 hat_memload_array_region(hat
, a
, pgsz
, ppa
,
4449 prot
& vpprot
, hat_flag
, svd
->rcookie
);
4451 if (!(hat_flag
& HAT_LOAD_LOCK
)) {
4452 for (i
= 0; i
< pages
; i
++) {
4453 ASSERT(PAGE_SHARED(ppa
[i
]));
4454 page_unlock(ppa
[i
]);
4458 anon_array_exit(&an_cookie
);
4459 ANON_LOCK_EXIT(&
->a_rwlock
);
4463 if (vpage
!= NULL
) {
4470 ASSERT(a
< lpgeaddr
);
4472 ASSERT(!brkcow
&& !tron
&& type
!= F_SOFTLOCK
);
		/*
		 * ierr == -1 means we failed to map with a large page.
		 * (either due to allocation/relocation failures or
		 * misalignment with other mappings to this file.)
		 *
		 * ierr == -2 means some other thread allocated a large page
		 * after we gave up to map with a large page. retry with
		 * larger mapping.
		 */
4483 ASSERT(ierr
== -1 || ierr
== -2);
4484 ASSERT(ierr
== -2 || szc
!= 0);
4485 ASSERT(ierr
== -1 || szc
< seg
->s_szc
);
4487 SEGVN_VMSTAT_FLTVNPAGES(41);
4488 ASSERT(pszc
> szc
&& pszc
<= seg
->s_szc
);
4490 } else if (segvn_anypgsz_vnode
) {
4491 SEGVN_VMSTAT_FLTVNPAGES(42);
4494 SEGVN_VMSTAT_FLTVNPAGES(43);
4497 * other process created pszc large page.
4498 * but we still have to drop to 0 szc.
4503 pgsz
= page_get_pagesize(szc
);
4507 * Size up case. Note lpgaddr may only be needed for
4508 * softlock case so we don't adjust it here.
4510 a
= (caddr_t
)P2ALIGN((uintptr_t)a
, pgsz
);
4511 ASSERT(a
>= lpgaddr
);
4512 lpgeaddr
= (caddr_t
)P2ROUNDUP((uintptr_t)eaddr
, pgsz
);
4513 off
= svd
->offset
+ (uintptr_t)(a
- seg
->s_base
);
4514 aindx
= svd
->anon_index
+ seg_page(seg
, a
);
4515 vpage
= (svd
->vpage
!= NULL
) ?
4516 &svd
->vpage
[seg_page(seg
, a
)] : NULL
;
4519 * Size down case. Note lpgaddr may only be needed for
4520 * softlock case so we don't adjust it here.
4522 ASSERT(IS_P2ALIGNED(a
, pgsz
));
4523 ASSERT(IS_P2ALIGNED(lpgeaddr
, pgsz
));
4524 lpgeaddr
= (caddr_t
)P2ROUNDUP((uintptr_t)eaddr
, pgsz
);
4525 ASSERT(a
< lpgeaddr
);
4527 SEGVN_VMSTAT_FLTVNPAGES(44);
4529 * The beginning of the large page region can
4530 * be pulled to the right to make a smaller
4531 * region. We haven't yet faulted a single
4534 a
= (caddr_t
)P2ALIGN((uintptr_t)addr
, pgsz
);
4535 ASSERT(a
>= lpgaddr
);
4537 (uintptr_t)(a
- seg
->s_base
);
4538 aindx
= svd
->anon_index
+ seg_page(seg
, a
);
4539 vpage
= (svd
->vpage
!= NULL
) ?
4540 &svd
->vpage
[seg_page(seg
, a
)] : NULL
;
4545 kmem_free(ppa
, ppasize
);
4546 if (!err
&& !vop_size_err
) {
4547 SEGVN_VMSTAT_FLTVNPAGES(45);
4550 if (type
== F_SOFTLOCK
&& a
> lpgaddr
) {
4551 SEGVN_VMSTAT_FLTVNPAGES(46);
4552 segvn_softunlock(seg
, lpgaddr
, a
- lpgaddr
, S_OTHER
);
4554 if (!vop_size_err
) {
4555 SEGVN_VMSTAT_FLTVNPAGES(47);
4558 ASSERT(brkcow
|| tron
|| type
== F_SOFTLOCK
);
	/*
	 * Large page end is mapped beyond the end of file and it's a cow
	 * fault (can be a text replication induced cow) or softlock so we
	 * can't reduce the map area.  For now just demote the segment. This
	 * should really only happen if the end of the file changed after the
	 * mapping was established since when large page segments are created
	 * we make sure they don't extend beyond the end of the file.
	 */
4567 SEGVN_VMSTAT_FLTVNPAGES(48);
4569 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
4570 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_WRITER
);
4572 if (seg
->s_szc
!= 0) {
4573 segvn_fltvnpages_clrszc_cnt
++;
4574 ASSERT(svd
->softlockcnt
== 0);
4575 err
= segvn_clrszc(seg
);
4577 segvn_fltvnpages_clrszc_err
++;
4580 ASSERT(err
|| seg
->s_szc
== 0);
4581 SEGVN_LOCK_DOWNGRADE(seg
->s_as
, &svd
->lock
);
4582 /* segvn_fault will do its job as if szc had been zero to begin with */
4583 return (err
== 0 ? IE_RETRY
: FC_MAKE_ERR(err
));
/*
 * This routine will attempt to fault in one large page.
 * it will use smaller pages if that fails.
 * It should only be called for pure anonymous segments.
 */
static faultcode_t
segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
    caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
    caddr_t eaddr, int brkcow)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon_map *amp = svd->amp;
	uchar_t segtype = svd->type;
	uint_t szc = seg->s_szc;
	size_t pgsz = page_get_pagesize(szc);
	size_t maxpgsz = pgsz;
	pgcnt_t pages = btop(pgsz);
	uint_t ppaszc = szc;
	caddr_t a = lpgaddr;
	ulong_t aindx = svd->anon_index + seg_page(seg, a);
	struct vpage *vpage = (svd->vpage != NULL) ?
	    &svd->vpage[seg_page(seg, a)] : NULL;
	uint_t protchk, prot, vpprot;
	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
	anon_sync_obj_t cookie;
	int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0;
	ASSERT(amp != NULL);
	ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
	ASSERT(!(svd->flags & MAP_NORESERVE));
	ASSERT(type != F_SOFTUNLOCK);
	ASSERT(IS_P2ALIGNED(a, maxpgsz));
	ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF);
	ASSERT(svd->tr_state != SEGVN_TR_INIT);

	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));

	VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]);
	VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]);
	if (svd->flags & MAP_TEXT) {
		hat_flag |= HAT_LOAD_TEXT;
	}

	if (svd->pageprot) {
		switch (rw) {
		case S_READ:
			protchk = PROT_READ;
			break;
		case S_WRITE:
			protchk = PROT_WRITE;
			break;
		case S_EXEC:
			protchk = PROT_EXEC;
			break;
		case S_OTHER:
		default:
			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
			break;
		}
		VM_STAT_ADD(segvnvmstats.fltanpages[2]);
	} else {
		prot = svd->prot;
		/* caller has already done segment level protection check. */
	}

	ppa = kmem_cache_alloc(segvn_szc_cache[ppaszc], KM_SLEEP);
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
4663 for (; a
< lpgeaddr
; a
+= pgsz
, aindx
+= pages
) {
4664 if (svd
->pageprot
!= 0 && IS_P2ALIGNED(a
, maxpgsz
)) {
4665 VM_STAT_ADD(segvnvmstats
.fltanpages
[3]);
4666 ASSERT(vpage
!= NULL
);
4667 prot
= VPP_PROT(vpage
);
4668 ASSERT(sameprot(seg
, a
, maxpgsz
));
4669 if ((prot
& protchk
) == 0) {
4674 if (adjszc_chk
&& IS_P2ALIGNED(a
, maxpgsz
) &&
4676 ASSERT(a
> lpgaddr
);
4680 ASSERT(IS_P2ALIGNED(aindx
, pages
));
4681 lpgeaddr
= (caddr_t
)P2ROUNDUP((uintptr_t)eaddr
,
4684 if (type
== F_SOFTLOCK
) {
4685 atomic_add_long((ulong_t
*)&svd
->softlockcnt
,
4688 anon_array_enter(amp
, aindx
, &cookie
);
4689 ppa_szc
= (uint_t
)-1;
4690 ierr
= anon_map_getpages(amp
, aindx
, szc
, seg
, a
,
4691 prot
, &vpprot
, ppa
, &ppa_szc
, vpage
, rw
, brkcow
,
4692 segvn_anypgsz
, pgflags
, svd
->cred
);
4694 anon_array_exit(&cookie
);
4695 VM_STAT_ADD(segvnvmstats
.fltanpages
[4]);
4696 if (type
== F_SOFTLOCK
) {
4698 (ulong_t
*)&svd
->softlockcnt
,
4702 VM_STAT_ADD(segvnvmstats
.fltanpages
[6]);
4703 err
= FC_MAKE_ERR(ierr
);
4709 ASSERT(!IS_VMODSORT(ppa
[0]->p_vnode
));
4711 ASSERT(segtype
== MAP_SHARED
||
4712 ppa
[0]->p_szc
<= szc
);
4713 ASSERT(segtype
== MAP_PRIVATE
||
4714 ppa
[0]->p_szc
>= szc
);
4717 * Handle pages that have been marked for migration
4719 if (lgrp_optimizations())
4720 page_migrate(seg
, a
, ppa
, pages
);
4722 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
4724 if (segtype
== MAP_SHARED
) {
4725 vpprot
|= PROT_WRITE
;
4728 hat_memload_array(hat
, a
, pgsz
, ppa
,
4729 prot
& vpprot
, hat_flag
);
4731 if (hat_flag
& HAT_LOAD_LOCK
) {
4732 VM_STAT_ADD(segvnvmstats
.fltanpages
[7]);
4734 VM_STAT_ADD(segvnvmstats
.fltanpages
[8]);
4735 for (i
= 0; i
< pages
; i
++)
4736 page_unlock(ppa
[i
]);
4741 anon_array_exit(&cookie
);
4746 ASSERT(a
< lpgeaddr
);
		/*
		 * ierr == -1 means we failed to allocate a large page.
		 * so do a size down operation.
		 *
		 * ierr == -2 means some other process that privately shares
		 * pages with this process has allocated a larger page and we
		 * need to retry with larger pages. So do a size up
		 * operation. This relies on the fact that large pages are
		 * never partially shared i.e. if we share any constituent
		 * page of a large page with another process we must share the
		 * entire large page. Note this cannot happen for SOFTLOCK
		 * case, unless current address (a) is at the beginning of the
		 * next page size boundary because the other process couldn't
		 * have relocated locked pages.
		 */
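		/*
		 * Concretely (summarizing the code below): with segvn_anypgsz
		 * set, ierr == -1 retries one size down (szc - 1) and
		 * ierr == -2 one size up (szc + 1).  Without segvn_anypgsz,
		 * and when anon_map_getpages() reported an existing page size
		 * in ppa_szc, we re-size directly to that size to avoid
		 * looping forever; otherwise we fall back to 0 or seg->s_szc.
		 */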
		ASSERT(ierr == -1 || ierr == -2);

		if (segvn_anypgsz) {
			ASSERT(ierr == -2 || szc != 0);
			ASSERT(ierr == -1 || szc < seg->s_szc);
			szc = (ierr == -1) ? szc - 1 : szc + 1;
		} else {
			/*
			 * For non COW faults and segvn_anypgsz == 0
			 * we need to be careful not to loop forever
			 * if existing page is found with szc other
			 * than 0 or seg->s_szc. This could be due
			 * to page relocations on behalf of DR or
			 * more likely large page creation. For this
			 * case simply re-size to existing page's szc
			 * if returned by anon_map_getpages().
			 */
			if (ppa_szc == (uint_t)-1) {
				szc = (ierr == -1) ? 0 : seg->s_szc;
			} else {
				ASSERT(ppa_szc <= seg->s_szc);
				ASSERT(ierr == -2 || ppa_szc < szc);
				ASSERT(ierr == -1 || ppa_szc > szc);
				szc = ppa_szc;
			}
		}

		pgsz = page_get_pagesize(szc);
		ASSERT(type != F_SOFTLOCK || ierr == -1 ||
		    (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
		if (type == F_SOFTLOCK) {
			/*
			 * For softlocks we cannot reduce the fault area
			 * (calculated based on the largest page size for this
			 * segment) for size down and a is already next
			 * page size aligned as asserted above for size
			 * ups. Therefore just continue in case of softlock.
			 */
			VM_STAT_ADD(segvnvmstats.fltanpages[9]);
			continue; /* keep lint happy */
		} else if (ierr == -2) {
			/*
			 * Size up case. Note lpgaddr may only be needed for
			 * softlock case so we don't adjust it here.
			 */
			VM_STAT_ADD(segvnvmstats.fltanpages[10]);
			a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
			ASSERT(a >= lpgaddr);
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
			aindx = svd->anon_index + seg_page(seg, a);
			vpage = (svd->vpage != NULL) ?
			    &svd->vpage[seg_page(seg, a)] : NULL;
		} else {
			/*
			 * Size down case. Note lpgaddr may only be needed for
			 * softlock case so we don't adjust it here.
			 */
			VM_STAT_ADD(segvnvmstats.fltanpages[11]);
			ASSERT(IS_P2ALIGNED(a, pgsz));
			ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
			ASSERT(a < lpgeaddr);
			if (a < addr) {
				/*
				 * The beginning of the large page region can
				 * be pulled to the right to make a smaller
				 * region. We haven't yet faulted a single
				 * page.
				 */
				VM_STAT_ADD(segvnvmstats.fltanpages[12]);
				a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
				ASSERT(a >= lpgaddr);
				aindx = svd->anon_index + seg_page(seg, a);
				vpage = (svd->vpage != NULL) ?
				    &svd->vpage[seg_page(seg, a)] : NULL;
			}
		}
	}
	VM_STAT_ADD(segvnvmstats.fltanpages[13]);
	ANON_LOCK_EXIT(&amp->a_rwlock);
	kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
	return (0);
error:
	VM_STAT_ADD(segvnvmstats.fltanpages[14]);
	ANON_LOCK_EXIT(&amp->a_rwlock);
	kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
	if (type == F_SOFTLOCK && a > lpgaddr) {
		VM_STAT_ADD(segvnvmstats.fltanpages[15]);
		segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
	}
	return (err);
}
int fltadvice = 1;	/* set to free behind pages for sequential access */

/*
 * This routine is called via a machine specific fault handling routine.
 * It is also called by software routines wishing to lock or unlock
 * a range of addresses.
 *
 * Here is the basic algorithm:
 *	If unlocking
 *		Call segvn_softunlock
 *		Return
 *	endif
 *	Checking and set up work
 *	If we will need some non-anonymous pages
 *		Call VOP_GETPAGE over the range of non-anonymous pages
 *	endif
 *	Loop over all addresses requested
 *		Call segvn_faultpage passing in page list
 *		    to load up translations and handle anonymous pages
 *	endloop
 *	Load up translation to any additional pages in page list not
 *	    already handled that fit into this segment
 */
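/*
 * In outline (a reader's summary of the code below, not an exact contract):
 *
 *	if (type == F_SOFTUNLOCK)
 *		segvn_softunlock() over the large-page aligned region; return
 *	resolve text replication state (SEGVN_TR_INIT/ON/OFF)
 *	check the access type against the segment protections (FC_PROT)
 *	if (seg->s_szc != 0)
 *		hand off to segvn_fault_anonpages()/segvn_fault_vnodepages(),
 *		retrying here if they return IE_RETRY after demoting the seg
 *	optional MADV_SEQUENTIAL free-behind of pages already used
 *	if a vnode backs part of the range, one VOP_GETPAGE() call fills plp
 *	for each PAGESIZE chunk, segvn_faultpage() loads the translation
 *	load translations for any extra pages VOP_GETPAGE() returned
 */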
static faultcode_t
segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
    enum fault_type type, enum seg_rw rw)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	page_t **plp, **ppp, *pp;
	struct vpage *vpage;
	uint_t vpprot, prot;
	page_t *pl[PVN_GETPAGE_NUM + 1];
	size_t plsz, pl_alloc_sz;
	struct anon_map *amp;
	caddr_t lpgaddr, lpgeaddr;
	anon_sync_obj_t cookie;
	int brkcow = BREAK_COW_SHARE(rw, type, svd->type);

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE);
4906 * First handle the easy stuff
4908 if (type
== F_SOFTUNLOCK
) {
4909 if (rw
== S_READ_NOCOW
) {
4911 ASSERT(AS_WRITE_HELD(seg
->s_as
, &seg
->s_as
->a_lock
));
4913 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_READER
);
4914 pgsz
= (seg
->s_szc
== 0) ? PAGESIZE
:
4915 page_get_pagesize(seg
->s_szc
);
4916 VM_STAT_COND_ADD(pgsz
> PAGESIZE
, segvnvmstats
.fltanpages
[16]);
4917 CALC_LPG_REGION(pgsz
, seg
, addr
, len
, lpgaddr
, lpgeaddr
);
4918 segvn_softunlock(seg
, lpgaddr
, lpgeaddr
- lpgaddr
, rw
);
4919 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
4923 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
||
4924 !HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
));
4926 if (svd
->tr_state
== SEGVN_TR_INIT
) {
4927 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_WRITER
);
4928 if (svd
->tr_state
== SEGVN_TR_INIT
) {
4929 ASSERT(svd
->vp
!= NULL
&& svd
->amp
== NULL
);
4930 ASSERT(svd
->flags
& MAP_TEXT
);
4931 ASSERT(svd
->type
== MAP_PRIVATE
);
4932 segvn_textrepl(seg
);
4933 ASSERT(svd
->tr_state
!= SEGVN_TR_INIT
);
4934 ASSERT(svd
->tr_state
!= SEGVN_TR_ON
||
4937 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
4939 } else if (svd
->tr_state
!= SEGVN_TR_OFF
) {
4940 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_WRITER
);
4942 if (rw
== S_WRITE
&& svd
->tr_state
!= SEGVN_TR_OFF
) {
4943 ASSERT(!svd
->pageprot
&& !(svd
->prot
& PROT_WRITE
));
4944 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
4948 if (svd
->tr_state
== SEGVN_TR_ON
) {
4949 ASSERT(svd
->vp
!= NULL
&& svd
->amp
!= NULL
);
4950 segvn_textunrepl(seg
, 0);
4951 ASSERT(svd
->amp
== NULL
&&
4952 svd
->tr_state
== SEGVN_TR_OFF
);
4953 } else if (svd
->tr_state
!= SEGVN_TR_OFF
) {
4954 svd
->tr_state
= SEGVN_TR_OFF
;
4956 ASSERT(svd
->amp
== NULL
&& svd
->tr_state
== SEGVN_TR_OFF
);
4957 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
4961 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_READER
);
4964 * If we have the same protections for the entire segment,
4965 * insure that the access being attempted is legitimate.
4968 if (svd
->pageprot
== 0) {
4974 protchk
= PROT_READ
;
4977 protchk
= PROT_WRITE
;
4980 protchk
= PROT_EXEC
;
4984 protchk
= PROT_READ
| PROT_WRITE
| PROT_EXEC
;
4988 if ((svd
->prot
& protchk
) == 0) {
4989 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
4990 return (FC_PROT
); /* illegal access type */
4994 if (brkcow
&& HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
)) {
4995 /* this must be SOFTLOCK S_READ fault */
4996 ASSERT(svd
->amp
== NULL
);
4997 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
4998 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
4999 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_WRITER
);
5000 if (HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
)) {
5002 * this must be the first ever non S_READ_NOCOW
5003 * softlock for this segment.
5005 ASSERT(svd
->softlockcnt
== 0);
5006 hat_leave_region(seg
->s_as
->a_hat
, svd
->rcookie
,
5008 svd
->rcookie
= HAT_INVALID_REGION_COOKIE
;
5010 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
	/*
	 * We can't allow the long term use of softlocks for vmpss segments,
	 * because in some file truncation cases we should be able to demote
	 * the segment, which requires that there are no softlocks. The
	 * only case where it's ok to allow a SOFTLOCK fault against a vmpss
	 * segment is S_READ_NOCOW, where the caller holds the address space
	 * locked as writer and calls softunlock before dropping the as lock.
	 * S_READ_NOCOW is used by /proc to read memory from another user.
	 *
	 * Another deadlock between SOFTLOCK and file truncation can happen
	 * because segvn_fault_vnodepages() calls the FS one pagesize at
	 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages()
	 * can cause a deadlock because the first set of page_t's remain
	 * locked SE_SHARED. To avoid this, we demote segments on a first
	 * SOFTLOCK if they have a length greater than the segment's
	 * page size.
	 *
	 * So for now, we only avoid demoting a segment on a SOFTLOCK when
	 * the access type is S_READ_NOCOW and the fault length is less than
	 * or equal to the segment's page size. While this is quite restrictive,
	 * it should be the most common case of SOFTLOCK against a vmpss
	 * segment.
	 *
	 * For S_READ_NOCOW, it's safe not to do a copy on write because the
	 * caller makes sure no COW will be caused by another thread for a
	 * softlocked area.
	 */
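	/*
	 * Note (summary of the code below): when demotion is required, the
	 * segment lock is re-acquired as writer, segvn_clrszc() drops the
	 * segment to szc 0, and the fault then proceeds against PAGESIZE
	 * mappings; a segvn_clrszc() failure is returned via FC_MAKE_ERR().
	 */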
5041 if (type
== F_SOFTLOCK
&& svd
->vp
!= NULL
&& seg
->s_szc
!= 0) {
5044 if (rw
!= S_READ_NOCOW
) {
5047 if (!demote
&& len
> PAGESIZE
) {
5048 pgsz
= page_get_pagesize(seg
->s_szc
);
5049 CALC_LPG_REGION(pgsz
, seg
, addr
, len
, lpgaddr
,
5051 if (lpgeaddr
- lpgaddr
> pgsz
) {
5056 ASSERT(demote
|| AS_WRITE_HELD(seg
->s_as
, &seg
->s_as
->a_lock
));
5059 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5060 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_WRITER
);
5061 if (seg
->s_szc
!= 0) {
5062 segvn_vmpss_clrszc_cnt
++;
5063 ASSERT(svd
->softlockcnt
== 0);
5064 err
= segvn_clrszc(seg
);
5066 segvn_vmpss_clrszc_err
++;
5067 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5068 return (FC_MAKE_ERR(err
));
5071 ASSERT(seg
->s_szc
== 0);
5072 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5078 * Check to see if we need to allocate an anon_map structure.
5080 if (svd
->amp
== NULL
&& (svd
->vp
== NULL
|| brkcow
)) {
5081 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
5083 * Drop the "read" lock on the segment and acquire
5084 * the "write" version since we have to allocate the
5087 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5088 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_WRITER
);
5090 if (svd
->amp
== NULL
) {
5091 svd
->amp
= anonmap_alloc(seg
->s_size
, 0, ANON_SLEEP
);
5092 svd
->amp
->a_szc
= seg
->s_szc
;
5094 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5097 * Start all over again since segment protections
5098 * may have changed after we dropped the "read" lock.
5104 * S_READ_NOCOW vs S_READ distinction was
5105 * only needed for the code above. After
5106 * that we treat it as S_READ.
5108 if (rw
== S_READ_NOCOW
) {
5109 ASSERT(type
== F_SOFTLOCK
);
5110 ASSERT(AS_WRITE_HELD(seg
->s_as
, &seg
->s_as
->a_lock
));
5117 * MADV_SEQUENTIAL work is ignored for large page segments.
5119 if (seg
->s_szc
!= 0) {
5120 pgsz
= page_get_pagesize(seg
->s_szc
);
5121 ASSERT(SEGVN_LOCK_HELD(seg
->s_as
, &svd
->lock
));
5122 CALC_LPG_REGION(pgsz
, seg
, addr
, len
, lpgaddr
, lpgeaddr
);
5123 if (svd
->vp
== NULL
) {
5124 err
= segvn_fault_anonpages(hat
, seg
, lpgaddr
,
5125 lpgeaddr
, type
, rw
, addr
, addr
+ len
, brkcow
);
5127 err
= segvn_fault_vnodepages(hat
, seg
, lpgaddr
,
5128 lpgeaddr
, type
, rw
, addr
, addr
+ len
, brkcow
);
5129 if (err
== IE_RETRY
) {
5130 ASSERT(seg
->s_szc
== 0);
5131 ASSERT(SEGVN_READ_HELD(seg
->s_as
, &svd
->lock
));
5132 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5136 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5140 page
= seg_page(seg
, addr
);
5142 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
5143 anon_index
= svd
->anon_index
+ page
;
5145 if (type
== F_PROT
&& rw
== S_READ
&&
5146 svd
->tr_state
== SEGVN_TR_OFF
&&
5147 svd
->type
== MAP_PRIVATE
&& svd
->pageprot
== 0) {
5148 size_t index
= anon_index
;
5151 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
5153 * The fast path could apply to S_WRITE also, except
5154 * that the protection fault could be caused by lazy
5155 * tlb flush when ro->rw. In this case, the pte is
5156 * RW already. But RO in the other cpu's tlb causes
5157 * the fault. Since hat_chgprot won't do anything if
5158 * pte doesn't change, we may end up faulting
5159 * indefinitely until the RO tlb entry gets replaced.
5161 for (a
= addr
; a
< addr
+ len
; a
+= PAGESIZE
, index
++) {
5162 anon_array_enter(amp
, index
, &cookie
);
5163 ap
= anon_get_ptr(amp
->ahp
, index
);
5164 anon_array_exit(&cookie
);
5165 if ((ap
== NULL
) || (ap
->an_refcnt
!= 1)) {
5166 ANON_LOCK_EXIT(&
->a_rwlock
);
5170 hat_chgprot(seg
->s_as
->a_hat
, addr
, len
, svd
->prot
);
5171 ANON_LOCK_EXIT(&
->a_rwlock
);
5172 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5178 if (svd
->vpage
== NULL
)
5181 vpage
= &svd
->vpage
[page
];
5183 off
= svd
->offset
+ (uintptr_t)(addr
- seg
->s_base
);
	/*
	 * If MADV_SEQUENTIAL has been set for the particular page we
	 * are faulting on, free behind all pages in the segment and put
	 * them on the free list.
	 */
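	/*
	 * The loop below walks backwards from the faulting offset,
	 * translating anon pages to their <vp, offset> via swap_xlate(),
	 * and issues asynchronous, freeing
	 * VOP_PUTPAGE(..., B_DONTNEED|B_FREE|B_ASYNC, ...) requests for
	 * pages that are neither locked nor copy-on-write protected.
	 */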
5191 if ((page
!= 0) && fltadvice
&& svd
->tr_state
!= SEGVN_TR_ON
) {
5193 ulong_t fanon_index
;
5195 u_offset_t pgoff
, fpgoff
;
5197 struct anon
*fap
= NULL
;
5199 if (svd
->advice
== MADV_SEQUENTIAL
||
5201 VPP_ADVICE(vpage
) == MADV_SEQUENTIAL
)) {
5202 pgoff
= off
- PAGESIZE
;
5205 vpp
= &svd
->vpage
[fpage
];
5207 fanon_index
= svd
->anon_index
+ fpage
;
5209 while (pgoff
> svd
->offset
) {
5210 if (svd
->advice
!= MADV_SEQUENTIAL
&&
5211 (!svd
->pageadvice
|| (vpage
&&
5212 VPP_ADVICE(vpp
) != MADV_SEQUENTIAL
)))
5216 * If this is an anon page, we must find the
5217 * correct <vp, offset> for it
5221 ANON_LOCK_ENTER(&
->a_rwlock
,
5223 anon_array_enter(amp
, fanon_index
,
5225 fap
= anon_get_ptr(amp
->ahp
,
5228 swap_xlate(fap
, &fvp
, &fpgoff
);
5233 anon_array_exit(&cookie
);
5234 ANON_LOCK_EXIT(&
->a_rwlock
);
5242 * Skip pages that are free or have an
5245 pp
= page_lookup_nowait(fvp
, fpgoff
, SE_SHARED
);
5249 * We don't need the page_struct_lock to test
5250 * as this is only advisory; even if we
5251 * acquire it someone might race in and lock
5252 * the page after we unlock and before the
5253 * PUTPAGE, then VOP_PUTPAGE will do nothing.
5255 if (pp
->p_lckcnt
== 0 && pp
->p_cowcnt
== 0) {
5257 * Hold the vnode before releasing
5258 * the page lock to prevent it from
5259 * being freed and re-used by some
5265 * We should build a page list
5266 * to kluster putpages XXX
5268 (void) VOP_PUTPAGE(fvp
,
5269 (offset_t
)fpgoff
, PAGESIZE
,
5270 (B_DONTNEED
|B_FREE
|B_ASYNC
),
5275 * XXX - Should the loop terminate if
5276 * the page is `locked'?
5292 * See if we need to call VOP_GETPAGE for
5293 * *any* of the range being faulted on.
5294 * We can skip all of this work if there
5295 * was no original vnode.
5297 if (svd
->vp
!= NULL
) {
5310 * Only acquire reader lock to prevent amp->ahp
5311 * from being changed. It's ok to miss pages,
5312 * hence we don't do anon_array_enter
5314 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
5315 ap
= anon_get_ptr(amp
->ahp
, anon_index
);
5317 if (len
<= PAGESIZE
)
5318 /* inline non_anon() */
5319 dogetpage
= (ap
== NULL
);
5321 dogetpage
= non_anon(amp
->ahp
, anon_index
,
5323 ANON_LOCK_EXIT(&
->a_rwlock
);
5328 struct as
*as
= seg
->s_as
;
5330 if (len
> ptob((sizeof (pl
) / sizeof (pl
[0])) - 1)) {
5332 * Page list won't fit in local array,
5333 * allocate one of the needed size.
5336 (btop(len
) + 1) * sizeof (page_t
*);
5337 plp
= kmem_alloc(pl_alloc_sz
, KM_SLEEP
);
5340 } else if (rw
== S_WRITE
&& svd
->type
== MAP_PRIVATE
||
5341 svd
->tr_state
== SEGVN_TR_ON
|| rw
== S_OTHER
||
5342 (((size_t)(addr
+ PAGESIZE
) <
5343 (size_t)(seg
->s_base
+ seg
->s_size
)) &&
5344 hat_probe(as
->a_hat
, addr
+ PAGESIZE
))) {
5346 * Ask VOP_GETPAGE to return the exact number
5348 * (a) this is a COW fault, or
5349 * (b) this is a software fault, or
5350 * (c) next page is already mapped.
5355 * Ask VOP_GETPAGE to return adjacent pages
5356 * within the segment.
5358 plsz
= MIN((size_t)PVN_GETPAGE_SZ
, (size_t)
5359 ((seg
->s_base
+ seg
->s_size
) - addr
));
5360 ASSERT((addr
+ plsz
) <=
5361 (seg
->s_base
+ seg
->s_size
));
5365 * Need to get some non-anonymous pages.
5366 * We need to make only one call to GETPAGE to do
5367 * this to prevent certain deadlocking conditions
5368 * when we are doing locking. In this case
5369 * non_anon() should have picked up the smallest
5370 * range which includes all the non-anonymous
5371 * pages in the requested range. We have to
5372 * be careful regarding which rw flag to pass in
5373 * because on a private mapping, the underlying
5374 * object is never allowed to be written.
5376 if (rw
== S_WRITE
&& svd
->type
== MAP_PRIVATE
) {
5382 TRACE_3(TR_FAC_VM
, TR_SEGVN_GETPAGE
,
5383 "segvn_getpage:seg %p addr %p vp %p",
5385 err
= VOP_GETPAGE(vp
, (offset_t
)vp_off
, vp_len
,
5386 &vpprot
, plp
, plsz
, seg
, addr
+ (vp_off
- off
), arw
,
5389 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5390 segvn_pagelist_rele(plp
);
5392 kmem_free(plp
, pl_alloc_sz
);
5393 return (FC_MAKE_ERR(err
));
5395 if (svd
->type
== MAP_PRIVATE
)
5396 vpprot
&= ~PROT_WRITE
;
5401 * N.B. at this time the plp array has all the needed non-anon
5402 * pages in addition to (possibly) having some adjacent pages.
	/*
	 * Always acquire the anon_array_lock to prevent
	 * 2 threads from allocating separate anon slots for
	 * the same "addr".
	 *
	 * If this is a copy-on-write fault and we don't already
	 * have the anon_array_lock, acquire it to prevent the
	 * fault routine from handling multiple copy-on-write faults
	 * on the same "addr" in the same address space.
	 *
	 * Only one thread should deal with the fault since after
	 * it is handled, the other threads can acquire a translation
	 * to the newly created private page. This prevents two or
	 * more threads from creating different private pages for the
	 * same address.
	 *
	 * We grab "serialization" lock here if this is a MAP_PRIVATE segment
	 * to prevent deadlock between this thread and another thread
	 * which has soft-locked this page and wants to acquire serial_lock.
	 *
	 * The fix for bug 4026339 becomes unnecessary when using the
	 * locking scheme with per amp rwlock and a global set of hash
	 * lock, anon_array_lock. If we steal a vnode page when low
	 * on memory and upgrade the page lock through page_rename,
	 * then the page is PAGE_HANDLED, nothing needs to be done
	 * for this page after returning from segvn_faultpage.
	 *
	 * But really, the page lock should be downgraded after
	 * the stolen page is page_rename'd.
	 */
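	/*
	 * Note: around the per-page loop below the amp rwlock is only taken
	 * as READER; the per-slot serialization described above is provided
	 * by the anon_array_enter()/anon_array_exit() hash locks taken in
	 * the per-page fault path.
	 */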
5438 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
5441 * Ok, now loop over the address range and handle faults
5443 for (a
= addr
; a
< addr
+ len
; a
+= PAGESIZE
, off
+= PAGESIZE
) {
5444 err
= segvn_faultpage(hat
, seg
, a
, off
, vpage
, plp
, vpprot
,
5448 ANON_LOCK_EXIT(&
->a_rwlock
);
5449 if (type
== F_SOFTLOCK
&& a
> addr
) {
5450 segvn_softunlock(seg
, addr
, (a
- addr
),
5453 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5454 segvn_pagelist_rele(plp
);
5456 kmem_free(plp
, pl_alloc_sz
);
5461 } else if (svd
->vpage
) {
5462 page
= seg_page(seg
, addr
);
5463 vpage
= &svd
->vpage
[++page
];
5467 /* Didn't get pages from the underlying fs so we're done */
5472 * Now handle any other pages in the list returned.
5473 * If the page can be used, load up the translations now.
5474 * Note that the for loop will only be entered if "plp"
5475 * is pointing to a non-NULL page pointer which means that
5476 * VOP_GETPAGE() was called and vpprot has been initialized.
5478 if (svd
->pageprot
== 0)
5479 prot
= svd
->prot
& vpprot
;
5483 * Large Files: diff should be unsigned value because we started
5484 * supporting > 2GB segment sizes from 2.5.1 and when a
5485 * large file of size > 2GB gets mapped to address space
5486 * the diff value can be > 2GB.
5489 for (ppp
= plp
; (pp
= *ppp
) != NULL
; ppp
++) {
5493 anon_sync_obj_t cookie
;
5494 int hat_flag
= HAT_LOAD_ADV
;
5496 if (svd
->flags
& MAP_TEXT
) {
5497 hat_flag
|= HAT_LOAD_TEXT
;
5500 if (pp
== PAGE_HANDLED
)
5503 if (svd
->tr_state
!= SEGVN_TR_ON
&&
5504 pp
->p_offset
>= svd
->offset
&&
5505 pp
->p_offset
< svd
->offset
+ seg
->s_size
) {
5507 diff
= pp
->p_offset
- svd
->offset
;
5510 * Large Files: Following is the assertion
5511 * validating the above cast.
5513 ASSERT(svd
->vp
== pp
->p_vnode
);
5517 prot
= VPP_PROT(&svd
->vpage
[page
]) & vpprot
;
5520 * Prevent other threads in the address space from
5521 * creating private pages (i.e., allocating anon slots)
5522 * while we are in the process of loading translations
5523 * to additional pages returned by the underlying
5527 anon_index
= svd
->anon_index
+ page
;
5528 anon_array_enter(amp
, anon_index
, &cookie
);
5529 ap
= anon_get_ptr(amp
->ahp
, anon_index
);
5531 if ((amp
== NULL
) || (ap
== NULL
)) {
5532 if (IS_VMODSORT(pp
->p_vnode
) ||
5536 else if (rw
!= S_OTHER
&&
5538 prot
&= ~PROT_WRITE
;
5541 * Skip mapping read ahead pages marked
5542 * for migration, so they will get migrated
5545 ASSERT(amp
== NULL
||
5546 svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
5547 if ((prot
& PROT_READ
) && !PP_ISMIGRATE(pp
)) {
5548 hat_memload_region(hat
,
5555 anon_array_exit(&cookie
);
5561 ANON_LOCK_EXIT(&
->a_rwlock
);
5562 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5564 kmem_free(plp
, pl_alloc_sz
);
/*
 * This routine is used to start I/O on pages asynchronously. XXX it will
 * only create PAGESIZE pages. At fault time they will be relocated into
 * larger pages.
 */
static faultcode_t
segvn_faulta(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon_map *amp;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5583 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_READER
);
5584 if ((amp
= svd
->amp
) != NULL
) {
5588 * Reader lock to prevent amp->ahp from being changed.
5589 * This is advisory, it's ok to miss a page, so
5590 * we don't do anon_array_enter lock.
5592 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
5593 if ((ap
= anon_get_ptr(amp
->ahp
,
5594 svd
->anon_index
+ seg_page(seg
, addr
))) != NULL
) {
5596 err
= anon_getpage(&ap
, NULL
, NULL
,
5597 0, seg
, addr
, S_READ
, svd
->cred
);
5599 ANON_LOCK_EXIT(&
->a_rwlock
);
5600 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5602 return (FC_MAKE_ERR(err
));
5605 ANON_LOCK_EXIT(&
->a_rwlock
);
5608 if (svd
->vp
== NULL
) {
5609 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5610 return (0); /* zfod page - do nothing now */
5614 TRACE_3(TR_FAC_VM
, TR_SEGVN_GETPAGE
,
5615 "segvn_getpage:seg %p addr %p vp %p", seg
, addr
, vp
);
5616 err
= VOP_GETPAGE(vp
,
5617 (offset_t
)(svd
->offset
+ (uintptr_t)(addr
- seg
->s_base
)),
5618 PAGESIZE
, NULL
, NULL
, 0, seg
, addr
,
5619 S_OTHER
, svd
->cred
, NULL
);
5621 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5623 return (FC_MAKE_ERR(err
));
5628 segvn_setprot(struct seg
*seg
, caddr_t addr
, size_t len
, uint_t prot
)
5630 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
5631 struct vpage
*cvp
, *svp
, *evp
;
5635 anon_sync_obj_t cookie
;
5636 int unload_done
= 0;
5638 ASSERT(seg
->s_as
&& AS_LOCK_HELD(seg
->s_as
, &seg
->s_as
->a_lock
));
5640 if ((svd
->maxprot
& prot
) != prot
)
5641 return (EACCES
); /* violated maxprot */
5643 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_WRITER
);
5645 /* return if prot is the same */
5646 if (!svd
->pageprot
&& svd
->prot
== prot
) {
5647 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5652 * Since we change protections we first have to flush the cache.
5653 * This makes sure all the pagelock calls have to recheck
5656 if (svd
->softlockcnt
> 0) {
5657 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
5660 * If this is shared segment non 0 softlockcnt
5661 * means locked pages are still in use.
5663 if (svd
->type
== MAP_SHARED
) {
5664 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5669 * Since we do have the segvn writers lock nobody can fill
5670 * the cache with entries belonging to this seg during
5671 * the purge. The flush either succeeds or we still have
5675 if (svd
->softlockcnt
> 0) {
5676 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5681 if (HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
)) {
5682 ASSERT(svd
->amp
== NULL
);
5683 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
5684 hat_leave_region(seg
->s_as
->a_hat
, svd
->rcookie
,
5686 svd
->rcookie
= HAT_INVALID_REGION_COOKIE
;
5688 } else if (svd
->tr_state
== SEGVN_TR_INIT
) {
5689 svd
->tr_state
= SEGVN_TR_OFF
;
5690 } else if (svd
->tr_state
== SEGVN_TR_ON
) {
5691 ASSERT(svd
->amp
!= NULL
);
5692 segvn_textunrepl(seg
, 0);
5693 ASSERT(svd
->amp
== NULL
&& svd
->tr_state
== SEGVN_TR_OFF
);
5697 if ((prot
& PROT_WRITE
) && svd
->type
== MAP_SHARED
&&
5698 svd
->vp
!= NULL
&& (svd
->vp
->v_flag
& VVMEXEC
)) {
5699 ASSERT(vn_is_mapped(svd
->vp
, V_WRITE
));
5700 segvn_inval_trcache(svd
->vp
);
5702 if (seg
->s_szc
!= 0) {
5704 pgsz
= page_get_pagesize(seg
->s_szc
);
5705 pgcnt
= pgsz
>> PAGESHIFT
;
5706 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
5707 if (!IS_P2ALIGNED(addr
, pgsz
) || !IS_P2ALIGNED(len
, pgsz
)) {
5708 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5709 ASSERT(seg
->s_base
!= addr
|| seg
->s_size
!= len
);
5711 * If we are holding the as lock as a reader then
5712 * we need to return IE_RETRY and let the as
5713 * layer drop and re-acquire the lock as a writer.
5715 if (AS_READ_HELD(seg
->s_as
, &seg
->s_as
->a_lock
))
5717 VM_STAT_ADD(segvnvmstats
.demoterange
[1]);
5718 if (svd
->type
== MAP_PRIVATE
|| svd
->vp
!= NULL
) {
5719 err
= segvn_demote_range(seg
, addr
, len
,
5722 uint_t szcvec
= map_pgszcvec(seg
->s_base
,
5723 pgsz
, (uintptr_t)seg
->s_base
,
5724 (svd
->flags
& MAP_TEXT
), MAPPGSZC_SHM
, 0);
5725 err
= segvn_demote_range(seg
, addr
, len
,
	/*
	 * If it's a private mapping and we're making it writable then we
	 * may have to reserve the additional swap space now. If we are
	 * making writable only a part of the segment then we use its vpage
	 * array to keep a record of the pages for which we have reserved
	 * swap. In this case we set the pageswap field in the segment's
	 * segvn structure to record this.
	 *
	 * If it's a private mapping to a file (i.e., vp != NULL) and we're
	 * removing write permission on the entire segment and we haven't
	 * modified any pages, we can release the swap space.
	 */
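	/*
	 * Summarizing the three cases handled below: (1) a whole-segment
	 * enable of PROT_WRITE reserves seg->s_size of swap in one shot;
	 * (2) a partial enable reserves swap only for vpages not already
	 * marked VPP_ISSWAPRES and records in svd->pageswap that
	 * reservations are now tracked per page; (3) removing PROT_WRITE
	 * from a whole, unmodified file mapping releases svd->swresv via
	 * anon_unresv_zone().
	 */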
5749 if (svd
->type
== MAP_PRIVATE
) {
5750 if (prot
& PROT_WRITE
) {
5751 if (!(svd
->flags
& MAP_NORESERVE
) &&
5752 !(svd
->swresv
&& svd
->pageswap
== 0)) {
5756 * Start by determining how much swap
5757 * space is required.
5759 if (addr
== seg
->s_base
&&
5760 len
== seg
->s_size
&&
5761 svd
->pageswap
== 0) {
5762 /* The whole segment */
5766 * Make sure that the vpage array
5767 * exists, and make a note of the
5768 * range of elements corresponding
5772 svp
= &svd
->vpage
[seg_page(seg
, addr
)];
5773 evp
= &svd
->vpage
[seg_page(seg
,
5776 if (svd
->pageswap
== 0) {
5778 * This is the first time we've
5779 * asked for a part of this
5780 * segment, so we need to
5781 * reserve everything we've
5787 * We have to count the number
5788 * of pages required.
5790 for (cvp
= svp
; cvp
< evp
;
5792 if (!VPP_ISSWAPRES(cvp
))
5799 /* Try to reserve the necessary swap. */
5800 if (anon_resv_zone(sz
,
5801 seg
->s_as
->a_proc
->p_zone
) == 0) {
5802 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5807 * Make a note of how much swap space
5810 if (svd
->pageswap
== 0 && sz
== seg
->s_size
) {
5813 ASSERT(svd
->vpage
!= NULL
);
5816 for (cvp
= svp
; cvp
< evp
; cvp
++) {
5817 if (!VPP_ISSWAPRES(cvp
))
5818 VPP_SETSWAPRES(cvp
);
5824 * Swap space is released only if this segment
5825 * does not map anonymous memory, since read faults
5826 * on such segments still need an anon slot to read
5829 if (svd
->swresv
!= 0 && svd
->vp
!= NULL
&&
5830 svd
->amp
== NULL
&& addr
== seg
->s_base
&&
5831 len
== seg
->s_size
&& svd
->pageprot
== 0) {
5832 ASSERT(svd
->pageswap
== 0);
5833 anon_unresv_zone(svd
->swresv
,
5834 seg
->s_as
->a_proc
->p_zone
);
5836 TRACE_3(TR_FAC_VM
, TR_ANON_PROC
,
5837 "anon proc:%p %lu %u", seg
, 0, 0);
5842 if (addr
== seg
->s_base
&& len
== seg
->s_size
&& svd
->vpage
== NULL
) {
5843 if (svd
->prot
== prot
) {
5844 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5845 return (0); /* all done */
5847 svd
->prot
= (uchar_t
)prot
;
5848 } else if (svd
->type
== MAP_PRIVATE
) {
5849 struct anon
*ap
= NULL
;
5851 u_offset_t offset
, off
;
5852 struct anon_map
*amp
;
5853 ulong_t anon_idx
= 0;
5856 * A vpage structure exists or else the change does not
5857 * involve the entire segment. Establish a vpage structure
5858 * if none is there. Then, for each page in the range,
5859 * adjust its individual permissions. Note that write-
5860 * enabling a MAP_PRIVATE page can affect the claims for
5861 * locked down memory. Overcommitting memory terminates
5866 if ((amp
= svd
->amp
) != NULL
) {
5867 anon_idx
= svd
->anon_index
+ seg_page(seg
, addr
);
5868 ASSERT(seg
->s_szc
== 0 ||
5869 IS_P2ALIGNED(anon_idx
, pgcnt
));
5870 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
5873 offset
= svd
->offset
+ (uintptr_t)(addr
- seg
->s_base
);
5874 evp
= &svd
->vpage
[seg_page(seg
, addr
+ len
)];
5877 * See Statement at the beginning of segvn_lockop regarding
5878 * the way cowcnts and lckcnts are handled.
5880 for (svp
= &svd
->vpage
[seg_page(seg
, addr
)]; svp
< evp
; svp
++) {
5882 if (seg
->s_szc
!= 0) {
5884 anon_array_enter(amp
, anon_idx
,
5887 if (IS_P2ALIGNED(anon_idx
, pgcnt
) &&
5888 !segvn_claim_pages(seg
, svp
, offset
,
5891 anon_array_exit(&cookie
);
5896 anon_array_exit(&cookie
);
5901 anon_array_enter(amp
, anon_idx
,
5903 ap
= anon_get_ptr(amp
->ahp
, anon_idx
++);
5906 if (VPP_ISPPLOCK(svp
) &&
5907 VPP_PROT(svp
) != prot
) {
5909 if (amp
== NULL
|| ap
== NULL
) {
5913 swap_xlate(ap
, &vp
, &off
);
5915 anon_array_exit(&cookie
);
5917 if ((pp
= page_lookup(vp
, off
,
5918 SE_SHARED
)) == NULL
) {
5919 panic("segvn_setprot: no page");
5922 ASSERT(seg
->s_szc
== 0);
5923 if ((VPP_PROT(svp
) ^ prot
) &
5925 if (prot
& PROT_WRITE
) {
5940 } else if (amp
!= NULL
)
5941 anon_array_exit(&cookie
);
5943 VPP_SETPROT(svp
, prot
);
5947 ANON_LOCK_EXIT(&
->a_rwlock
);
5950 * Did we terminate prematurely? If so, simply unload
5951 * the translations to the things we've updated so far.
5955 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5958 len
= (svp
- &svd
->vpage
[seg_page(seg
, addr
)]) *
5960 ASSERT(seg
->s_szc
== 0 || IS_P2ALIGNED(len
, pgsz
));
5962 hat_unload(seg
->s_as
->a_hat
, addr
,
5964 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5970 evp
= &svd
->vpage
[seg_page(seg
, addr
+ len
)];
5971 for (svp
= &svd
->vpage
[seg_page(seg
, addr
)]; svp
< evp
; svp
++) {
5972 VPP_SETPROT(svp
, prot
);
5977 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
5981 if (((prot
& PROT_WRITE
) != 0 &&
5982 (svd
->vp
!= NULL
|| svd
->type
== MAP_PRIVATE
)) ||
5983 (prot
& ~PROT_USER
) == PROT_NONE
) {
5985 * Either private or shared data with write access (in
5986 * which case we need to throw out all former translations
5987 * so that we get the right translations set up on fault
5988 * and we don't allow write access to any copy-on-write pages
5989 * that might be around or to prevent write access to pages
5990 * representing holes in a file), or we don't have permission
5991 * to access the memory at all (in which case we have to
5992 * unload any current translations that might exist).
5994 hat_unload(seg
->s_as
->a_hat
, addr
, len
, HAT_UNLOAD
);
5997 * A shared mapping or a private mapping in which write
5998 * protection is going to be denied - just change all the
5999 * protections over the range of addresses in question.
6000 * segvn does not support any other attributes other
6001 * than prot so we can use hat_chgattr.
6003 hat_chgattr(seg
->s_as
->a_hat
, addr
, len
, prot
);
6006 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
/*
 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize,
 * to determine if the seg is capable of mapping the requested szc.
 */
static int
segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
{
6018 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
6019 struct segvn_data
*nsvd
;
6020 struct anon_map
*amp
= svd
->amp
;
6022 caddr_t eaddr
= addr
+ len
, a
;
6023 size_t pgsz
= page_get_pagesize(szc
);
6024 pgcnt_t pgcnt
= page_get_pagecnt(szc
);
6026 u_offset_t off
= svd
->offset
+ (uintptr_t)(addr
- seg
->s_base
);
6028 ASSERT(seg
->s_as
&& AS_WRITE_HELD(seg
->s_as
, &seg
->s_as
->a_lock
));
6029 ASSERT(addr
>= seg
->s_base
&& eaddr
<= seg
->s_base
+ seg
->s_size
);
6031 if (seg
->s_szc
== szc
|| segvn_lpg_disable
!= 0) {
6036 * addr should always be pgsz aligned but eaddr may be misaligned if
6037 * it's at the end of the segment.
6039 * XXX we should assert this condition since as_setpagesize() logic
6042 if (!IS_P2ALIGNED(addr
, pgsz
) ||
6043 (!IS_P2ALIGNED(eaddr
, pgsz
) &&
6044 eaddr
!= seg
->s_base
+ seg
->s_size
)) {
6046 segvn_setpgsz_align_err
++;
6050 if (amp
!= NULL
&& svd
->type
== MAP_SHARED
) {
6051 ulong_t an_idx
= svd
->anon_index
+ seg_page(seg
, addr
);
6052 if (!IS_P2ALIGNED(an_idx
, pgcnt
)) {
6054 segvn_setpgsz_anon_align_err
++;
6059 if ((svd
->flags
& MAP_NORESERVE
) || seg
->s_as
== &kas
||
6060 szc
> segvn_maxpgszc
) {
6064 /* paranoid check */
6065 if (svd
->vp
!= NULL
&&
6066 (IS_SWAPFSVP(svd
->vp
) || VN_ISKAS(svd
->vp
))) {
6070 if (seg
->s_szc
== 0 && svd
->vp
!= NULL
&&
6071 map_addr_vacalign_check(addr
, off
)) {
6076 * Check that protections are the same within new page
6079 if (svd
->pageprot
) {
6080 for (a
= addr
; a
< eaddr
; a
+= pgsz
) {
6081 if ((a
+ pgsz
) > eaddr
) {
6082 if (!sameprot(seg
, a
, eaddr
- a
)) {
6086 if (!sameprot(seg
, a
, pgsz
)) {
6094 * Since we are changing page size we first have to flush
6095 * the cache. This makes sure all the pagelock calls have
6096 * to recheck protections.
6098 if (svd
->softlockcnt
> 0) {
6099 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
6102 * If this is shared segment non 0 softlockcnt
6103 * means locked pages are still in use.
6105 if (svd
->type
== MAP_SHARED
) {
6110 * Since we do have the segvn writers lock nobody can fill
6111 * the cache with entries belonging to this seg during
6112 * the purge. The flush either succeeds or we still have
6116 if (svd
->softlockcnt
> 0) {
6121 if (HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
)) {
6122 ASSERT(svd
->amp
== NULL
);
6123 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
6124 hat_leave_region(seg
->s_as
->a_hat
, svd
->rcookie
,
6126 svd
->rcookie
= HAT_INVALID_REGION_COOKIE
;
6127 } else if (svd
->tr_state
== SEGVN_TR_INIT
) {
6128 svd
->tr_state
= SEGVN_TR_OFF
;
6129 } else if (svd
->tr_state
== SEGVN_TR_ON
) {
6130 ASSERT(svd
->amp
!= NULL
);
6131 segvn_textunrepl(seg
, 1);
6132 ASSERT(svd
->amp
== NULL
&& svd
->tr_state
== SEGVN_TR_OFF
);
6137 * Operation for sub range of existing segment.
6139 if (addr
!= seg
->s_base
|| eaddr
!= (seg
->s_base
+ seg
->s_size
)) {
6140 if (szc
< seg
->s_szc
) {
6141 VM_STAT_ADD(segvnvmstats
.demoterange
[2]);
6142 err
= segvn_demote_range(seg
, addr
, len
, SDR_RANGE
, 0);
6146 if (err
== ENOMEM
) {
6151 if (addr
!= seg
->s_base
) {
6152 nseg
= segvn_split_seg(seg
, addr
);
6153 if (eaddr
!= (nseg
->s_base
+ nseg
->s_size
)) {
6154 /* eaddr is szc aligned */
6155 (void) segvn_split_seg(nseg
, eaddr
);
6159 if (eaddr
!= (seg
->s_base
+ seg
->s_size
)) {
6160 /* eaddr is szc aligned */
6161 (void) segvn_split_seg(seg
, eaddr
);
6167 * Break any low level sharing and reset seg->s_szc to 0.
6169 if ((err
= segvn_clrszc(seg
)) != 0) {
6170 if (err
== ENOMEM
) {
6175 ASSERT(seg
->s_szc
== 0);
6178 * If the end of the current segment is not pgsz aligned
6179 * then attempt to concatenate with the next segment.
6181 if (!IS_P2ALIGNED(eaddr
, pgsz
)) {
6182 nseg
= AS_SEGNEXT(seg
->s_as
, seg
);
6183 if (nseg
== NULL
|| nseg
== seg
|| eaddr
!= nseg
->s_base
) {
6186 if (nseg
->s_ops
!= &segvn_ops
) {
6189 nsvd
= (struct segvn_data
*)nseg
->s_data
;
6190 if (nsvd
->softlockcnt
> 0) {
6192 * If this is shared segment non 0 softlockcnt
6193 * means locked pages are still in use.
6195 if (nsvd
->type
== MAP_SHARED
) {
6199 if (nsvd
->softlockcnt
> 0) {
6203 err
= segvn_clrszc(nseg
);
6204 if (err
== ENOMEM
) {
6210 ASSERT(nsvd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
6211 err
= segvn_concat(seg
, nseg
, 1);
6222 * May need to re-align anon array to
6226 if (!IS_P2ALIGNED(svd
->anon_index
, pgcnt
)) {
6227 struct anon_hdr
*nahp
;
6229 ASSERT(svd
->type
== MAP_PRIVATE
);
6231 ANON_LOCK_ENTER(&
->a_rwlock
, RW_WRITER
);
6232 ASSERT(amp
->refcnt
== 1);
6233 nahp
= anon_create(btop(amp
->size
), ANON_NOSLEEP
);
6235 ANON_LOCK_EXIT(&
->a_rwlock
);
6238 if (anon_copy_ptr(amp
->ahp
, svd
->anon_index
,
6239 nahp
, 0, btop(seg
->s_size
), ANON_NOSLEEP
)) {
6240 anon_release(nahp
, btop(amp
->size
));
6241 ANON_LOCK_EXIT(&
->a_rwlock
);
6244 anon_release(amp
->ahp
, btop(amp
->size
));
6246 svd
->anon_index
= 0;
6247 ANON_LOCK_EXIT(&
->a_rwlock
);
6250 if (svd
->vp
!= NULL
&& szc
!= 0) {
6252 u_offset_t eoffpage
= svd
->offset
;
6253 va
.va_mask
= AT_SIZE
;
6254 eoffpage
+= seg
->s_size
;
6255 eoffpage
= btopr(eoffpage
);
6256 if (VOP_GETATTR(svd
->vp
, &va
, 0, svd
->cred
, NULL
) != 0) {
6257 segvn_setpgsz_getattr_err
++;
6260 if (btopr(va
.va_size
) < eoffpage
) {
6261 segvn_setpgsz_eof_err
++;
		/*
		 * anon_fill_cow_holes() may call VOP_GETPAGE().
		 * Don't take anon map lock here to avoid holding it
		 * across VOP_GETPAGE() calls that may call back into
		 * segvn for klustering checks. We don't really need
		 * anon map lock here since it's a private segment and
		 * we hold as level lock as writers.
		 */
6273 if ((err
= anon_fill_cow_holes(seg
, seg
->s_base
,
6274 amp
->ahp
, svd
->anon_index
, svd
->vp
, svd
->offset
,
6275 seg
->s_size
, szc
, svd
->prot
, svd
->vpage
,
6280 segvn_setvnode_mpss(svd
->vp
);
6284 ANON_LOCK_ENTER(&
->a_rwlock
, RW_WRITER
);
6285 if (svd
->type
== MAP_PRIVATE
) {
6287 } else if (szc
> amp
->a_szc
) {
6290 ANON_LOCK_EXIT(&
->a_rwlock
);
6299 segvn_clrszc(struct seg
*seg
)
6301 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
6302 struct anon_map
*amp
= svd
->amp
;
6306 caddr_t a
= seg
->s_base
;
6307 caddr_t ea
= a
+ seg
->s_size
;
6308 ulong_t an_idx
= svd
->anon_index
;
6309 vnode_t
*vp
= svd
->vp
;
6310 struct vpage
*vpage
= svd
->vpage
;
6311 page_t
*anon_pl
[1 + 1], *pp
;
6312 struct anon
*ap
, *oldap
;
6313 uint_t prot
= svd
->prot
, vpprot
;
6316 ASSERT(AS_WRITE_HELD(seg
->s_as
, &seg
->s_as
->a_lock
) ||
6317 SEGVN_WRITE_HELD(seg
->s_as
, &svd
->lock
));
6318 ASSERT(svd
->softlockcnt
== 0);
6320 if (vp
== NULL
&& amp
== NULL
) {
6321 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
6326 if (HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
)) {
6327 ASSERT(svd
->amp
== NULL
);
6328 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
6329 hat_leave_region(seg
->s_as
->a_hat
, svd
->rcookie
,
6331 svd
->rcookie
= HAT_INVALID_REGION_COOKIE
;
6332 } else if (svd
->tr_state
== SEGVN_TR_ON
) {
6333 ASSERT(svd
->amp
!= NULL
);
6334 segvn_textunrepl(seg
, 1);
6335 ASSERT(svd
->amp
== NULL
&& svd
->tr_state
== SEGVN_TR_OFF
);
6338 if (svd
->tr_state
!= SEGVN_TR_OFF
) {
6339 ASSERT(svd
->tr_state
== SEGVN_TR_INIT
);
6340 svd
->tr_state
= SEGVN_TR_OFF
;
6344 * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
6345 * unload argument is 0 when we are freeing the segment
6346 * and unload was already done.
6348 hat_unload(seg
->s_as
->a_hat
, seg
->s_base
, seg
->s_size
,
6352 if (amp
== NULL
|| svd
->type
== MAP_SHARED
) {
6357 pgsz
= page_get_pagesize(seg
->s_szc
);
6361 * XXX anon rwlock is not really needed because this is a
6362 * private segment and we are writers.
6364 ANON_LOCK_ENTER(&
->a_rwlock
, RW_WRITER
);
6366 for (; a
< ea
; a
+= pgsz
, an_idx
+= pages
) {
6367 if ((oldap
= anon_get_ptr(amp
->ahp
, an_idx
)) != NULL
) {
6368 ASSERT(vpage
!= NULL
|| svd
->pageprot
== 0);
6369 if (vpage
!= NULL
) {
6370 ASSERT(sameprot(seg
, a
, pgsz
));
6371 prot
= VPP_PROT(vpage
);
6372 pageflag
= VPP_ISPPLOCK(vpage
) ? LOCK_PAGE
: 0;
6374 if (seg
->s_szc
!= 0) {
6375 ASSERT(vp
== NULL
|| anon_pages(amp
->ahp
,
6376 an_idx
, pages
) == pages
);
6377 if ((err
= anon_map_demotepages(amp
, an_idx
,
6378 seg
, a
, prot
, vpage
, svd
->cred
)) != 0) {
6382 if (oldap
->an_refcnt
== 1) {
6385 if ((err
= anon_getpage(&oldap
, &vpprot
,
6386 anon_pl
, PAGESIZE
, seg
, a
, S_READ
,
6390 if ((pp
= anon_private(&ap
, seg
, a
, prot
,
6391 anon_pl
[0], pageflag
, svd
->cred
)) == NULL
) {
6396 (void) anon_set_ptr(amp
->ahp
, an_idx
, ap
,
6401 vpage
= (vpage
== NULL
) ? NULL
: vpage
+ pages
;
6407 ANON_LOCK_EXIT(&
->a_rwlock
);
6419 pgcnt_t pgcnt
= page_get_pagecnt(seg
->s_szc
);
6420 size_t ppasize
= (pgcnt
+ 1) * sizeof (page_t
*);
6422 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
6423 struct anon_map
*amp
= svd
->amp
;
6424 struct vpage
*evp
= svp
+ pgcnt
;
6425 caddr_t addr
= ((uintptr_t)(svp
- svd
->vpage
) << PAGESHIFT
)
6428 struct vnode
*vp
= svd
->vp
;
6433 int anon
= (amp
!= NULL
) ? 1 : 0;
6435 ASSERT(svd
->type
== MAP_PRIVATE
);
6436 ASSERT(svd
->vpage
!= NULL
);
6437 ASSERT(seg
->s_szc
!= 0);
6438 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
6439 ASSERT(amp
== NULL
|| IS_P2ALIGNED(anon_idx
, pgcnt
));
6440 ASSERT(sameprot(seg
, addr
, pgcnt
<< PAGESHIFT
));
6442 if (VPP_PROT(svp
) == prot
)
6444 if (!((VPP_PROT(svp
) ^ prot
) & PROT_WRITE
))
6447 ppa
= kmem_alloc(ppasize
, KM_SLEEP
);
6448 if (anon
&& vp
!= NULL
) {
6449 if (anon_get_ptr(amp
->ahp
, anon_idx
) == NULL
) {
6451 ASSERT(!anon_pages(amp
->ahp
, anon_idx
, pgcnt
));
6454 anon_pages(amp
->ahp
, anon_idx
, pgcnt
) == pgcnt
);
6457 for (*ppa
= NULL
, pg_idx
= 0; svp
< evp
; svp
++, anon_idx
++) {
6458 if (!VPP_ISPPLOCK(svp
))
6461 ap
= anon_get_ptr(amp
->ahp
, anon_idx
);
6463 panic("segvn_claim_pages: no anon slot");
6465 swap_xlate(ap
, &vp
, &aoff
);
6466 off
= (u_offset_t
)aoff
;
6469 if ((pp
= page_lookup(vp
,
6470 (u_offset_t
)off
, SE_SHARED
)) == NULL
) {
6471 panic("segvn_claim_pages: no page");
6477 if (ppa
[0] == NULL
) {
6478 kmem_free(ppa
, ppasize
);
6482 ASSERT(pg_idx
<= pgcnt
);
6486 /* Find each large page within ppa, and adjust its claim */
6488 /* Does ppa cover a single large page? */
6489 if (ppa
[0]->p_szc
== seg
->s_szc
) {
6490 if (prot
& PROT_WRITE
)
6491 err
= page_addclaim_pages(ppa
);
6493 err
= page_subclaim_pages(ppa
);
6495 for (i
= 0; ppa
[i
]; i
+= pgcnt
) {
6496 ASSERT(IS_P2ALIGNED(page_pptonum(ppa
[i
]), pgcnt
));
6497 if (prot
& PROT_WRITE
)
6498 err
= page_addclaim_pages(&ppa
[i
]);
6500 err
= page_subclaim_pages(&ppa
[i
]);
6506 for (i
= 0; i
< pg_idx
; i
++) {
6507 ASSERT(ppa
[i
] != NULL
);
6508 page_unlock(ppa
[i
]);
6511 kmem_free(ppa
, ppasize
);
/*
 * Returns right (upper address) segment if split occurred.
 * If the address is equal to the beginning or end of its segment it returns
 * the current segment.
 */
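/*
 * Illustration (based on the code below): splitting a segment covering
 * [s_base, s_base + s_size) at addr shrinks the original segment to
 * [s_base, addr) and allocates a new segvn segment for [addr, old end),
 * splitting the vpage array, anon map and swap reservation accordingly.
 * If addr equals either end, no split is done and the original segment
 * is returned.
 */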
static struct seg *
segvn_split_seg(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct seg *nseg;
	size_t nsize;
	struct segvn_data *nsvd;

	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(svd->tr_state == SEGVN_TR_OFF);

	ASSERT(addr >= seg->s_base);
	ASSERT(addr <= seg->s_base + seg->s_size);
	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);

	if (addr == seg->s_base || addr == seg->s_base + seg->s_size)
		return (seg);
6538 nsize
= seg
->s_base
+ seg
->s_size
- addr
;
6539 seg
->s_size
= addr
- seg
->s_base
;
6540 nseg
= seg_alloc(seg
->s_as
, addr
, nsize
);
6541 ASSERT(nseg
!= NULL
);
6542 nseg
->s_ops
= seg
->s_ops
;
6543 nsvd
= kmem_cache_alloc(segvn_cache
, KM_SLEEP
);
6544 nseg
->s_data
= (void *)nsvd
;
6545 nseg
->s_szc
= seg
->s_szc
;
6547 ASSERT(nsvd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
6549 rw_init(&nsvd
->lock
, NULL
, RW_DEFAULT
, NULL
);
6551 if (nsvd
->vp
!= NULL
) {
6553 nsvd
->offset
= svd
->offset
+
6554 (uintptr_t)(nseg
->s_base
- seg
->s_base
);
6555 if (nsvd
->type
== MAP_SHARED
)
6556 lgrp_shm_policy_init(NULL
, nsvd
->vp
);
		/*
		 * The offset for an anonymous segment has no significance in
		 * terms of an offset into a file. If we were to use the above
		 * calculation instead, the structures read out of
		 * /proc/<pid>/xmap would be more difficult to decipher since
		 * it would be unclear whether two seemingly contiguous
		 * prxmap_t structures represented different segments or a
		 * single segment that had been split up into multiple prxmap_t
		 * structures (e.g. if some part of the segment had not yet
		 * been faulted in).
		 */
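		/*
		 * Hence, for anonymous segments, the new segment keeps an
		 * offset of zero rather than inheriting one derived from the
		 * split point.
		 */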
6572 ASSERT(svd
->softlockcnt
== 0);
6573 ASSERT(svd
->softlockcnt_sbase
== 0);
6574 ASSERT(svd
->softlockcnt_send
== 0);
6577 if (svd
->vpage
!= NULL
) {
6578 size_t bytes
= vpgtob(seg_pages(seg
));
6579 size_t nbytes
= vpgtob(seg_pages(nseg
));
6580 struct vpage
*ovpage
= svd
->vpage
;
6582 svd
->vpage
= kmem_alloc(bytes
, KM_SLEEP
);
6583 bcopy(ovpage
, svd
->vpage
, bytes
);
6584 nsvd
->vpage
= kmem_alloc(nbytes
, KM_SLEEP
);
6585 bcopy(ovpage
+ seg_pages(seg
), nsvd
->vpage
, nbytes
);
6586 kmem_free(ovpage
, bytes
+ nbytes
);
6588 if (svd
->amp
!= NULL
&& svd
->type
== MAP_PRIVATE
) {
6589 struct anon_map
*oamp
= svd
->amp
, *namp
;
6590 struct anon_hdr
*nahp
;
6592 ANON_LOCK_ENTER(&oamp
->a_rwlock
, RW_WRITER
);
6593 ASSERT(oamp
->refcnt
== 1);
6594 nahp
= anon_create(btop(seg
->s_size
), ANON_SLEEP
);
6595 (void) anon_copy_ptr(oamp
->ahp
, svd
->anon_index
,
6596 nahp
, 0, btop(seg
->s_size
), ANON_SLEEP
);
6598 namp
= anonmap_alloc(nseg
->s_size
, 0, ANON_SLEEP
);
6599 namp
->a_szc
= nseg
->s_szc
;
6600 (void) anon_copy_ptr(oamp
->ahp
,
6601 svd
->anon_index
+ btop(seg
->s_size
),
6602 namp
->ahp
, 0, btop(nseg
->s_size
), ANON_SLEEP
);
6603 anon_release(oamp
->ahp
, btop(oamp
->size
));
6605 oamp
->size
= seg
->s_size
;
6606 svd
->anon_index
= 0;
6608 nsvd
->anon_index
= 0;
6609 ANON_LOCK_EXIT(&oamp
->a_rwlock
);
6610 } else if (svd
->amp
!= NULL
) {
6611 pgcnt_t pgcnt
= page_get_pagecnt(seg
->s_szc
);
6612 ASSERT(svd
->amp
== nsvd
->amp
);
6613 ASSERT(seg
->s_szc
<= svd
->amp
->a_szc
);
6614 nsvd
->anon_index
= svd
->anon_index
+ seg_pages(seg
);
6615 ASSERT(IS_P2ALIGNED(nsvd
->anon_index
, pgcnt
));
6616 ANON_LOCK_ENTER(&svd
->amp
->a_rwlock
, RW_WRITER
);
6618 ANON_LOCK_EXIT(&svd
->amp
->a_rwlock
);
6622 * Split the amount of swap reserved.
6626 * For MAP_NORESERVE, only allocate swap reserve for pages
6627 * being used. Other segments get enough to cover whole
6630 if (svd
->flags
& MAP_NORESERVE
) {
6634 oswresv
= svd
->swresv
;
6635 svd
->swresv
= ptob(anon_pages(svd
->amp
->ahp
,
6636 svd
->anon_index
, btop(seg
->s_size
)));
6637 nsvd
->swresv
= ptob(anon_pages(nsvd
->amp
->ahp
,
6638 nsvd
->anon_index
, btop(nseg
->s_size
)));
6639 ASSERT(oswresv
>= (svd
->swresv
+ nsvd
->swresv
));
6641 if (svd
->pageswap
) {
6642 svd
->swresv
= segvn_count_swap_by_vpages(seg
);
6643 ASSERT(nsvd
->swresv
>= svd
->swresv
);
6644 nsvd
->swresv
-= svd
->swresv
;
6646 ASSERT(svd
->swresv
== seg
->s_size
+
6648 svd
->swresv
= seg
->s_size
;
6649 nsvd
->swresv
= nseg
->s_size
;
/*
 * called on memory operations (unmap, setprot, setpagesize) for a subset
 * of a large page segment to either demote the memory range (SDR_RANGE)
 * or the ends (SDR_END) by addr/len.
 *
 * returns 0 on success. returns errno, including ENOMEM, on failure.
 */
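/*
 * For example (following the code below): with SDR_END, only the large
 * pages that straddle the rounded boundaries lpgaddr/lpgeaddr are split
 * off into their own temporary segments (badseg1/badseg2) and demoted
 * with segvn_clrszc(); with SDR_RANGE the whole rounded region
 * [lpgaddr, lpgeaddr) is demoted.
 */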
6672 caddr_t eaddr
= addr
+ len
;
6673 caddr_t lpgaddr
, lpgeaddr
;
6675 struct seg
*badseg1
= NULL
;
6676 struct seg
*badseg2
= NULL
;
6678 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
6680 uint_t szc
= seg
->s_szc
;
6683 ASSERT(AS_WRITE_HELD(seg
->s_as
, &seg
->s_as
->a_lock
));
6684 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
6686 pgsz
= page_get_pagesize(szc
);
6687 ASSERT(seg
->s_base
!= addr
|| seg
->s_size
!= len
);
6688 ASSERT(addr
>= seg
->s_base
&& eaddr
<= seg
->s_base
+ seg
->s_size
);
6689 ASSERT(svd
->softlockcnt
== 0);
6690 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
);
6691 ASSERT(szcvec
== 0 || (flag
== SDR_END
&& svd
->type
== MAP_SHARED
));
6693 CALC_LPG_REGION(pgsz
, seg
, addr
, len
, lpgaddr
, lpgeaddr
);
6694 ASSERT(flag
== SDR_RANGE
|| eaddr
< lpgeaddr
|| addr
> lpgaddr
);
6695 if (flag
== SDR_RANGE
) {
6696 /* demote entire range */
6697 badseg1
= nseg
= segvn_split_seg(seg
, lpgaddr
);
6698 (void) segvn_split_seg(nseg
, lpgeaddr
);
6699 ASSERT(badseg1
->s_base
== lpgaddr
);
6700 ASSERT(badseg1
->s_size
== lpgeaddr
- lpgaddr
);
6701 } else if (addr
!= lpgaddr
) {
6702 ASSERT(flag
== SDR_END
);
6703 badseg1
= nseg
= segvn_split_seg(seg
, lpgaddr
);
6704 if (eaddr
!= lpgeaddr
&& eaddr
> lpgaddr
+ pgsz
&&
6705 eaddr
< lpgaddr
+ 2 * pgsz
) {
6706 (void) segvn_split_seg(nseg
, lpgeaddr
);
6707 ASSERT(badseg1
->s_base
== lpgaddr
);
6708 ASSERT(badseg1
->s_size
== 2 * pgsz
);
6710 nseg
= segvn_split_seg(nseg
, lpgaddr
+ pgsz
);
6711 ASSERT(badseg1
->s_base
== lpgaddr
);
6712 ASSERT(badseg1
->s_size
== pgsz
);
6713 if (eaddr
!= lpgeaddr
&& eaddr
> lpgaddr
+ pgsz
) {
6714 ASSERT(lpgeaddr
- lpgaddr
> 2 * pgsz
);
6715 nseg
= segvn_split_seg(nseg
, lpgeaddr
- pgsz
);
6717 (void) segvn_split_seg(nseg
, lpgeaddr
);
6718 ASSERT(badseg2
->s_base
== lpgeaddr
- pgsz
);
6719 ASSERT(badseg2
->s_size
== pgsz
);
6723 ASSERT(flag
== SDR_END
);
6724 ASSERT(eaddr
< lpgeaddr
);
6725 badseg1
= nseg
= segvn_split_seg(seg
, lpgeaddr
- pgsz
);
6726 (void) segvn_split_seg(nseg
, lpgeaddr
);
6727 ASSERT(badseg1
->s_base
== lpgeaddr
- pgsz
);
6728 ASSERT(badseg1
->s_size
== pgsz
);
6731 ASSERT(badseg1
!= NULL
);
6732 ASSERT(badseg1
->s_szc
== szc
);
6733 ASSERT(flag
== SDR_RANGE
|| badseg1
->s_size
== pgsz
||
6734 badseg1
->s_size
== 2 * pgsz
);
6735 ASSERT(sameprot(badseg1
, badseg1
->s_base
, pgsz
));
6736 ASSERT(badseg1
->s_size
== pgsz
||
6737 sameprot(badseg1
, badseg1
->s_base
+ pgsz
, pgsz
));
6738 if (err
= segvn_clrszc(badseg1
)) {
6741 ASSERT(badseg1
->s_szc
== 0);
6743 if (szc
> 1 && (tszcvec
= P2PHASE(szcvec
, 1 << szc
)) > 1) {
6744 uint_t tszc
= highbit(tszcvec
) - 1;
6745 caddr_t ta
= MAX(addr
, badseg1
->s_base
);
6747 size_t tpgsz
= page_get_pagesize(tszc
);
6749 ASSERT(svd
->type
== MAP_SHARED
);
6750 ASSERT(flag
== SDR_END
);
6751 ASSERT(tszc
< szc
&& tszc
> 0);
6753 if (eaddr
> badseg1
->s_base
+ badseg1
->s_size
) {
6754 te
= badseg1
->s_base
+ badseg1
->s_size
;
6760 badseg1
->s_szc
= tszc
;
6761 if (!IS_P2ALIGNED(ta
, tpgsz
) || !IS_P2ALIGNED(te
, tpgsz
)) {
6762 if (badseg2
!= NULL
) {
6763 err
= segvn_demote_range(badseg1
, ta
, te
- ta
,
6769 return (segvn_demote_range(badseg1
, ta
,
6770 te
- ta
, SDR_END
, tszcvec
));
6775 if (badseg2
== NULL
)
6777 ASSERT(badseg2
->s_szc
== szc
);
6778 ASSERT(badseg2
->s_size
== pgsz
);
6779 ASSERT(sameprot(badseg2
, badseg2
->s_base
, badseg2
->s_size
));
6780 if (err
= segvn_clrszc(badseg2
)) {
6783 ASSERT(badseg2
->s_szc
== 0);
6785 if (szc
> 1 && (tszcvec
= P2PHASE(szcvec
, 1 << szc
)) > 1) {
6786 uint_t tszc
= highbit(tszcvec
) - 1;
6787 size_t tpgsz
= page_get_pagesize(tszc
);
6789 ASSERT(svd
->type
== MAP_SHARED
);
6790 ASSERT(flag
== SDR_END
);
6791 ASSERT(tszc
< szc
&& tszc
> 0);
6792 ASSERT(badseg2
->s_base
> addr
);
6793 ASSERT(eaddr
> badseg2
->s_base
);
6794 ASSERT(eaddr
< badseg2
->s_base
+ badseg2
->s_size
);
6796 badseg2
->s_szc
= tszc
;
6797 if (!IS_P2ALIGNED(eaddr
, tpgsz
)) {
6798 return (segvn_demote_range(badseg2
, badseg2
->s_base
,
6799 eaddr
- badseg2
->s_base
, SDR_END
, tszcvec
));
static int
segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vp, *evp;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
	/*
	 * If segment protection can be used, simply check against them.
	 */
	if (svd->pageprot == 0) {
		int err;

		err = ((svd->prot & prot) != prot) ? EACCES : 0;
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (err);
	}

	/*
	 * Have to check down to the vpage level.
	 */
	evp = &svd->vpage[seg_page(seg, addr + len)];
	for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) {
		if ((VPP_PROT(vp) & prot) != prot) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (EACCES);
		}
	}
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	return (0);
}
static int
segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	if (pgno != 0) {
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
		if (svd->pageprot == 0) {
			do {
				protv[--pgno] = svd->prot;
			} while (pgno != 0);
		} else {
			size_t pgoff = seg_page(seg, addr);

			do {
				pgno--;
				protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]);
			} while (pgno != 0);
		}
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	}
	return (0);
}
static u_offset_t
segvn_getoffset(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	return (svd->offset + (uintptr_t)(addr - seg->s_base));
}
static int
segvn_gettype(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT |
	    MAP_INITDATA)));
}
static int
segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    *vpp = svd->vp;
    return (0);
}
/*
 * Check to see if it makes sense to do kluster/read ahead to
 * addr + delta relative to the mapping at addr.  We assume here
 * that delta is a signed PAGESIZE'd multiple (which can be negative).
 *
 * For segvn, we currently "approve" of the action if we are
 * still in the segment and it maps from the same vp/off,
 * or if the advice stored in segvn_data or vpages allows it.
 * Currently, klustering is not allowed only if MADV_RANDOM is set.
 */
static int
segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct anon *oap, *ap;
    ssize_t pd;
    pgcnt_t page;
    struct vnode *vp1, *vp2;
    u_offset_t off1, off2;
    struct anon_map *amp;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
    ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
        SEGVN_LOCK_HELD(seg->s_as, &svd->lock));

    if (addr + delta < seg->s_base ||
        addr + delta >= (seg->s_base + seg->s_size))
        return (-1);            /* exceeded segment bounds */

    pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */
    page = seg_page(seg, addr);

    /*
     * Check to see if either of the pages addr or addr + delta
     * have advice set that prevents klustering (if MADV_RANDOM advice
     * is set for entire segment, or MADV_SEQUENTIAL is set and delta
     * is negative).
     */
    if (svd->advice == MADV_RANDOM ||
        svd->advice == MADV_SEQUENTIAL && delta < 0)
        return (-1);
    else if (svd->pageadvice && svd->vpage) {
        struct vpage *bvpp, *evpp;

        bvpp = &svd->vpage[page];
        evpp = &svd->vpage[page + pd];
        if (VPP_ADVICE(bvpp) == MADV_RANDOM ||
            VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0)
            return (-1);
        if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) &&
            VPP_ADVICE(evpp) == MADV_RANDOM)
            return (-1);
    }

    if (svd->type == MAP_SHARED)
        return (0);             /* shared mapping - all ok */

    if ((amp = svd->amp) == NULL)
        return (0);             /* off original vnode */

    page += svd->anon_index;

    ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);

    oap = anon_get_ptr(amp->ahp, page);
    ap = anon_get_ptr(amp->ahp, page + pd);

    ANON_LOCK_EXIT(&amp->a_rwlock);

    if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) {
        return (-1);            /* one with and one without an anon */
    }

    if (oap == NULL) {          /* implies that ap == NULL */
        return (0);             /* off original vnode */
    }

    /*
     * Now we know we have two anon pointers - check to
     * see if they happen to be properly allocated.
     *
     * XXX We cheat here and don't lock the anon slots. We can't because
     * we may have been called from the anon layer which might already
     * have locked them. We are holding a refcnt on the slots so they
     * can't disappear. The worst that will happen is we'll get the wrong
     * names (vp, off) for the slots and make a poor klustering decision.
     */
    swap_xlate(ap, &vp1, &off1);
    swap_xlate(oap, &vp2, &off2);

    if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta)
        return (-1);

    return (0);
}
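/*
 * Note (added annotation, not in the original source): the kluster check is
 * purely advisory.  For example, with a 4K PAGESIZE a read-ahead probe for
 * the following page passes delta = 4096, so pd = 1 and the advice of
 * vpage[page] and vpage[page + 1] is compared; a negative delta is refused
 * under MADV_SEQUENTIAL because pages behind the current access are assumed
 * to be no longer needed.
 */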
/*
 * Swap the pages of seg out to secondary storage, returning the
 * number of bytes of storage freed.
 *
 * The basic idea is first to unload all translations and then to call
 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the
 * swap device.  Pages to which other segments have mappings will remain
 * mapped and won't be swapped.  Our caller (as_swapout) has already
 * performed the unloading step.
 *
 * The value returned is intended to correlate well with the process's
 * memory requirements.  However, there are some caveats:
 * 1)	When given a shared segment as argument, this routine will
 *	only succeed in swapping out pages for the last sharer of the
 *	segment.  (Previous callers will only have decremented mapping
 *	reference counts.)
 * 2)	We assume that the hat layer maintains a large enough translation
 *	cache to capture process reference patterns.
 */
static size_t
segvn_swapout(struct seg *seg)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct anon_map *amp;
    pgcnt_t pgcnt = 0;
    pgcnt_t npages;
    pgcnt_t page;
    ulong_t anon_index;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
    /*
     * Find pages unmapped by our caller and force them
     * out to the virtual swap device.
     */
    if ((amp = svd->amp) != NULL)
        anon_index = svd->anon_index;
    npages = seg->s_size >> PAGESHIFT;
    for (page = 0; page < npages; page++) {
        page_t *pp;
        struct anon *ap;
        struct vnode *vp;
        u_offset_t off;
        anon_sync_obj_t cookie;

        /*
         * Obtain <vp, off> pair for the page, then look it up.
         *
         * Note that this code is willing to consider regular
         * pages as well as anon pages.  Is this appropriate here?
         */
        vp = NULL;
        if (amp != NULL) {
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
            if (anon_array_try_enter(amp, anon_index + page,
                &cookie)) {
                ANON_LOCK_EXIT(&amp->a_rwlock);
                continue;
            }
            ap = anon_get_ptr(amp->ahp, anon_index + page);
            if (ap != NULL) {
                swap_xlate(ap, &vp, &off);
            } else {
                vp = svd->vp;
                off = svd->offset + ptob(page);
            }
            anon_array_exit(&cookie);
            ANON_LOCK_EXIT(&amp->a_rwlock);
        } else {
            vp = svd->vp;
            off = svd->offset + ptob(page);
        }
        if (vp == NULL) {           /* untouched zfod page */
            continue;
        }

        pp = page_lookup_nowait(vp, off, SE_SHARED);
        if (pp == NULL)
            continue;

        /*
         * Examine the page to see whether it can be tossed out,
         * keeping track of how many we've found.
         */
        if (!page_tryupgrade(pp)) {
            /*
             * If the page has an i/o lock and no mappings,
             * it's very likely that the page is being
             * written out as a result of klustering.
             * Assume this is so and take credit for it here.
             */
            if (!page_io_trylock(pp)) {
                if (!hat_page_is_mapped(pp))
                    pgcnt++;
            } else {
                page_io_unlock(pp);
            }
            page_unlock(pp);
            continue;
        }
        ASSERT(!page_iolock_assert(pp));

        /*
         * Skip if page is locked or has mappings.
         * We don't need the page_struct_lock to look at lckcnt
         * and cowcnt because the page is exclusive locked.
         */
        if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
            hat_page_is_mapped(pp)) {
            page_unlock(pp);
            continue;
        }

        /*
         * dispose skips large pages so try to demote first.
         */
        if (pp->p_szc != 0 && !page_try_demote_pages(pp)) {
            page_unlock(pp);
            /*
             * XXX should skip the remaining page_t's of this
             * large page.
             */
            continue;
        }

        ASSERT(pp->p_szc == 0);

        /*
         * No longer mapped -- we can toss it out.  How
         * we do so depends on whether or not it's dirty.
         */
        if (hat_ismod(pp) && pp->p_vnode) {
            /*
             * We must clean the page before it can be
             * freed.  Setting B_FREE will cause pvn_done
             * to free the page when the i/o completes.
             * XXX:	This also causes it to be accounted
             *	as a pageout instead of a swap: need
             *	B_SWAPOUT bit to use instead of B_FREE.
             *
             * Hold the vnode before releasing the page lock
             * to prevent it from being freed and re-used by
             * some other thread.
             */
            VN_HOLD(vp);
            page_unlock(pp);

            /*
             * Queue all i/o requests for the pageout thread
             * to avoid saturating the pageout devices.
             */
            if (!queue_io_request(vp, off))
                VN_RELE(vp);
        } else {
            /*
             * The page was clean, free it.
             *
             * XXX: Can we ever encounter modified pages
             *	with no associated vnode here?
             */
            ASSERT(pp->p_vnode != NULL);
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_FREE, 0, kcred);
        }

        /*
         * Credit now even if i/o is in progress.
         */
        pgcnt++;
    }
    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

    /*
     * Wakeup pageout to initiate i/o on all queued requests.
     */
    cv_signal_pageout();
    return (ptob(pgcnt));
}
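/*
 * Note (added annotation, not in the original source): the return value is
 * ptob(pgcnt), i.e. the number of pages credited as freed converted to
 * bytes.  Dirty pages merely queued for the pageout thread are credited
 * immediately, so the value is an estimate of reclaimed memory rather than
 * a guarantee.
 */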
/*
 * Synchronize primary storage cache with real object in virtual memory.
 *
 * XXX - Anonymous pages should not be sync'ed out at all.
 */
static int
segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct vpage *vpp;
    page_t *pp;
    u_offset_t offset;
    struct vnode *vp;
    u_offset_t off;
    caddr_t eaddr;
    int bflags;
    int err = 0;
    int segtype;
    int pageprot;
    int prot;
    ulong_t anon_index;
    struct anon_map *amp;
    struct anon *ap;
    anon_sync_obj_t cookie;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);

    if (svd->softlockcnt > 0) {
        /*
         * If this is shared segment non 0 softlockcnt
         * means locked pages are still in use.
         */
        if (svd->type == MAP_SHARED) {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (EAGAIN);
        }

        /*
         * flush all pages from seg cache
         * otherwise we may deadlock in swap_putpage
         * for B_INVAL page (4175402).
         *
         * Even if we grab segvn WRITER's lock
         * here, there might be another thread which could've
         * successfully performed lookup/insert just before
         * we acquired the lock here.  So, grabbing either
         * lock here is of not much use.  Until we devise
         * a strategy at upper layers to solve the
         * synchronization issues completely, we expect
         * applications to handle this appropriately.
         */
        segvn_purge(seg);
        if (svd->softlockcnt > 0) {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (EAGAIN);
        }
    } else if (svd->type == MAP_SHARED && svd->amp != NULL &&
        svd->amp->a_softlockcnt > 0) {
        /*
         * Try to purge this amp's entries from pcache. It will
         * succeed only if other segments that share the amp have no
         * outstanding softlock's.
         */
        segvn_purge(seg);
        if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (EAGAIN);
        }
    }

    vpp = svd->vpage;
    offset = svd->offset + (uintptr_t)(addr - seg->s_base);
    bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
        ((flags & MS_INVALIDATE) ? B_INVAL : 0);

    if (attr) {
        pageprot = attr & ~(SHARED|PRIVATE);
        segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE;

        /*
         * We are done if the segment types don't match
         * or if we have segment level protections and
         * they don't match.
         */
        if (svd->type != segtype) {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (0);
        }
        if (vpp == NULL) {
            if (svd->prot != pageprot) {
                SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
                return (0);
            }
            prot = svd->prot;
        } else {
            vpp = &svd->vpage[seg_page(seg, addr)];
        }
    } else if (svd->vp && svd->amp == NULL &&
        (flags & MS_INVALIDATE) == 0) {

        /*
         * No attributes, no anonymous pages and MS_INVALIDATE flag
         * is not on, just use one big request.
         */
        err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
            bflags, svd->cred, NULL);
        SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
        return (err);
    }

    if ((amp = svd->amp) != NULL)
        anon_index = svd->anon_index + seg_page(seg, addr);

    for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) {
        ap = NULL;
        if (amp != NULL) {
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
            anon_array_enter(amp, anon_index, &cookie);
            ap = anon_get_ptr(amp->ahp, anon_index++);
            if (ap != NULL) {
                swap_xlate(ap, &vp, &off);
            } else {
                vp = svd->vp;
                off = offset;
            }
            anon_array_exit(&cookie);
            ANON_LOCK_EXIT(&amp->a_rwlock);
        } else {
            vp = svd->vp;
            off = offset;
        }
        offset += PAGESIZE;

        if (vp == NULL)             /* untouched zfod page */
            continue;

        if (attr) {
            if (vpp) {
                prot = VPP_PROT(vpp);
                vpp++;
            }
            if (prot != pageprot) {
                continue;
            }
        }

        /*
         * See if any of these pages are locked --  if so, then we
         * will have to truncate an invalidate request at the first
         * locked one. We don't need the page_struct_lock to test
         * as this is only advisory; even if we acquire it someone
         * might race in and lock the page after we unlock and before
         * we do the PUTPAGE, then PUTPAGE simply does nothing.
         */
        if (flags & MS_INVALIDATE) {
            if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
                if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
                    page_unlock(pp);
                    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
                    return (EBUSY);
                }
                if (ap != NULL && pp->p_szc != 0 &&
                    page_tryupgrade(pp)) {
                    if (pp->p_lckcnt == 0 &&
                        pp->p_cowcnt == 0) {
                        /*
                         * swapfs VN_DISPOSE() won't
                         * invalidate large pages.
                         * Attempt to demote.
                         * XXX can't help it if it
                         * fails. But for swapfs
                         * pages it is no big deal.
                         */
                        (void) page_try_demote_pages(
                            pp);
                    }
                }
                page_unlock(pp);
            }
        } else if (svd->type == MAP_SHARED && amp != NULL) {
            /*
             * Avoid writing out to disk ISM's large pages
             * because segspt_free_pages() relies on NULL an_pvp
             * of anon slots of such pages.
             */

            ASSERT(svd->vp == NULL);
            /*
             * swapfs uses page_lookup_nowait if not freeing or
             * invalidating and skips a page if
             * page_lookup_nowait returns NULL.
             */
            pp = page_lookup_nowait(vp, off, SE_SHARED);
            if (pp == NULL) {
                continue;
            }
            if (pp->p_szc != 0) {
                page_unlock(pp);
                continue;
            }

            /*
             * Note ISM pages are created large so (vp, off)'s
             * page cannot suddenly become large after we unlock
             * pp.
             */
            page_unlock(pp);
        }
        /*
         * XXX - Should ultimately try to kluster
         * calls to VOP_PUTPAGE() for performance.
         */
        VN_HOLD(vp);
        err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
            (bflags | (IS_SWAPFSVP(vp) ? B_PAGE_NOWAIT : 0)),
            svd->cred, NULL);
        VN_RELE(vp);
        if (err)
            break;
    }
    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
    return (err);
}
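/*
 * Note (added annotation, not in the original source): msync(3C) flags are
 * translated to VOP_PUTPAGE() b-flags above: MS_ASYNC maps to B_ASYNC and
 * MS_INVALIDATE to B_INVAL.  An invalidate request stops at the first locked
 * page (non zero p_lckcnt or p_cowcnt) found in the range, since locked
 * pages must not be invalidated.
 */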
/*
 * Determine if we have data corresponding to pages in the
 * primary storage virtual memory cache (i.e., "in core").
 */
static size_t
segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct vnode *vp, *avp;
    u_offset_t offset, aoffset;
    size_t p, ep;
    int ret;
    struct vpage *vpp;
    page_t *pp;
    uint_t start;
    struct anon_map *amp;           /* XXX - for locknest */
    struct anon *ap;
    uint_t attr;
    anon_sync_obj_t cookie;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
    if (svd->amp == NULL && svd->vp == NULL) {
        SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
        bzero(vec, btopr(len));
        return (len);   /* no anonymous pages created yet */
    }

    p = seg_page(seg, addr);
    ep = seg_page(seg, addr + len);
    start = svd->vp ? SEG_PAGE_VNODEBACKED : 0;

    amp = svd->amp;
    for (; p < ep; p++, addr += PAGESIZE) {
        vpp = (svd->vpage) ? &svd->vpage[p]: NULL;
        ret = start;
        ap = NULL;
        avp = NULL;
        /* Grab the vnode/offset for the anon slot */
        if (amp != NULL) {
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
            anon_array_enter(amp, svd->anon_index + p, &cookie);
            ap = anon_get_ptr(amp->ahp, svd->anon_index + p);
            if (ap != NULL) {
                swap_xlate(ap, &avp, &aoffset);
            }
            anon_array_exit(&cookie);
            ANON_LOCK_EXIT(&amp->a_rwlock);
        }
        if ((avp != NULL) && page_exists(avp, aoffset)) {
            /* A page exists for the anon slot */
            ret |= SEG_PAGE_INCORE;

            /*
             * If page is mapped and writable
             */
            attr = (uint_t)0;
            if ((hat_getattr(seg->s_as->a_hat, addr,
                &attr) != -1) && (attr & PROT_WRITE)) {
                ret |= SEG_PAGE_ANON;
            }
            /*
             * Don't get page_struct lock for lckcnt and cowcnt,
             * since this is purely advisory.
             */
            if ((pp = page_lookup_nowait(avp, aoffset,
                SE_SHARED)) != NULL) {
                if (pp->p_lckcnt)
                    ret |= SEG_PAGE_SOFTLOCK;
                if (pp->p_cowcnt)
                    ret |= SEG_PAGE_HASCOW;
                page_unlock(pp);
            }
        }

        /* Gather vnode statistics */
        vp = svd->vp;
        offset = svd->offset + (uintptr_t)(addr - seg->s_base);

        if (vp != NULL) {
            /*
             * Try to obtain a "shared" lock on the page
             * without blocking.  If this fails, determine
             * if the page is in memory.
             */
            pp = page_lookup_nowait(vp, offset, SE_SHARED);
            if ((pp == NULL) && (page_exists(vp, offset))) {
                /* Page is incore, and is named */
                ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
            }
            /*
             * Don't get page_struct lock for lckcnt and cowcnt,
             * since this is purely advisory.
             */
            if (pp != NULL) {
                ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
                if (pp->p_lckcnt)
                    ret |= SEG_PAGE_SOFTLOCK;
                if (pp->p_cowcnt)
                    ret |= SEG_PAGE_HASCOW;
                page_unlock(pp);
            }
        }

        /* Gather virtual page information */
        if (vpp) {
            if (VPP_ISPPLOCK(vpp))
                ret |= SEG_PAGE_LOCKED;
            vpp++;
        }

        *vec++ = (char)ret;
    }
    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
    return (len);
}
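/*
 * Note (added annotation, not in the original source): segvn_incore() builds
 * one status byte per page in vec[]: SEG_PAGE_INCORE/SEG_PAGE_ANON derived
 * from the anon slot, SEG_PAGE_VNODE for vnode-backed pages, plus the
 * advisory SEG_PAGE_SOFTLOCK/SEG_PAGE_HASCOW bits taken from
 * p_lckcnt/p_cowcnt without the page_struct lock.
 */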
/*
 * Statement for p_cowcnts/p_lckcnts.
 *
 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region
 * irrespective of the following factors or anything else:
 *
 *	(1) anon slots are populated or not
 *	(2) cow is broken or not
 *	(3) refcnt on ap is 1 or greater than 1
 *
 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock
 * and munlock.
 *
 * Handling p_cowcnts/p_lckcnts during copy-on-write fault:
 *
 *	if vpage has PROT_WRITE
 *		transfer cowcnt on the oldpage -> cowcnt on the newpage
 *	else
 *		transfer lckcnt on the oldpage -> lckcnt on the newpage
 *
 *	During copy-on-write, decrement p_cowcnt on the oldpage and increment
 *	p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE.
 *
 *	We may also break COW if softlocking on read access in the physio case.
 *	In this case, vpage may not have PROT_WRITE. So, we need to decrement
 *	p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the
 *	vpage doesn't have PROT_WRITE.
 *
 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region:
 *
 *	If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and
 *	increment p_lckcnt by calling page_subclaim() which takes care of
 *	availrmem accounting and p_lckcnt overflow.
 *
 *	If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and
 *	increment p_cowcnt by calling page_addclaim() which takes care of
 *	availrmem availability and p_cowcnt overflow.
 */
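/*
 * Worked example (added annotation, assuming 4K pages; not in the original
 * source): if a process mlock()s one MAP_PRIVATE PROT_WRITE page, p_cowcnt
 * of the backing page is incremented.  If mprotect() then removes PROT_WRITE
 * while the lock is held, page_subclaim() moves the accounting from p_cowcnt
 * to p_lckcnt; restoring PROT_WRITE moves it back via page_addclaim().
 */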
/*
 * Lock down (or unlock) pages mapped by this segment.
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
 */
static int
segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
    int attr, int op, ulong_t *lockmap, size_t pos)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct vpage *vpp;
    struct vpage *evp;
    page_t *pp;
    u_offset_t offset;
    u_offset_t off;
    int segtype;
    int pageprot;
    int claim;
    struct vnode *vp;
    ulong_t anon_index;
    struct anon_map *amp;
    struct anon *ap;
    struct vattr va;
    anon_sync_obj_t cookie;
    struct kshmid *sp = NULL;
    struct proc *p = curproc;
    kproject_t *proj = NULL;
    int chargeproc = 1;
    size_t locked_bytes = 0;
    size_t unlocked_bytes = 0;
    int err = 0;

    /*
     * Hold write lock on address space because may split or concatenate
     * segments
     */
    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    /*
     * If this is a shm, use shm's project and zone, else use
     * project and zone of calling process
     */

    /* Determine if this segment backs a sysV shm */
    if (svd->amp != NULL && svd->amp->a_sp != NULL) {
        ASSERT(svd->type == MAP_SHARED);
        ASSERT(svd->tr_state == SEGVN_TR_OFF);
        sp = svd->amp->a_sp;
        proj = sp->shm_perm.ipc_proj;
        chargeproc = 0;
    }

    SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
    if (attr) {
        pageprot = attr & ~(SHARED|PRIVATE);
        segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE;

        /*
         * We are done if the segment types don't match
         * or if we have segment level protections and
         * they don't match.
         */
        if (svd->type != segtype) {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (0);
        }
        if (svd->pageprot == 0 && svd->prot != pageprot) {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (0);
        }
    }

    if (op == MC_LOCK) {
        if (svd->tr_state == SEGVN_TR_INIT) {
            svd->tr_state = SEGVN_TR_OFF;
        } else if (svd->tr_state == SEGVN_TR_ON) {
            ASSERT(svd->amp != NULL);
            segvn_textunrepl(seg, 0);
            ASSERT(svd->amp == NULL &&
                svd->tr_state == SEGVN_TR_OFF);
        }
    }

    /*
     * If we're locking, then we must create a vpage structure if
     * none exists.  If we're unlocking, then check to see if there
     * is a vpage --  if not, then we could not have locked anything.
     */

    if ((vpp = svd->vpage) == NULL) {
        if (op == MC_LOCK) {
            segvn_vpage(seg);
        } else {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (0);
        }
    }

    /*
     * The anonymous data vector (i.e., previously
     * unreferenced mapping to swap space) can be allocated
     * by lazily testing for its existence.
     */
    if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) {
        ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
        svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
        svd->amp->a_szc = seg->s_szc;
    }

    if ((amp = svd->amp) != NULL) {
        anon_index = svd->anon_index + seg_page(seg, addr);
    }

    offset = svd->offset + (uintptr_t)(addr - seg->s_base);
    evp = &svd->vpage[seg_page(seg, addr + len)];

    if (sp != NULL)
        mutex_enter(&sp->shm_mlock);

    /* determine number of unlocked bytes in range for lock operation */
    if (op == MC_LOCK) {

        if (sp == NULL) {
            for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
                vpp++) {
                if (!VPP_ISPPLOCK(vpp))
                    unlocked_bytes += PAGESIZE;
            }
        } else {
            ulong_t         i_idx, i_edx;
            anon_sync_obj_t i_cookie;
            struct anon     *i_ap;
            struct vnode    *i_vp;
            u_offset_t      i_off;

            /* Only count sysV pages once for locked memory */
            i_edx = svd->anon_index + seg_page(seg, addr + len);
            ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
            for (i_idx = anon_index; i_idx < i_edx; i_idx++) {
                anon_array_enter(amp, i_idx, &i_cookie);
                i_ap = anon_get_ptr(amp->ahp, i_idx);
                if (i_ap == NULL) {
                    unlocked_bytes += PAGESIZE;
                    anon_array_exit(&i_cookie);
                    continue;
                }
                swap_xlate(i_ap, &i_vp, &i_off);
                anon_array_exit(&i_cookie);
                pp = page_lookup(i_vp, i_off, SE_SHARED);
                if (pp == NULL) {
                    unlocked_bytes += PAGESIZE;
                    continue;
                } else if (pp->p_lckcnt == 0)
                    unlocked_bytes += PAGESIZE;
                page_unlock(pp);
            }
            ANON_LOCK_EXIT(&amp->a_rwlock);
        }

        mutex_enter(&p->p_lock);
        err = rctl_incr_locked_mem(p, proj, unlocked_bytes,
            chargeproc);
        mutex_exit(&p->p_lock);

        if (err) {
            if (sp != NULL)
                mutex_exit(&sp->shm_mlock);
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (err);
        }
    }
    /*
     * Loop over all pages in the range.  Process if we're locking and
     * page has not already been locked in this mapping; or if we're
     * unlocking and the page has been locked.
     */
    for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
        vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) {
        if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
            ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) ||
            (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) {

            if (amp != NULL)
                ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
            /*
             * If this isn't a MAP_NORESERVE segment and
             * we're locking, allocate anon slots if they
             * don't exist.  The page is brought in later on.
             */
            if (op == MC_LOCK && svd->vp == NULL &&
                ((svd->flags & MAP_NORESERVE) == 0) &&
                amp != NULL &&
                ((ap = anon_get_ptr(amp->ahp, anon_index))
                == NULL)) {
                anon_array_enter(amp, anon_index, &cookie);

                if ((ap = anon_get_ptr(amp->ahp,
                    anon_index)) == NULL) {
                    pp = anon_zero(seg, addr, &ap,
                        svd->cred);
                    if (pp == NULL) {
                        anon_array_exit(&cookie);
                        ANON_LOCK_EXIT(&amp->a_rwlock);
                        err = ENOMEM;
                        goto out;
                    }
                    ASSERT(anon_get_ptr(amp->ahp,
                        anon_index) == NULL);
                    (void) anon_set_ptr(amp->ahp,
                        anon_index, ap, ANON_SLEEP);
                    page_unlock(pp);
                }
                anon_array_exit(&cookie);
            }

            /*
             * Get name for page, accounting for
             * existence of private copy.
             */
            ap = NULL;
            if (amp != NULL) {
                anon_array_enter(amp, anon_index, &cookie);
                ap = anon_get_ptr(amp->ahp, anon_index);
                if (ap != NULL) {
                    swap_xlate(ap, &vp, &off);
                } else {
                    if (svd->vp == NULL &&
                        (svd->flags & MAP_NORESERVE)) {
                        anon_array_exit(&cookie);
                        ANON_LOCK_EXIT(&amp->a_rwlock);
                        continue;
                    }
                    vp = svd->vp;
                    off = offset;
                }
                if (op != MC_LOCK || ap == NULL) {
                    anon_array_exit(&cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                }
            } else {
                vp = svd->vp;
                off = offset;
            }

            /*
             * Get page frame.  It's ok if the page is
             * not available when we're unlocking, as this
             * may simply mean that a page we locked got
             * truncated out of existence after we locked it.
             *
             * Invoke VOP_GETPAGE() to obtain the page struct
             * since we may need to read it from disk if its
             * been paged out.
             */
            if (op != MC_LOCK)
                pp = page_lookup(vp, off, SE_SHARED);
            else {
                page_t *pl[1 + 1];
                int error;

                ASSERT(vp != NULL);

                error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
                    (uint_t *)NULL, pl, PAGESIZE, seg, addr,
                    S_OTHER, svd->cred, NULL);

                if (error && ap != NULL) {
                    anon_array_exit(&cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                }

                /*
                 * If the error is EDEADLK then we must bounce
                 * up and drop all vm subsystem locks and then
                 * retry the operation later
                 * This behavior is a temporary measure because
                 * ufs/sds logging is badly designed and will
                 * deadlock if we don't allow this bounce to
                 * happen.  The real solution is to re-design
                 * the logging code to work properly.  See bug
                 * 4125102 for details of the problem.
                 */
                if (error == EDEADLK) {
                    err = error;
                    goto out;
                }
                /*
                 * Quit if we fail to fault in the page.  Treat
                 * the failure as an error, unless the addr
                 * is mapped beyond the end of a file.
                 */
                if (error && svd->vp) {
                    va.va_mask = AT_SIZE;
                    if (VOP_GETATTR(svd->vp, &va, 0,
                        svd->cred, NULL) != 0) {
                        err = EIO;
                        goto out;
                    }
                    if (btopr(va.va_size) >=
                        btopr(off + 1)) {
                        err = EIO;
                    }
                    goto out;
                } else if (error) {
                    err = EIO;
                    goto out;
                }
                pp = pl[0];
                ASSERT(pp != NULL);
            }

            /*
             * See Statement at the beginning of this routine.
             *
             * claim is always set if MAP_PRIVATE and PROT_WRITE
             * irrespective of following factors:
             *
             *	(1) anon slots are populated or not
             *	(2) cow is broken or not
             *	(3) refcnt on ap is 1 or greater than 1
             *
             * See 4140683 for details
             */
            claim = ((VPP_PROT(vpp) & PROT_WRITE) &&
                (svd->type == MAP_PRIVATE));

            /*
             * Perform page-level operation appropriate to
             * operation.  If locking, undo the SOFTLOCK
             * performed to bring the page into memory
             * after setting the lock.  If unlocking,
             * and no page was found, account for the claim
             * separately.
             */
            if (op == MC_LOCK) {
                int ret = 1;    /* Assume success */

                ASSERT(!VPP_ISPPLOCK(vpp));

                ret = page_pp_lock(pp, claim, 0);
                if (ap != NULL) {
                    if (ap->an_pvp != NULL) {
                        anon_swap_free(ap, pp);
                    }
                    anon_array_exit(&cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                }
                if (ret == 0) {
                    /* locking page failed */
                    page_unlock(pp);
                    err = EAGAIN;
                    goto out;
                }
                VPP_SETPPLOCK(vpp);
                if (sp != NULL) {
                    if (pp->p_lckcnt == 1)
                        locked_bytes += PAGESIZE;
                } else
                    locked_bytes += PAGESIZE;

                if (lockmap != (ulong_t *)NULL)
                    BT_SET(lockmap, pos);

                page_unlock(pp);
            } else {
                ASSERT(VPP_ISPPLOCK(vpp));
                if (pp != NULL) {
                    /* sysV pages should be locked */
                    ASSERT(sp == NULL || pp->p_lckcnt > 0);
                    page_pp_unlock(pp, claim, 0);
                    if (sp != NULL) {
                        if (pp->p_lckcnt == 0)
                            unlocked_bytes +=
                                PAGESIZE;
                    } else
                        unlocked_bytes += PAGESIZE;
                    page_unlock(pp);
                } else {
                    ASSERT(sp == NULL);
                    unlocked_bytes += PAGESIZE;
                }
                VPP_CLRPPLOCK(vpp);
            }
        }
    }
out:
    if (op == MC_LOCK) {
        /* Credit back bytes that did not get locked */
        if ((unlocked_bytes - locked_bytes) > 0) {
            if (proj == NULL)
                mutex_enter(&p->p_lock);
            rctl_decr_locked_mem(p, proj,
                (unlocked_bytes - locked_bytes), chargeproc);
            if (proj == NULL)
                mutex_exit(&p->p_lock);
        }

    } else {
        /* Account bytes that were unlocked */
        if (unlocked_bytes > 0) {
            if (proj == NULL)
                mutex_enter(&p->p_lock);
            rctl_decr_locked_mem(p, proj, unlocked_bytes,
                chargeproc);
            if (proj == NULL)
                mutex_exit(&p->p_lock);
        }
    }
    if (sp != NULL)
        mutex_exit(&sp->shm_mlock);
    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

    return (err);
}
/*
 * Set advice from user for specified pages
 * There are 9 types of advice:
 *	MADV_NORMAL	- Normal (default) behavior (whatever that is)
 *	MADV_RANDOM	- Random page references
 *				do not allow readahead or 'klustering'
 *	MADV_SEQUENTIAL	- Sequential page references
 *				Pages previous to the one currently being
 *				accessed (determined by fault) are 'not needed'
 *				and are freed immediately
 *	MADV_WILLNEED	- Pages are likely to be used (fault ahead in mctl)
 *	MADV_DONTNEED	- Pages are not needed (synced out in mctl)
 *	MADV_FREE	- Contents can be discarded
 *	MADV_ACCESS_DEFAULT- Default access
 *	MADV_ACCESS_LWP	- Next LWP will access heavily
 *	MADV_ACCESS_MANY- Many LWPs or processes will access heavily
 */
static int
segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    size_t page;
    int err = 0;
    int already_set;
    struct anon_map *amp;
    ulong_t anon_index;
    struct seg *next;
    lgrp_mem_policy_t policy;
    struct seg *prev;
    struct vnode *vp;

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

    /*
     * In case of MADV_FREE, we won't be modifying any segment private
     * data structures; so, we only need to grab READER's lock
     */
    if (behav != MADV_FREE) {
        SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
        if (svd->tr_state != SEGVN_TR_OFF) {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (0);
        }
    } else {
        SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
    }

    /*
     * Large pages are assumed to be only turned on when accesses to the
     * segment's address range have spatial and temporal locality. That
     * justifies ignoring MADV_SEQUENTIAL for large page segments.
     * Also, ignore advice affecting lgroup memory allocation
     * if don't need to do lgroup optimizations on this system
     */
    if ((behav == MADV_SEQUENTIAL &&
        (seg->s_szc != 0 || HAT_IS_REGION_COOKIE_VALID(svd->rcookie))) ||
        (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT ||
        behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) {
        SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
        return (0);
    }

    if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT ||
        behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) {
        /*
         * Since we are going to unload hat mappings
         * we first have to flush the cache. Otherwise
         * this might lead to system panic if another
         * thread is doing physio on the range whose
         * mappings are unloaded by madvise(3C).
         */
        if (svd->softlockcnt > 0) {
            /*
             * If this is shared segment non 0 softlockcnt
             * means locked pages are still in use.
             */
            if (svd->type == MAP_SHARED) {
                SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
                return (EAGAIN);
            }
            /*
             * Since we do have the segvn writers lock
             * nobody can fill the cache with entries
             * belonging to this seg during the purge.
             * The flush either succeeds or we still
             * have pending I/Os. In the later case,
             * madvise(3C) fails.
             */
            segvn_purge(seg);
            if (svd->softlockcnt > 0) {
                /*
                 * Since madvise(3C) is advisory and
                 * it's not part of UNIX98, madvise(3C)
                 * failure here doesn't cause any hardship.
                 * Note that we don't block in "as" layer.
                 */
                SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
                return (EAGAIN);
            }
        } else if (svd->type == MAP_SHARED && svd->amp != NULL &&
            svd->amp->a_softlockcnt > 0) {
            /*
             * Try to purge this amp's entries from pcache. It
             * will succeed only if other segments that share the
             * amp have no outstanding softlock's.
             */
            segvn_purge(seg);
        }
    }

    amp = svd->amp;
    vp = svd->vp;
    if (behav == MADV_FREE) {
        /*
         * MADV_FREE is not supported for segments with
         * underlying object; if anonmap is NULL, anon slots
         * are not yet populated and there is nothing for
         * us to do. As MADV_FREE is advisory, we don't
         * return error in either case.
         */
        if (vp != NULL || amp == NULL) {
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            return (0);
        }

        page = seg_page(seg, addr);
        ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
        anon_disclaim(amp, svd->anon_index + page, len);
        ANON_LOCK_EXIT(&amp->a_rwlock);
        SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
        return (0);
    }
    /*
     * If advice is to be applied to entire segment,
     * use advice field in seg_data structure
     * otherwise use appropriate vpage entry.
     */
    if ((addr == seg->s_base) && (len == seg->s_size)) {
        switch (behav) {
        case MADV_ACCESS_LWP:
        case MADV_ACCESS_MANY:
        case MADV_ACCESS_DEFAULT:
            /*
             * Set memory allocation policy for this segment
             */
            policy = lgrp_madv_to_policy(behav, len, svd->type);
            if (svd->type == MAP_SHARED)
                already_set = lgrp_shm_policy_set(policy, amp,
                    svd->anon_index, vp, svd->offset, len);
            else {
                /*
                 * For private memory, need writers lock on
                 * address space because the segment may be
                 * split or concatenated when changing policy
                 */
                if (AS_READ_HELD(seg->s_as,
                    &seg->s_as->a_lock)) {
                    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
                    return (IE_RETRY);
                }

                already_set = lgrp_privm_policy_set(policy,
                    &svd->policy_info, len);
            }

            /*
             * If policy set already and it shouldn't be reapplied,
             * don't do anything.
             */
            if (already_set &&
                !LGRP_MEM_POLICY_REAPPLICABLE(policy))
                break;

            /*
             * Mark any existing pages in given range for
             * migration
             */
            page_mark_migrate(seg, addr, len, amp, svd->anon_index,
                vp, svd->offset, 1);

            /*
             * If same policy set already or this is a shared
             * memory segment, don't need to try to concatenate
             * segment with adjacent ones.
             */
            if (already_set || svd->type == MAP_SHARED)
                break;

            /*
             * Try to concatenate this segment with previous
             * one and next one, since we changed policy for
             * this one and it may be compatible with adjacent
             * ones now.
             */
            prev = AS_SEGPREV(seg->s_as, seg);
            next = AS_SEGNEXT(seg->s_as, seg);

            if (next && next->s_ops == &segvn_ops &&
                addr + len == next->s_base)
                (void) segvn_concat(seg, next, 1);

            if (prev && prev->s_ops == &segvn_ops &&
                addr == prev->s_base + prev->s_size) {
                /*
                 * Drop lock for private data of current
                 * segment before concatenating (deleting) it
                 * and return IE_REATTACH to tell as_ctl() that
                 * current segment has changed
                 */
                SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
                if (!segvn_concat(prev, seg, 1))
                    err = IE_REATTACH;

                return (err);
            }
            break;

        case MADV_SEQUENTIAL:
            /*
             * unloading mapping guarantees
             * detection in segvn_fault
             */
            ASSERT(seg->s_szc == 0);
            ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
            hat_unload(seg->s_as->a_hat, addr, len,
                HAT_UNLOAD);
            /* FALLTHROUGH */
        case MADV_NORMAL:
        case MADV_RANDOM:
            svd->advice = (uchar_t)behav;
            svd->pageadvice = 0;
            break;
        case MADV_WILLNEED:     /* handled in memcntl */
        case MADV_DONTNEED:     /* handled in memcntl */
        case MADV_FREE:         /* handled above */
            break;
        default:
            err = EINVAL;
        }
    } else {
        caddr_t                 eaddr;
        struct seg              *new_seg;
        struct segvn_data       *new_svd;
        u_offset_t              off;
        caddr_t                 oldeaddr;

        page = seg_page(seg, addr);

        segvn_vpage(seg);

        switch (behav) {
            struct vpage *bvpp, *evpp;

        case MADV_ACCESS_LWP:
        case MADV_ACCESS_MANY:
        case MADV_ACCESS_DEFAULT:
            /*
             * Set memory allocation policy for portion of this
             * segment
             */

            /*
             * Align address and length of advice to page
             * boundaries for large pages
             */
            if (seg->s_szc != 0) {
                size_t  pgsz;

                pgsz = page_get_pagesize(seg->s_szc);
                addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
                len = P2ROUNDUP(len, pgsz);
            }

            /*
             * Check to see whether policy is set already
             */
            policy = lgrp_madv_to_policy(behav, len, svd->type);

            anon_index = svd->anon_index + page;
            off = svd->offset + (uintptr_t)(addr - seg->s_base);

            if (svd->type == MAP_SHARED)
                already_set = lgrp_shm_policy_set(policy, amp,
                    anon_index, vp, off, len);
            else
                already_set =
                    (policy == svd->policy_info.mem_policy);

            /*
             * If policy set already and it shouldn't be reapplied,
             * don't do anything.
             */
            if (already_set &&
                !LGRP_MEM_POLICY_REAPPLICABLE(policy))
                break;

            /*
             * For private memory, need writers lock on
             * address space because the segment may be
             * split or concatenated when changing policy
             */
            if (svd->type == MAP_PRIVATE &&
                AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) {
                SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
                return (IE_RETRY);
            }

            /*
             * Mark any existing pages in given range for
             * migration
             */
            page_mark_migrate(seg, addr, len, amp, svd->anon_index,
                vp, svd->offset, 1);

            /*
             * Don't need to try to split or concatenate
             * segments, since policy is same or this is a shared
             * memory segment
             */
            if (already_set || svd->type == MAP_SHARED)
                break;

            if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
                ASSERT(svd->amp == NULL);
                ASSERT(svd->tr_state == SEGVN_TR_OFF);
                ASSERT(svd->softlockcnt == 0);
                hat_leave_region(seg->s_as->a_hat, svd->rcookie,
                    HAT_REGION_TEXT);
                svd->rcookie = HAT_INVALID_REGION_COOKIE;
            }

            /*
             * Split off new segment if advice only applies to a
             * portion of existing segment starting in middle
             */
            new_seg = NULL;
            eaddr = addr + len;
            oldeaddr = seg->s_base + seg->s_size;
            if (addr > seg->s_base) {
                /*
                 * Must flush I/O page cache
                 * before splitting segment
                 */
                if (svd->softlockcnt > 0)
                    segvn_purge(seg);

                /*
                 * Split segment and return IE_REATTACH to tell
                 * as_ctl() that current segment changed
                 */
                new_seg = segvn_split_seg(seg, addr);
                new_svd = (struct segvn_data *)new_seg->s_data;
                err = IE_REATTACH;

                /*
                 * If new segment ends where old one
                 * did, try to concatenate the new
                 * segment with next one.
                 */
                if (eaddr == oldeaddr) {
                    /*
                     * Set policy for new segment
                     */
                    (void) lgrp_privm_policy_set(policy,
                        &new_svd->policy_info,
                        new_seg->s_size);

                    next = AS_SEGNEXT(new_seg->s_as,
                        new_seg);

                    if (next &&
                        next->s_ops == &segvn_ops &&
                        eaddr == next->s_base)
                        (void) segvn_concat(new_seg,
                            next, 1);
                }
            }

            /*
             * Split off end of existing segment if advice only
             * applies to a portion of segment ending before
             * end of the existing segment
             */
            if (eaddr < oldeaddr) {
                /*
                 * Must flush I/O page cache
                 * before splitting segment
                 */
                if (svd->softlockcnt > 0)
                    segvn_purge(seg);

                /*
                 * If beginning of old segment was already
                 * split off, use new segment to split end off
                 * from.
                 */
                if (new_seg != NULL && new_seg != seg) {
                    /*
                     * Split segment
                     */
                    (void) segvn_split_seg(new_seg, eaddr);

                    /*
                     * Set policy for new segment
                     */
                    (void) lgrp_privm_policy_set(policy,
                        &new_svd->policy_info,
                        new_seg->s_size);
                } else {
                    /*
                     * Split segment and return IE_REATTACH
                     * to tell as_ctl() that current
                     * segment changed
                     */
                    (void) segvn_split_seg(seg, eaddr);
                    err = IE_REATTACH;

                    (void) lgrp_privm_policy_set(policy,
                        &svd->policy_info, seg->s_size);

                    /*
                     * If new segment starts where old one
                     * did, try to concatenate it with
                     * previous segment.
                     */
                    if (addr == seg->s_base) {
                        prev = AS_SEGPREV(seg->s_as,
                            seg);

                        /*
                         * Drop lock for private data
                         * of current segment before
                         * concatenating (deleting) it
                         */
                        if (prev &&
                            prev->s_ops == &segvn_ops &&
                            addr == prev->s_base +
                            prev->s_size) {
                            SEGVN_LOCK_EXIT(
                                seg->s_as,
                                &svd->lock);
                            (void) segvn_concat(
                                prev, seg, 1);
                            return (err);
                        }
                    }
                }
            }
            break;
        case MADV_SEQUENTIAL:
            ASSERT(seg->s_szc == 0);
            ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
            hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
            /* FALLTHROUGH */
        case MADV_NORMAL:
        case MADV_RANDOM:
            bvpp = &svd->vpage[page];
            evpp = &svd->vpage[page + (len >> PAGESHIFT)];
            for (; bvpp < evpp; bvpp++)
                VPP_SETADVICE(bvpp, behav);
            svd->advice = MADV_NORMAL;
            break;
        case MADV_WILLNEED:     /* handled in memcntl */
        case MADV_DONTNEED:     /* handled in memcntl */
        case MADV_FREE:         /* handled above */
            break;
        default:
            err = EINVAL;
        }
    }
    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
    return (err);
}
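/*
 * Note (added annotation, not in the original source): MADV_ACCESS_* advice
 * may split or concatenate segments so that the lgroup memory policy applies
 * to exactly the advised range; the IE_RETRY and IE_REATTACH return values
 * tell as_ctl() to retry with the address space write-locked or to re-lookup
 * the segment after it changed.
 */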
/*
 * Create a vpage structure for this seg.
 */
static void
segvn_vpage(struct seg *seg)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    struct vpage *vp, *evp;

    ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));

    /*
     * If no vpage structure exists, allocate one.  Copy the protections
     * and the advice from the segment itself to the individual pages.
     */
    if (svd->vpage == NULL) {
        svd->pageadvice = 1;
        svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage),
            KM_SLEEP);
        evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
        for (vp = svd->vpage; vp < evp; vp++) {
            VPP_SETPROT(vp, svd->prot);
            VPP_SETADVICE(vp, svd->advice);
        }
    }
}
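/*
 * Note (added annotation, not in the original source): segvn_vpage() is the
 * lazy transition from segment-wide attributes to per-page attributes: the
 * vpage array is only allocated once some operation (mlock, mprotect or
 * madvise on a subrange) needs page granularity, and it is seeded from
 * svd->prot and svd->advice.
 */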
/*
 * Dump the pages belonging to this segvn segment.
 */
static void
segvn_dump(struct seg *seg)
{
    struct segvn_data *svd;
    page_t *pp;
    struct anon_map *amp;
    ulong_t anon_index;
    struct vnode *vp;
    u_offset_t off, offset;
    pfn_t pfn;
    pgcnt_t page, npages;
    caddr_t addr;

    npages = seg_pages(seg);
    svd = (struct segvn_data *)seg->s_data;
    vp = svd->vp;
    off = offset = svd->offset;
    addr = seg->s_base;

    if ((amp = svd->amp) != NULL) {
        anon_index = svd->anon_index;
        ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
    }

    for (page = 0; page < npages; page++, offset += PAGESIZE) {
        struct anon *ap;
        int we_own_it = 0;

        if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) {
            swap_xlate_nopanic(ap, &vp, &off);
        } else {
            vp = svd->vp;
            off = offset;
        }

        /*
         * If pp == NULL, the page either does not exist
         * or is exclusively locked.  So determine if it
         * exists before searching for it.
         */

        if ((pp = page_lookup_nowait(vp, off, SE_SHARED)))
            we_own_it = 1;
        else
            pp = page_exists(vp, off);

        if (pp) {
            pfn = page_pptonum(pp);
            dump_addpage(seg->s_as, addr, pfn);
            if (we_own_it)
                page_unlock(pp);
        }
        addr += PAGESIZE;
        dump_timeleft = dump_timeout;
    }

    if (amp != NULL)
        ANON_LOCK_EXIT(&amp->a_rwlock);
}
static uint32_t segvn_pglock_mtbf = 0;

#define	PCACHE_SHWLIST		((page_t *)-2)
#define	NOPCACHE_SHWLIST	((page_t *)-1)

/*
 * Lock/Unlock anon pages over a given range. Return shadow list. This routine
 * uses global segment pcache to cache shadow lists (i.e. pp arrays) of pages
 * to avoid the overhead of per page locking, unlocking for subsequent IOs to
 * the same parts of the segment. Currently shadow list creation is only
 * supported for pure anon segments. MAP_PRIVATE segment pcache entries are
 * tagged with segment pointer, starting virtual address and length. This
 * approach for MAP_SHARED segments may add many pcache entries for the same
 * set of pages and lead to long hash chains that decrease pcache lookup
 * performance. To avoid this issue for shared segments shared anon map and
 * starting anon index are used for pcache entry tagging. This allows all
 * segments to share pcache entries for the same anon range and reduces pcache
 * chain's length as well as memory overhead from duplicate shadow lists and
 * pcache hash entries.
 *
 * softlockcnt field in segvn_data structure counts the number of F_SOFTLOCK'd
 * pages via segvn_fault() and pagelock'd pages via this routine. But pagelock
 * part of softlockcnt accounting is done differently for private and shared
 * segments. In private segment case softlock is only incremented when a new
 * shadow list is created but not when an existing one is found via
 * seg_plookup(). pcache entries have reference count incremented/decremented
 * by each seg_plookup()/seg_pinactive() operation. Only entries that have 0
 * reference count can be purged (and purging is needed before segment can be
 * freed). When a private segment pcache entry is purged segvn_reclaim() will
 * decrement softlockcnt. Since in private segment case each of its pcache
 * entries only belongs to this segment we can expect that when
 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this
 * segment purge will succeed and softlockcnt will drop to 0. In shared
 * segment case reference count in pcache entry counts active locks from many
 * different segments so we can't expect segment purging to succeed even when
 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this
 * segment. To be able to determine when there're no pending pagelocks in
 * shared segment case we don't rely on purging to make softlockcnt drop to 0
 * but instead softlockcnt is incremented and decremented for every
 * segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call regardless if a new shadow
 * list was created or an existing one was found. When softlockcnt drops to 0
 * this segment no longer has any claims for pcached shadow lists and the
 * segment can be freed even if there're still active pcache entries
 * shared by this segment anon map. Shared segment pcache entries belong to
 * anon map and are typically removed when anon map is freed after all
 * processes destroy the segments that use this anon map.
 */
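/*
 * Note (added annotation, not in the original source): the tagging scheme
 * above means a private segment's pcache entries are keyed by
 * (seg, vaddr, len) while shared segments are keyed by (amp, anon index), so
 * two processes doing IO to the same shared anon range can reuse one shadow
 * list instead of building duplicates.
 */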
static int
segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
    enum lock_type type, enum seg_rw rw)
{
    struct segvn_data *svd = (struct segvn_data *)seg->s_data;
    size_t np;
    pgcnt_t adjustpages;
    pgcnt_t npages;
    ulong_t anon_index;
    uint_t protchk = (rw == S_READ) ? PROT_READ : PROT_WRITE;
    int error = 0;
    struct anon_map *amp;
    pgcnt_t anpgcnt;
    struct page **pplist, **pl, *pp;
    caddr_t a;
    size_t page;
    caddr_t lpgaddr, lpgeaddr;
    anon_sync_obj_t cookie;
    int anlock;
    struct anon_map *pamp;
    caddr_t paddr;
    seg_preclaim_cbfunc_t preclaim_callback;
    size_t pgsz;
    int use_pcache;
    size_t wlen;
    uint_t pflags = 0;
    int sftlck_sbase = 0;
    int sftlck_send = 0;

    if (type == L_PAGELOCK && segvn_pglock_mtbf) {
        hrtime_t ts = gethrtime();
        if ((ts % segvn_pglock_mtbf) == 0) {
            return (ENOTSUP);
        }
        if ((ts % segvn_pglock_mtbf) == 1) {
            return (EFAULT);
        }
    }

    TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START,
        "segvn_pagelock: start seg %p addr %p", seg, addr);

    ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
    ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);

    SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);

    /*
     * for now we only support pagelock to anon memory. We would have to
     * check protections for vnode objects and call into the vnode driver.
     * That's too much for a fast path. Let the fault entry point handle
     * it.
     */
    if (svd->vp != NULL) {
        if (type == L_PAGELOCK) {
            error = ENOTSUP;
            goto out;
        }
        panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL");
    }
    if ((amp = svd->amp) == NULL) {
        if (type == L_PAGELOCK) {
            error = EFAULT;
            goto out;
        }
        panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL");
    }
    if (rw != S_READ && rw != S_WRITE) {
        if (type == L_PAGELOCK) {
            error = ENOTSUP;
            goto out;
        }
        panic("segvn_pagelock(L_PAGEUNLOCK): bad rw");
    }

    if (seg->s_szc != 0) {
        /*
         * We are adjusting the pagelock region to the large page size
         * boundary because the unlocked part of a large page cannot
         * be freed anyway unless all constituent pages of a large
         * page are locked. Bigger regions reduce pcache chain length
         * and improve lookup performance. The tradeoff is that the
         * very first segvn_pagelock() call for a given page is more
         * expensive if only 1 page_t is needed for IO. This is only
         * an issue if pcache entry doesn't get reused by several
         * subsequent calls. We optimize here for the case when pcache
         * is heavily used by repeated IOs to the same address range.
         *
         * Note segment's page size cannot change while we are holding
         * as lock.  And then it cannot change while softlockcnt is
         * not 0. This will allow us to correctly recalculate large
         * page size region for the matching pageunlock/reclaim call
         * since as_pageunlock() caller must always match
         * as_pagelock() call's addr and len.
         *
         * For pageunlock *ppp points to the pointer of page_t that
         * corresponds to the real unadjusted start address. Similar
         * for pagelock *ppp must point to the pointer of page_t that
         * corresponds to the real unadjusted start address.
         */
        pgsz = page_get_pagesize(seg->s_szc);
        CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
        adjustpages = btop((uintptr_t)(addr - lpgaddr));
    } else if (len < segvn_pglock_comb_thrshld) {
        lpgaddr = addr;
        lpgeaddr = addr + len;
        adjustpages = 0;
        pgsz = PAGESIZE;
    } else {
        /*
         * Align the address range of large enough requests to allow
         * combining of different shadow lists into 1 to reduce memory
         * overhead from potentially overlapping large shadow lists
         * (worst case is we have a 1MB IO into buffers with start
         * addresses separated by 4K). Alignment is only possible if
         * padded chunks have sufficient access permissions. Note
         * permissions won't change between L_PAGELOCK and
         * L_PAGEUNLOCK calls since non 0 softlockcnt will force
         * segvn_setprot() to wait until softlockcnt drops to 0. This
         * allows us to determine in L_PAGEUNLOCK the same range we
         * computed in L_PAGELOCK.
         *
         * If alignment is limited by segment ends set
         * sftlck_sbase/sftlck_send flags. In L_PAGELOCK case when
         * these flags are set bump softlockcnt_sbase/softlockcnt_send
         * per segment counters. In L_PAGEUNLOCK case decrease
         * softlockcnt_sbase/softlockcnt_send counters if
         * sftlck_sbase/sftlck_send flags are set. When
         * softlockcnt_sbase/softlockcnt_send are non 0
         * segvn_concat()/segvn_extend_prev()/segvn_extend_next()
         * won't merge the segments. This restriction combined with
         * restriction on segment unmapping and splitting for segments
         * that have non 0 softlockcnt allows L_PAGEUNLOCK to
         * correctly determine the same range that was previously
         * locked by matching L_PAGELOCK.
         */
        pflags = SEGP_PSHIFT | (segvn_pglock_comb_bshift << 16);
        pgsz = PAGESIZE;
        if (svd->type == MAP_PRIVATE) {
            lpgaddr = (caddr_t)P2ALIGN((uintptr_t)addr,
                segvn_pglock_comb_balign);
            if (lpgaddr < seg->s_base) {
                lpgaddr = seg->s_base;
                sftlck_sbase = 1;
            }
        } else {
            ulong_t aix = svd->anon_index + seg_page(seg, addr);
            ulong_t aaix = P2ALIGN(aix, segvn_pglock_comb_palign);
            if (aaix < svd->anon_index) {
                lpgaddr = seg->s_base;
                sftlck_sbase = 1;
            } else {
                lpgaddr = addr - ptob(aix - aaix);
                ASSERT(lpgaddr >= seg->s_base);
            }
        }
        if (svd->pageprot && lpgaddr != addr) {
            struct vpage *vp = &svd->vpage[seg_page(seg, lpgaddr)];
            struct vpage *evp = &svd->vpage[seg_page(seg, addr)];

            while (vp < evp) {
                if ((VPP_PROT(vp) & protchk) == 0) {
                    break;
                }
                vp++;
            }
            if (vp < evp) {
                lpgaddr = addr;
                pflags = 0;
            }
        }
        lpgeaddr = addr + len;
        if (pflags) {
            if (svd->type == MAP_PRIVATE) {
                lpgeaddr = (caddr_t)P2ROUNDUP(
                    (uintptr_t)lpgeaddr,
                    segvn_pglock_comb_balign);
            } else {
                ulong_t aix = svd->anon_index +
                    seg_page(seg, lpgeaddr);
                ulong_t aaix = P2ROUNDUP(aix,
                    segvn_pglock_comb_palign);
                if (aaix < aix) {
                    lpgeaddr = 0;
                } else {
                    lpgeaddr += ptob(aaix - aix);
                }
            }
            if (lpgeaddr == 0 ||
                lpgeaddr > seg->s_base + seg->s_size) {
                lpgeaddr = seg->s_base + seg->s_size;
                sftlck_send = 1;
            }
        }
        if (svd->pageprot && lpgeaddr != addr + len) {
            struct vpage *vp;
            struct vpage *evp;

            vp = &svd->vpage[seg_page(seg, addr + len)];
            evp = &svd->vpage[seg_page(seg, lpgeaddr)];

            while (vp < evp) {
                if ((VPP_PROT(vp) & protchk) == 0) {
                    break;
                }
                vp++;
            }
            if (vp < evp) {
                lpgeaddr = addr + len;
            }
        }
        adjustpages = btop((uintptr_t)(addr - lpgaddr));
    }

    /*
     * For MAP_SHARED segments we create pcache entries tagged by amp and
     * anon index so that we can share pcache entries with other segments
     * that map this amp.  For private segments pcache entries are tagged
     * with segment and virtual address.
     */
    if (svd->type == MAP_SHARED) {
        pamp = amp;
        paddr = (caddr_t)((lpgaddr - seg->s_base) +
            ptob(svd->anon_index));
        preclaim_callback = shamp_reclaim;
    } else {
        pamp = NULL;
        paddr = lpgaddr;
        preclaim_callback = segvn_reclaim;
    }

    if (type == L_PAGEUNLOCK) {
        VM_STAT_ADD(segvnvmstats.pagelock[0]);

        /*
         * update hat ref bits for /proc. We need to make sure
         * that threads tracing the ref and mod bits of the
         * address space get the right data.
         * Note: page ref and mod bits are updated at reclaim time
         */
        if (seg->s_as->a_vbits) {
            for (a = addr; a < addr + len; a += PAGESIZE) {
                if (rw == S_WRITE) {
                    hat_setstat(seg->s_as, a,
                        PAGESIZE, P_REF | P_MOD);
                } else {
                    hat_setstat(seg->s_as, a,
                        PAGESIZE, P_REF);
                }
            }
        }

        /*
         * Check the shadow list entry after the last page used in
         * this IO request. If it's NOPCACHE_SHWLIST the shadow list
         * was not inserted into pcache and is not large page
         * adjusted.  In this case call reclaim callback directly and
         * don't adjust the shadow list start and size for large
         * pages.
         */
        npages = btop(len);
        if ((*ppp)[npages] == NOPCACHE_SHWLIST) {
            void *ptag;

            if (pamp != NULL) {
                ASSERT(svd->type == MAP_SHARED);
                ptag = (void *)pamp;
                paddr = (caddr_t)((addr - seg->s_base) +
                    ptob(svd->anon_index));
            } else {
                ptag = (void *)seg;
                paddr = addr;
            }
            (*preclaim_callback)(ptag, paddr, len, *ppp, rw, 0);
        } else {
            ASSERT((*ppp)[npages] == PCACHE_SHWLIST ||
                IS_SWAPFSVP((*ppp)[npages]->p_vnode));
            len = lpgeaddr - lpgaddr;
            npages = btop(len);
            seg_pinactive(seg, pamp, paddr, len,
                *ppp - adjustpages, rw, pflags, preclaim_callback);
        }

        if (pamp != NULL) {
            ASSERT(svd->type == MAP_SHARED);
            ASSERT(svd->softlockcnt >= npages);
            atomic_add_long((ulong_t *)&svd->softlockcnt, -npages);
        }

        if (sftlck_sbase) {
            ASSERT(svd->softlockcnt_sbase > 0);
            atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, -1);
        }
        if (sftlck_send) {
            ASSERT(svd->softlockcnt_send > 0);
            atomic_add_long((ulong_t *)&svd->softlockcnt_send, -1);
        }

        /*
         * If someone is blocked while unmapping, we purge
         * segment page cache and thus reclaim pplist synchronously
         * without waiting for seg_pasync_thread. This speeds up
         * unmapping in cases where munmap(2) is called, while
         * raw async i/o is still in progress or where a thread
         * exits on data fault in a multithreaded application.
         */
        if (AS_ISUNMAPWAIT(seg->s_as)) {
            if (svd->softlockcnt == 0) {
                mutex_enter(&seg->s_as->a_contents);
                if (AS_ISUNMAPWAIT(seg->s_as)) {
                    AS_CLRUNMAPWAIT(seg->s_as);
                    cv_broadcast(&seg->s_as->a_cv);
                }
                mutex_exit(&seg->s_as->a_contents);
            } else if (pamp == NULL) {
                /*
                 * softlockcnt is not 0 and this is a
                 * MAP_PRIVATE segment. Try to purge its
                 * pcache entries to reduce softlockcnt.
                 * If it drops to 0 segvn_reclaim()
                 * will wake up a thread waiting on
                 * unmapwait flag.
                 *
                 * We don't purge MAP_SHARED segments with non
                 * 0 softlockcnt since IO is still in progress
                 * for such segments.
                 */
                ASSERT(svd->type == MAP_PRIVATE);
                segvn_purge(seg);
            }
        }
        SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
        TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END,
            "segvn_pagelock: unlock seg %p addr %p", seg, addr);
        return (0);
    }
    /* The L_PAGELOCK case ... */

    VM_STAT_ADD(segvnvmstats.pagelock[1]);

    /*
     * For MAP_SHARED segments we have to check protections before
     * seg_plookup() since pcache entries may be shared by many segments
     * with potentially different page protections.
     */
    if (pamp != NULL) {
        ASSERT(svd->type == MAP_SHARED);
        if (svd->pageprot == 0) {
            if ((svd->prot & protchk) == 0) {
                error = EACCES;
                goto out;
            }
        } else {
            /*
             * check page protections
             */
            caddr_t ea;

            if (seg->s_szc) {
                a = lpgaddr;
                ea = lpgeaddr;
            } else {
                a = addr;
                ea = addr + len;
            }
            for (; a < ea; a += pgsz) {
                struct vpage *vp;

                ASSERT(seg->s_szc == 0 ||
                    sameprot(seg, a, pgsz));
                vp = &svd->vpage[seg_page(seg, a)];
                if ((VPP_PROT(vp) & protchk) == 0) {
                    error = EACCES;
                    goto out;
                }
            }
        }
    }

    /*
     * try to find pages in segment page cache
     */
    pplist = seg_plookup(seg, pamp, paddr, lpgeaddr - lpgaddr, rw, pflags);
    if (pplist != NULL) {
        if (pamp != NULL) {
            npages = btop((uintptr_t)(lpgeaddr - lpgaddr));
            ASSERT(svd->type == MAP_SHARED);
            atomic_add_long((ulong_t *)&svd->softlockcnt,
                npages);
        }
        if (sftlck_sbase) {
            atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, 1);
        }
        if (sftlck_send) {
            atomic_add_long((ulong_t *)&svd->softlockcnt_send, 1);
        }
        SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
        *ppp = pplist + adjustpages;
        TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END,
            "segvn_pagelock: cache hit seg %p addr %p", seg, addr);
        return (0);
    }

    /*
     * For MAP_SHARED segments we already verified above that segment
     * protections allow this pagelock operation.
     */
    if (pamp == NULL) {
        ASSERT(svd->type == MAP_PRIVATE);
        if (svd->pageprot == 0) {
            if ((svd->prot & protchk) == 0) {
                error = EACCES;
                goto out;
            }
            if (svd->prot & PROT_WRITE) {
                wlen = lpgeaddr - lpgaddr;
            } else {
                wlen = 0;
                ASSERT(rw == S_READ);
            }
        } else {
            int wcont = 1;
            /*
             * check page protections
             */
            for (a = lpgaddr, wlen = 0; a < lpgeaddr; a += pgsz) {
                struct vpage *vp;

                ASSERT(seg->s_szc == 0 ||
                    sameprot(seg, a, pgsz));
                vp = &svd->vpage[seg_page(seg, a)];
                if ((VPP_PROT(vp) & protchk) == 0) {
                    error = EACCES;
                    goto out;
                }
                if (wcont && (VPP_PROT(vp) & PROT_WRITE)) {
                    wlen += pgsz;
                } else {
                    wcont = 0;
                    ASSERT(rw == S_READ);
                }
            }
            ASSERT(rw == S_READ || wlen == lpgeaddr - lpgaddr);
            ASSERT(rw == S_WRITE || wlen <= lpgeaddr - lpgaddr);
        }
    }

    /*
     * Only build large page adjusted shadow list if we expect to insert
     * it into pcache. For large enough pages it's a big overhead to
     * create a shadow list of the entire large page. But this overhead
     * should be amortized over repeated pcache hits on subsequent reuse
     * of this shadow list (IO into any range within this shadow list will
     * find it in pcache since we large page align the request for pcache
     * lookups). pcache performance is improved with bigger shadow lists
     * as it reduces the time to pcache the entire big segment and reduces
     * pcache chain length.
     */
    if (seg_pinsert_check(seg, pamp, paddr,
        lpgeaddr - lpgaddr, pflags) == SEGP_SUCCESS) {
        addr = lpgaddr;
        len = lpgeaddr - lpgaddr;
        use_pcache = 1;
    } else {
        use_pcache = 0;
        /*
         * Since this entry will not be inserted into the pcache, we
         * will not do any adjustments to the starting address or
         * size of the memory to be locked.
         */
        adjustpages = 0;
    }
    npages = btop(len);

    pplist = kmem_alloc(sizeof (page_t *) * (npages + 1), KM_SLEEP);
    pl = pplist;
    *ppp = pplist + adjustpages;
    /*
     * If use_pcache is 0 this shadow list is not large page adjusted.
     * Record this info in the last entry of shadow array so that
     * L_PAGEUNLOCK can determine if it should large page adjust the
     * address range to find the real range that was locked.
     */
    pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST;

    page = seg_page(seg, addr);
    anon_index = svd->anon_index + page;

    anlock = 0;
    ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
    ASSERT(amp->a_szc >= seg->s_szc);
    anpgcnt = page_get_pagecnt(amp->a_szc);
    for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) {
        struct anon *ap;
        struct vnode *vp;
        u_offset_t off;

        /*
         * Lock and unlock anon array only once per large page.
         * anon_array_enter() locks the root anon slot according to
         * a_szc which can't change while anon map is locked.  We lock
         * anon the first time through this loop and each time we
         * reach anon index that corresponds to a root of a large
         * page.
         */
        if (a == addr || P2PHASE(anon_index, anpgcnt) == 0) {
            ASSERT(anlock == 0);
            anon_array_enter(amp, anon_index, &cookie);
            anlock = 1;
        }
        ap = anon_get_ptr(amp->ahp, anon_index);

        /*
         * We must never use seg_pcache for COW pages
         * because we might end up with original page still
         * lying in seg_pcache even after private page is
         * created. This leads to data corruption as
         * aio_write refers to the page still in cache
         * while all other accesses refer to the private
         * page.
         */
        if (ap == NULL || ap->an_refcnt != 1) {
            struct vpage *vpage;

            if (seg->s_szc) {
                error = EFAULT;
                break;
            }
            if (svd->vpage != NULL) {
                vpage = &svd->vpage[seg_page(seg, a)];
            } else {
                vpage = NULL;
            }
            ASSERT(anlock);
            anon_array_exit(&cookie);
            anlock = 0;
            ap = NULL;
            error = segvn_faultpage(seg->s_as->a_hat, seg, a, 0,
                vpage, &pp, 0, F_INVAL, rw, 1);
            if (error) {
                error = fc_decode(error);
                break;
            }
            anon_array_enter(amp, anon_index, &cookie);
            anlock = 1;
            ap = anon_get_ptr(amp->ahp, anon_index);
            if (ap == NULL || ap->an_refcnt != 1) {
                error = EFAULT;
                break;
            }
        }
        swap_xlate(ap, &vp, &off);
        pp = page_lookup_nowait(vp, off, SE_SHARED);
        if (pp == NULL) {
            error = EFAULT;
            break;
        }
        if (ap->an_pvp != NULL) {
            anon_swap_free(ap, pp);
        }
        /*
         * Unlock anon if this is the last slot in a large page.
         */
        if (P2PHASE(anon_index, anpgcnt) == anpgcnt - 1) {
            ASSERT(anlock);
            anon_array_exit(&cookie);
            anlock = 0;
        }
        *pplist++ = pp;
    }
    if (anlock) {           /* Ensure the lock is dropped */
        anon_array_exit(&cookie);
    }
    ANON_LOCK_EXIT(&amp->a_rwlock);

    if (a >= addr + len) {
        atomic_add_long((ulong_t *)&svd->softlockcnt, npages);
        if (pamp != NULL) {
            ASSERT(svd->type == MAP_SHARED);
            atomic_add_long((ulong_t *)&pamp->a_softlockcnt,
                npages);
            wlen = len;
        }
        if (sftlck_sbase) {
            atomic_add_long((ulong_t *)&svd->softlockcnt_sbase, 1);
        }
        if (sftlck_send) {
            atomic_add_long((ulong_t *)&svd->softlockcnt_send, 1);
        }
        if (use_pcache) {
            (void) seg_pinsert(seg, pamp, paddr, len, wlen, pl,
                rw, pflags, preclaim_callback);
        }
        SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
        TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END,
            "segvn_pagelock: cache fill seg %p addr %p", seg, addr);
        return (0);
    }

    pplist = pl;
    np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
    while (np > (uint_t)0) {
        ASSERT(PAGE_LOCKED(*pplist));
        page_unlock(*pplist);
        np--;
        pplist++;
    }
    kmem_free(pl, sizeof (page_t *) * (npages + 1));
out:
    SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
    *ppp = NULL;
    TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
        "segvn_pagelock: cache miss seg %p addr %p", seg, addr);
    return (error);
}
/*
 * purge any cached pages in the I/O page cache
 */
static void
segvn_purge(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;

	/*
	 * pcache is only used by pure anon segments.
	 */
	if (svd->amp == NULL || svd->vp != NULL) {
		return;
	}

	/*
	 * For MAP_SHARED segments a non-zero softlockcnt means active I/O is
	 * still in progress via this segment. So we only purge MAP_SHARED
	 * segments when their softlockcnt is 0.
	 */
	if (svd->type == MAP_PRIVATE) {
		if (svd->softlockcnt) {
			seg_ppurge(seg, NULL, 0);
		}
	} else if (svd->softlockcnt == 0 && svd->amp->a_softlockcnt != 0) {
		seg_ppurge(seg, svd->amp, 0);
	}
}
/*
 * If async argument is not 0 we are called from pcache async thread and don't
 * hold AS lock.
 */
static int
segvn_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
    enum seg_rw rw, int async)
{
	struct seg *seg = (struct seg *)ptag;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	npages = np = btop(len);

	ASSERT(svd->vp == NULL && svd->amp != NULL);
	ASSERT(svd->softlockcnt >= npages);
	ASSERT(async || AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));

	ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST);
	ASSERT(!async || pl[np] == PCACHE_SHWLIST);

	while (np > (uint_t)0) {
		if (rw == S_WRITE) {
			hat_setrefmod(*pplist);
		} else {
			hat_setref(*pplist);
		}
		page_unlock(*pplist);
		np--;
		pplist++;
	}

	kmem_free(pl, sizeof (page_t *) * (npages + 1));
	/*
	 * If we are pcache async thread we don't hold AS lock. This means if
	 * softlockcnt drops to 0 after the decrement below address space may
	 * get freed. We can't allow it since after softlock decrement to 0 we
	 * still need to access as structure for possible wakeup of unmap
	 * waiters. To prevent the disappearance of as we take this segment
	 * segfree_syncmtx. segvn_free() also takes this mutex as a barrier to
	 * make sure this routine completes before segment is freed.
	 *
	 * The second complication we have to deal with in async case is a
	 * possibility of missed wake up of unmap wait thread. When we don't
	 * hold as lock here we may take a_contents lock before unmap wait
	 * thread that was first to see softlockcnt was still not 0. As a
	 * result we'll fail to wake up an unmap wait thread. To avoid this
	 * race we set nounmapwait flag in as structure if we drop softlockcnt
	 * to 0 when we were called by pcache async thread. unmapwait thread
	 * will not block if this flag is set.
	 */
	mutex_enter(&svd->segfree_syncmtx);

	if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -npages)) {
		if (async || AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (async) {
				AS_SETNOUNMAPWAIT(seg->s_as);
			}
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}

	mutex_exit(&svd->segfree_syncmtx);

	return (0);
}
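/*
 * Reclaim callback used when the shadow list was inserted into pcache keyed
 * by the shared anon map rather than by the segment (the ptag is the amp).
 */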
static int
shamp_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
    enum seg_rw rw, int async)
{
	amp_t *amp = (amp_t *)ptag;

	npages = np = btop(len);

	ASSERT(amp->a_softlockcnt >= npages);

	ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST);
	ASSERT(!async || pl[np] == PCACHE_SHWLIST);

	while (np > (uint_t)0) {
		if (rw == S_WRITE) {
			hat_setrefmod(*pplist);
		} else {
			hat_setref(*pplist);
		}
		page_unlock(*pplist);
		np--;
		pplist++;
	}

	kmem_free(pl, sizeof (page_t *) * (npages + 1));
	/*
	 * If somebody sleeps in anonmap_purge() wake them up if a_softlockcnt
	 * drops to 0. anon map can't be freed until a_softlockcnt drops to 0
	 * and anonmap_purge() acquires a_purgemtx.
	 */
	mutex_enter(&amp->a_purgemtx);
	if (!atomic_add_long_nv((ulong_t *)&amp->a_softlockcnt, -npages) &&
	    amp->a_purgewait) {
		amp->a_purgewait = 0;
		cv_broadcast(&amp->a_purgecv);
	}
	mutex_exit(&amp->a_purgemtx);

	return (0);
}
/*
 * get a memory ID for an addr in a given segment
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
 */
static int
segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon *ap = NULL;
	struct anon_map *amp;
	anon_sync_obj_t cookie;

	if (svd->type == MAP_PRIVATE) {
		memidp->val[0] = (uintptr_t)seg->s_as;
		memidp->val[1] = (uintptr_t)addr;
		return (0);
	}

	if (svd->type == MAP_SHARED) {
		if (svd->vp) {
			memidp->val[0] = (uintptr_t)svd->vp;
			memidp->val[1] = (u_longlong_t)svd->offset +
			    (uintptr_t)(addr - seg->s_base);
			return (0);
		}
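		/*
		 * MAP_SHARED segment backed purely by anonymous memory:
		 * identify the page by its anon slot rather than by a vnode
		 * and offset.
		 */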
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
		if ((amp = svd->amp) != NULL) {
			anon_index = svd->anon_index +
			    seg_page(seg, addr);
		}
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

		ASSERT(amp != NULL);

		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		anon_array_enter(amp, anon_index, &cookie);
		ap = anon_get_ptr(amp->ahp, anon_index);
		if (ap == NULL) {
			pp = anon_zero(seg, addr, &ap, svd->cred);
			ASSERT(anon_get_ptr(amp->ahp, anon_index)
			    == NULL);
			(void) anon_set_ptr(amp->ahp, anon_index,
			    ap, ANON_SLEEP);
		}

		anon_array_exit(&cookie);
		ANON_LOCK_EXIT(&amp->a_rwlock);

		memidp->val[0] = (uintptr_t)ap;
		memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
		return (0);
	}
	return (EINVAL);
}
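/*
 * Return 1 if every page in the range [a, a + len) has the same per-page
 * protections, 0 otherwise.
 */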
static int
sameprot(struct seg *seg, caddr_t a, size_t len)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vpage;
	spgcnt_t pages = btop(len);

	if (svd->pageprot == 0)
		return (1);

	ASSERT(svd->vpage != NULL);

	vpage = &svd->vpage[seg_page(seg, a)];
	prot = VPP_PROT(vpage);

	while (pages-- > 0) {
		if (prot != VPP_PROT(vpage))
			return (0);
		vpage++;
	}
	return (1);
}
/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segvn_getpolicy(struct seg *seg, caddr_t addr)
{
	struct anon_map *amp;
	lgrp_mem_policy_info_t *policy_info;
	struct segvn_data *svn_data;

	ASSERT(seg != NULL);

	svn_data = (struct segvn_data *)seg->s_data;
	if (svn_data == NULL)
		return (NULL);

	/*
	 * Get policy info for private or shared memory
	 */
	if (svn_data->type != MAP_SHARED) {
		if (svn_data->tr_state != SEGVN_TR_ON) {
			policy_info = &svn_data->policy_info;
		} else {
			policy_info = &svn_data->tr_policy_info;
			ASSERT(policy_info->mem_policy ==
			    LGRP_MEM_POLICY_NEXT_SEG);
		}
	} else {
		amp = svn_data->amp;
		anon_index = svn_data->anon_index + seg_page(seg, addr);
		vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
		policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
	}

	return (policy_info);
}
static int
segvn_capable(struct seg *seg, segcapability_t capability)
/*
 * Bind text vnode segment to an amp. If we bind successfully mappings will be
 * established to per vnode mapping per lgroup amp pages instead of to vnode
 * pages. There's one amp per vnode text mapping per lgroup. Many processes
 * may share the same text replication amp. If a suitable amp doesn't already
 * exist in svntr hash table create a new one. We may fail to bind to amp if
 * segment is not eligible for text replication. Code below first checks for
 * these conditions. If binding is successful segment tr_state is set to on
 * and svd->amp points to the amp to use. Otherwise tr_state is set to off and
 * svd->amp remains as NULL.
 */
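/*
 * In short: callers enter with tr_state == SEGVN_TR_INIT; on success this
 * routine leaves tr_state == SEGVN_TR_ON with svd->amp pointing at the
 * per-lgroup replication amp, and on any failure it leaves tr_state ==
 * SEGVN_TR_OFF with svd->amp still NULL.
 */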
static void
segvn_textrepl(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	vnode_t *vp = svd->vp;
	u_offset_t off = svd->offset;
	size_t size = seg->s_size;
	u_offset_t eoff = off + size;
	uint_t szc = seg->s_szc;
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	proc_t *p = seg->s_as->a_proc;
	struct anon_map *amp;

	ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
	ASSERT(svd->tr_state == SEGVN_TR_INIT);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->flags & MAP_TEXT);
	ASSERT(svd->type == MAP_PRIVATE);
	ASSERT(vp != NULL && svd->amp == NULL);
	ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE));
	ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0);
	ASSERT(seg->s_as != &kas);
	ASSERT(svntr_hashtab != NULL);
	/*
	 * If NUMA optimizations are no longer desired bail out.
	 */
	if (!lgrp_optimizations()) {
		svd->tr_state = SEGVN_TR_OFF;
		return;
	}

	/*
	 * Avoid creating anon maps with size bigger than the file size.
	 * If VOP_GETATTR() call fails bail out.
	 */
	va.va_mask = AT_SIZE | AT_MTIME | AT_CTIME;
	if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL) != 0) {
		svd->tr_state = SEGVN_TR_OFF;
		SEGVN_TR_ADDSTAT(gaerr);
		return;
	}
	if (btopr(va.va_size) < btopr(eoff)) {
		svd->tr_state = SEGVN_TR_OFF;
		SEGVN_TR_ADDSTAT(overmap);
		return;
	}
	/*
	 * VVMEXEC may not be set yet if exec() prefaults text segment. Set
	 * this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED
	 * mapping that checks if trcache for this vnode needs to be
	 * invalidated can't miss us.
	 */
	if (!(vp->v_flag & VVMEXEC)) {
		mutex_enter(&vp->v_lock);
		vp->v_flag |= VVMEXEC;
		mutex_exit(&vp->v_lock);
	}
	mutex_enter(&svntr_hashtab[hash].tr_lock);
	/*
	 * Bail out if potentially MAP_SHARED writable mappings exist to this
	 * vnode. We don't want to use old file contents from existing
	 * replicas if this mapping was established after the original file
	 * was modified.
	 */
	if (vn_is_mapped(vp, V_WRITE)) {
		mutex_exit(&svntr_hashtab[hash].tr_lock);
		svd->tr_state = SEGVN_TR_OFF;
		SEGVN_TR_ADDSTAT(wrcnt);
		return;
	}
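	/*
	 * Walk this hash bucket looking for an existing svntr entry for the
	 * same vnode that covers the same file range at the same page size.
	 */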
	svntrp = svntr_hashtab[hash].tr_head;
	for (; svntrp != NULL; svntrp = svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp != vp) {
			continue;
		}

		/*
		 * Bail out if the file or its attributes were changed after
		 * this replication entry was created since we need to use the
		 * latest file contents. Note that mtime test alone is not
		 * sufficient because a user can explicitly change mtime via
		 * utimes(2) interfaces back to the old value after modifying
		 * the file contents. To detect this case we also have to test
		 * ctime which among other things records the time of the last
		 * mtime change by utimes(2). ctime is not changed when the
		 * file is only read or executed so we expect that typically
		 * existing replication amps can be used most of the time.
		 */
		if (!svntrp->tr_valid ||
		    svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec ||
		    svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec ||
		    svntrp->tr_ctime.tv_sec != va.va_ctime.tv_sec ||
		    svntrp->tr_ctime.tv_nsec != va.va_ctime.tv_nsec) {
			mutex_exit(&svntr_hashtab[hash].tr_lock);
			svd->tr_state = SEGVN_TR_OFF;
			SEGVN_TR_ADDSTAT(stale);
			return;
		}
		/*
		 * If off, eoff and szc match the current segment we found an
		 * existing entry we can use.
		 */
		if (svntrp->tr_off == off && svntrp->tr_eoff == eoff &&
		    svntrp->tr_szc == szc) {
			break;
		}
		/*
		 * Don't create a different entry whose file offset range
		 * overlaps an existing one, to avoid replicating the same
		 * file pages more than once per lgroup.
		 */
		if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) ||
		    (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) {
			mutex_exit(&svntr_hashtab[hash].tr_lock);
			svd->tr_state = SEGVN_TR_OFF;
			SEGVN_TR_ADDSTAT(overlap);
			return;
		}
	}
	/*
	 * If we didn't find an existing entry, create a new one.
	 */
	if (svntrp == NULL) {
		svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP);
		if (svntrp == NULL) {
			mutex_exit(&svntr_hashtab[hash].tr_lock);
			svd->tr_state = SEGVN_TR_OFF;
			SEGVN_TR_ADDSTAT(nokmem);
			return;
		}
		for (i = 0; i < NLGRPS_MAX; i++) {
			ASSERT(svntrp->tr_amp[i] == NULL);
		}
		svntrp->tr_off = off;
		svntrp->tr_eoff = eoff;
		svntrp->tr_szc = szc;
		svntrp->tr_valid = 1;
		svntrp->tr_mtime = va.va_mtime;
		svntrp->tr_ctime = va.va_ctime;
		svntrp->tr_refcnt = 0;
		svntrp->tr_next = svntr_hashtab[hash].tr_head;
		svntr_hashtab[hash].tr_head = svntrp;
	}
	/*
	 * We want to pick a replica with pages on main thread's (t_tid = 1,
	 * aka T1) lgrp. Currently text replication is only optimized for
	 * workloads that either have all threads of a process on the same
	 * lgrp or execute their large text primarily on main thread.
	 */
	lgrp_id = p->p_t1_lgrpid;
	if (lgrp_id == LGRP_NONE) {
		/*
		 * In case exec() prefaults text on non main thread use
		 * current thread lgrpid. It will become main thread anyway
		 * soon.
		 */
		lgrp_id = lgrp_home_id(curthread);
	}

	/*
	 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet. Otherwise
	 * just set it to NLGRPS_MAX if it's different from current process T1
	 * home lgrp. p_tr_lgrpid is used to detect if process uses text
	 * replication and T1 new home is different from lgrp used for text
	 * replication. When this happens asynchronous segvn thread rechecks if
	 * segments should change lgrps used for text replication. If we fail
	 * to set p_tr_lgrpid with atomic_cas_32 then set it to NLGRPS_MAX
	 * without cas if it's not already NLGRPS_MAX and not equal lgrp_id
	 * we want to use. We don't need to use cas in this case because
	 * another thread that races in between our non atomic check and set
	 * may only change p_tr_lgrpid to NLGRPS_MAX at this point.
	 */
	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
	olid = p->p_tr_lgrpid;
	if (lgrp_id != olid && olid != NLGRPS_MAX) {
		lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX;
		if (atomic_cas_32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) !=
		    olid) {
			olid = p->p_tr_lgrpid;
			ASSERT(olid != LGRP_NONE);
			if (olid != lgrp_id && olid != NLGRPS_MAX) {
				p->p_tr_lgrpid = NLGRPS_MAX;
			}
		}
	}
	ASSERT(p->p_tr_lgrpid != LGRP_NONE);

	/*
	 * lgrp_move_thread() won't schedule async recheck after
	 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not
	 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid
	 * is not LGRP_NONE.
	 */
	if (first && p->p_t1_lgrpid != LGRP_NONE &&
	    p->p_t1_lgrpid != lgrp_id) {
		first = 0;
		goto again;
	}
	/*
	 * If no amp was created yet for lgrp_id create a new one as long as
	 * we have enough memory to afford it.
	 */
	if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) {
		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
		if (trmem > segvn_textrepl_max_bytes) {
			SEGVN_TR_ADDSTAT(normem);
			goto fail;
		}
		if (anon_try_resv_zone(size, NULL) == 0) {
			SEGVN_TR_ADDSTAT(noanon);
			goto fail;
		}
		amp = anonmap_alloc(size, size, ANON_NOSLEEP);
		if (amp == NULL) {
			anon_unresv_zone(size, NULL);
			SEGVN_TR_ADDSTAT(nokmem);
			goto fail;
		}
		ASSERT(amp->refcnt == 1);
		amp->a_szc = szc;
		svntrp->tr_amp[lgrp_id] = amp;
		SEGVN_TR_ADDSTAT(newamp);
	}
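	/*
	 * Bind the segment to the chosen per-lgroup amp and link it onto the
	 * svntr entry's list of replicating segments.
	 */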
	svntrp->tr_refcnt++;
	ASSERT(svd->svn_trnext == NULL);
	ASSERT(svd->svn_trprev == NULL);
	svd->svn_trnext = svntrp->tr_svnhead;
	svd->svn_trprev = NULL;
	if (svntrp->tr_svnhead != NULL) {
		svntrp->tr_svnhead->svn_trprev = svd;
	}
	svntrp->tr_svnhead = svd;
	ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size);
	ASSERT(amp->refcnt >= 1);
	svd->amp = amp;
	svd->anon_index = 0;
	svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG;
	svd->tr_policy_info.mem_lgrpid = lgrp_id;
	svd->tr_state = SEGVN_TR_ON;
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	SEGVN_TR_ADDSTAT(repl);
	return;
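	/*
	 * Failure path: undo the text replication byte accounting and, if no
	 * other segment references this svntr entry, unlink it from the hash
	 * chain and free it.
	 */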
fail:
	ASSERT(segvn_textrepl_bytes >= size);
	atomic_add_long(&segvn_textrepl_bytes, -size);
	ASSERT(svntrp != NULL);
	ASSERT(svntrp->tr_amp[lgrp_id] == NULL);
	if (svntrp->tr_refcnt == 0) {
		ASSERT(svntrp == svntr_hashtab[hash].tr_head);
		svntr_hashtab[hash].tr_head = svntrp->tr_next;
		mutex_exit(&svntr_hashtab[hash].tr_lock);
		kmem_cache_free(svntr_cache, svntrp);
	} else {
		mutex_exit(&svntr_hashtab[hash].tr_lock);
	}
	svd->tr_state = SEGVN_TR_OFF;
}
/*
 * Convert seg back to regular vnode mapping seg by unbinding it from its text
 * replication amp. This routine is most typically called when segment is
 * unmapped but can also be called when segment no longer qualifies for text
 * replication (e.g. due to protection changes). If unload_unmap is set use
 * HAT_UNLOAD_UNMAP flag in hat_unload_callback(). If we are the last user of
 * svntr free all its anon maps and remove it from the hash table.
 */
static void
segvn_textunrepl(struct seg *seg, int unload_unmap)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	vnode_t *vp = svd->vp;
	u_offset_t off = svd->offset;
	size_t size = seg->s_size;
	u_offset_t eoff = off + size;
	uint_t szc = seg->s_szc;
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	svntr_t **prv_svntrp;
	lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid;

	ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->amp != NULL);
	ASSERT(svd->amp->refcnt >= 1);
	ASSERT(svd->anon_index == 0);
	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
	ASSERT(svntr_hashtab != NULL);
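	/*
	 * Find the svntr entry this segment is bound to; it must exist and
	 * its amp for our lgroup must be the amp we are currently using.
	 */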
	mutex_enter(&svntr_hashtab[hash].tr_lock);
	prv_svntrp = &svntr_hashtab[hash].tr_head;
	for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
		    svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) {
			break;
		}
	}
	if (svntrp == NULL) {
		panic("segvn_textunrepl: svntr record not found");
	}
	if (svntrp->tr_amp[lgrp_id] != svd->amp) {
		panic("segvn_textunrepl: amp mismatch");
	}

	svd->tr_state = SEGVN_TR_OFF;
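	/*
	 * Unlink this segment from the svntr entry's doubly linked list of
	 * replicating segments.
	 */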
	if (svd->svn_trprev == NULL) {
		ASSERT(svntrp->tr_svnhead == svd);
		svntrp->tr_svnhead = svd->svn_trnext;
		if (svntrp->tr_svnhead != NULL) {
			svntrp->tr_svnhead->svn_trprev = NULL;
		}
		svd->svn_trnext = NULL;
	} else {
		svd->svn_trprev->svn_trnext = svd->svn_trnext;
		if (svd->svn_trnext != NULL) {
			svd->svn_trnext->svn_trprev = svd->svn_trprev;
			svd->svn_trnext = NULL;
		}
		svd->svn_trprev = NULL;
	}
	if (--svntrp->tr_refcnt) {
		mutex_exit(&svntr_hashtab[hash].tr_lock);
		goto done;
	}

	*prv_svntrp = svntrp->tr_next;
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	for (i = 0; i < NLGRPS_MAX; i++) {
		struct anon_map *amp = svntrp->tr_amp[i];
		if (amp == NULL) {
			continue;
		}
		ASSERT(amp->refcnt == 1);
		ASSERT(amp->swresv == size);
		ASSERT(amp->size == size);
		ASSERT(amp->a_szc == szc);
		if (amp->a_szc != 0) {
			anon_free_pages(amp->ahp, 0, size, szc);
		} else {
			anon_free(amp->ahp, 0, size);
		}
		svntrp->tr_amp[i] = NULL;
		ASSERT(segvn_textrepl_bytes >= size);
		atomic_add_long(&segvn_textrepl_bytes, -size);
		anon_unresv_zone(amp->swresv, NULL);
	}
	kmem_cache_free(svntr_cache, svntrp);
done:
	hat_unload_callback(seg->s_as->a_hat, seg->s_base, size,
	    unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL);
}
/*
 * This is called when a MAP_SHARED writable mapping is created to a vnode
 * that is currently used for execution (VVMEXEC flag is set). In this case we
 * need to prevent further use of existing replicas.
 */
static void
segvn_inval_trcache(vnode_t *vp)
{
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	svntr_t *svntrp;

	ASSERT(vp->v_flag & VVMEXEC);

	if (svntr_hashtab == NULL) {
		return;
	}

	mutex_enter(&svntr_hashtab[hash].tr_lock);
	svntrp = svntr_hashtab[hash].tr_head;
	for (; svntrp != NULL; svntrp = svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp == vp && svntrp->tr_valid) {
			svntrp->tr_valid = 0;
		}
	}
	mutex_exit(&svntr_hashtab[hash].tr_lock);
}
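/*
 * Per-system worker for asynchronous text replication updates: it sleeps
 * (CPR-safe) on segvn_trasync_sem and is kicked by segvn_trupdate_wakeup()
 * when threads have migrated between lgroups.
 */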
static void
segvn_trasync_thread(void)
{
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;	/* just for CPR stuff */

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cpr_info, &cpr_lock,
	    callb_generic_cpr, "segvn_async");

	if (segvn_update_textrepl_interval == 0) {
		segvn_update_textrepl_interval = segvn_update_tr_time * hz;
	} else {
		segvn_update_textrepl_interval *= hz;
	}
	(void) timeout(segvn_trupdate_wakeup, NULL,
	    segvn_update_textrepl_interval);

	for (;;) {
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		sema_p(&segvn_trasync_sem);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);
		segvn_trupdate();
	}
}
static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0;

static void
segvn_trupdate_wakeup(void *dummy)
{
	uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations();

	if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) {
		segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs;
		sema_v(&segvn_trasync_sem);
	}

	if (!segvn_disable_textrepl_update &&
	    segvn_update_textrepl_interval != 0) {
		(void) timeout(segvn_trupdate_wakeup, dummy,
		    segvn_update_textrepl_interval);
	}
}
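/*
 * Walk every svntr hash bucket and recheck the replica placement of each
 * segment currently bound to a replication entry.
 */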
static void
segvn_trupdate(void)
{
	ASSERT(svntr_hashtab != NULL);

	for (hash = 0; hash < svntr_hashtab_sz; hash++) {
		mutex_enter(&svntr_hashtab[hash].tr_lock);
		svntrp = svntr_hashtab[hash].tr_head;
		for (; svntrp != NULL; svntrp = svntrp->tr_next) {
			ASSERT(svntrp->tr_refcnt != 0);
			svd = svntrp->tr_svnhead;
			for (; svd != NULL; svd = svd->svn_trnext) {
				segvn_trupdate_seg(svd->seg, svd, svntrp,
				    hash);
			}
		}
		mutex_exit(&svntr_hashtab[hash].tr_lock);
	}
}
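/*
 * Re-home one segment's text replica: if the process's main (T1) thread now
 * lives on a different lgroup, switch the segment to that lgroup's amp,
 * creating it first if needed.
 */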
static void
segvn_trupdate_seg(struct seg *seg, struct segvn_data *svd, svntr_t *svntrp,
    ulong_t hash)
{
	struct anon_map *amp;

	ASSERT(svd->vp != NULL);
	ASSERT(svd->vp == svntrp->tr_vp);
	ASSERT(svd->offset == svntrp->tr_off);
	ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff);
	ASSERT(seg != NULL);
	ASSERT(svd->seg == seg);
	ASSERT(seg->s_data == (void *)svd);
	ASSERT(seg->s_szc == svntrp->tr_szc);
	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->amp != NULL);
	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
	ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE);
	ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX);
	ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp);
	ASSERT(svntrp->tr_refcnt != 0);
	ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock));
	as = seg->s_as;
	ASSERT(as != NULL && as != &kas);
	p = as->a_proc;
	ASSERT(p->p_tr_lgrpid != LGRP_NONE);
	lgrp_id = p->p_t1_lgrpid;
	if (lgrp_id == LGRP_NONE) {
		return;
	}
	ASSERT(lgrp_id < NLGRPS_MAX);
	if (svd->tr_policy_info.mem_lgrpid == lgrp_id) {
		return;
	}

	/*
	 * Use tryenter locking since we are locking as/seg and svntr hash
	 * lock in reverse from synchronous thread order.
	 */
	if (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_READER)) {
		SEGVN_TR_ADDSTAT(nolock);
		if (segvn_lgrp_trthr_migrs_snpsht) {
			segvn_lgrp_trthr_migrs_snpsht = 0;
		}
		return;
	}
	if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) {
		AS_LOCK_EXIT(as, &as->a_lock);
		SEGVN_TR_ADDSTAT(nolock);
		if (segvn_lgrp_trthr_migrs_snpsht) {
			segvn_lgrp_trthr_migrs_snpsht = 0;
		}
		return;
	}
	size = seg->s_size;
	if (svntrp->tr_amp[lgrp_id] == NULL) {
		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
		if (trmem > segvn_textrepl_max_bytes) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as, &as->a_lock);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			SEGVN_TR_ADDSTAT(normem);
			return;
		}
		if (anon_try_resv_zone(size, NULL) == 0) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as, &as->a_lock);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			SEGVN_TR_ADDSTAT(noanon);
			return;
		}
		amp = anonmap_alloc(size, size, KM_NOSLEEP);
		if (amp == NULL) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as, &as->a_lock);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			anon_unresv_zone(size, NULL);
			SEGVN_TR_ADDSTAT(nokmem);
			return;
		}
		ASSERT(amp->refcnt == 1);
		amp->a_szc = seg->s_szc;
		svntrp->tr_amp[lgrp_id] = amp;
	}
	/*
	 * We don't need to drop the bucket lock but here we give other
	 * threads a chance. svntr and svd can't be unlinked as long as
	 * segment lock is held as a writer and AS held as well. After we
	 * retake bucket lock we'll continue from where we left off. We'll
	 * be able to reach the end of either list since new entries are
	 * always added to the beginning of the lists.
	 */
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL);
	mutex_enter(&svntr_hashtab[hash].tr_lock);

	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(svd->amp != NULL);
	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
	ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id);
	ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]);

	svd->tr_policy_info.mem_lgrpid = lgrp_id;
	svd->amp = svntrp->tr_amp[lgrp_id];
	p->p_tr_lgrpid = NLGRPS_MAX;
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	AS_LOCK_EXIT(as, &as->a_lock);

	ASSERT(svntrp->tr_refcnt != 0);
	ASSERT(svd->vp == svntrp->tr_vp);
	ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id);
	ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]);
	ASSERT(svd->seg == seg);
	ASSERT(svd->tr_state == SEGVN_TR_ON);

	SEGVN_TR_ADDSTAT(asyncrepl);
}
);