/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * VM - shared or copy-on-write from a vnode/anonymous memory.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/zone.h>
#include <sys/shm_impl.h>
/*
 * segvn_fault needs a temporary page list array.  To avoid calling kmem all
 * the time, it creates a small (FAULT_TMP_PAGES_NUM entry) array and uses
 * it if it can. In the rare case when this page list is not large enough,
 * it goes and gets a large enough array from kmem.
 */
#define	FAULT_TMP_PAGES_NUM	0x8
#define	FAULT_TMP_PAGES_SZ	ptob(FAULT_TMP_PAGES_NUM)
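
/*
 * Illustrative sketch of the pattern described above (variable names here
 * are hypothetical, not lifted from segvn_fault itself):
 *
 *	page_t *plist[FAULT_TMP_PAGES_NUM];
 *	page_t **pl = plist;
 *
 *	if (npages > FAULT_TMP_PAGES_NUM)
 *		pl = kmem_alloc(npages * sizeof (page_t *), KM_SLEEP);
 *	...
 *	if (pl != plist)
 *		kmem_free(pl, npages * sizeof (page_t *));
 */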
/*
 * Private seg op routines.
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		    caddr_t addr, size_t len, enum fault_type type,
		    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		    uint_t *protv);
static uoff_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
static int	segvn_inherit(struct seg *, caddr_t, size_t, uint_t);
const struct seg_ops segvn_ops = {
	.dup		= segvn_dup,
	.unmap		= segvn_unmap,
	.free		= segvn_free,
	.fault		= segvn_fault,
	.faulta		= segvn_faulta,
	.setprot	= segvn_setprot,
	.checkprot	= segvn_checkprot,
	.kluster	= segvn_kluster,
	.sync		= segvn_sync,
	.incore		= segvn_incore,
	.lockop		= segvn_lockop,
	.getprot	= segvn_getprot,
	.getoffset	= segvn_getoffset,
	.gettype	= segvn_gettype,
	.getvp		= segvn_getvp,
	.advise		= segvn_advise,
	.dump		= segvn_dump,
	.pagelock	= segvn_pagelock,
	.setpagesize	= segvn_setpagesize,
	.getmemid	= segvn_getmemid,
	.getpolicy	= segvn_getpolicy,
	.inherit	= segvn_inherit,
};
/*
 * Common zfod structures, provided as a shorthand for others to use.
 */
static segvn_crargs_t zfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
	PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
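
/*
 * Usage sketch (caller context assumed, not part of this file): these argsp
 * pointers are handed to as_map() to create zero-fill-on-demand mappings,
 * e.g.
 *
 *	error = as_map(as, addr, len, segvn_create, zfod_argsp);
 */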
#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

size_t	segvn_pglock_comb_thrshld = (1UL << 16);	/* 64K */
size_t	segvn_pglock_comb_balign = (1UL << 16);		/* 64K */
uint_t	segvn_pglock_comb_bshift;
size_t	segvn_pglock_comb_palign;
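
/*
 * Worked example (assuming the default 64K balign and 4K pages):
 * segvn_init() below derives segvn_pglock_comb_bshift =
 * highbit(0x10000) - 1 = 16 and segvn_pglock_comb_palign =
 * btop(0x10000) = 16 pages.
 */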
static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, uoff_t,
    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    uoff_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int);
static void	segvn_vpage(struct seg *);
static size_t	segvn_count_swap_by_vpages(struct seg *);

static void	segvn_purge(struct seg *seg);
static int	segvn_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);
static int	shamp_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);

static int	sameprot(struct seg *, caddr_t, size_t);

static int	segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int	segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int	segvn_claim_pages(struct seg *, struct vpage *, uoff_t,
    ulong_t, uint_t);

static void	segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
    size_t, void *, uoff_t);

static struct kmem_cache *segvn_cache;
static struct kmem_cache **segvn_szc_cache;
#ifdef VM_STATS
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t	fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t	pagelock[2];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */
#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	\
	if ((len) != 0) {						\
		lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);	\
		ASSERT(lpgaddr >= (seg)->s_base);			\
		lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +	\
		    (len)), pgsz);					\
		ASSERT(lpgeaddr > lpgaddr);				\
		ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);	\
	} else {							\
		lpgeaddr = lpgaddr = (addr);				\
	}								\
}
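
/*
 * Worked example (hypothetical values): with pgsz = 4M, addr = 0x40123000
 * and len = 0x2000, CALC_LPG_REGION yields lpgaddr = 0x40000000 and
 * lpgeaddr = 0x40400000 -- the smallest pgsz-aligned region covering
 * [addr, addr + len).
 */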
/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *svd = buf;

	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
	svd->svn_trnext = svd->svn_trprev = NULL;
	return (0);
}
/*ARGSUSED*/
static void
segvn_cache_destructor(void *buf, void *cdrarg)
{
	struct segvn_data *svd = buf;

	rw_destroy(&svd->lock);
	mutex_destroy(&svd->segfree_syncmtx);
}
/*ARGSUSED*/
static int
svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	bzero(buf, sizeof (svntr_t));
	return (0);
}
/*
 * Patching this variable to non-zero allows the system to run with
 * stacks marked as "not executable".  It's a bit of a kludge, but is
 * provided as a tweakable for platforms that export those ABIs
 * (e.g. sparc V8) that have executable stacks enabled by default.
 * There are also some restrictions for platforms that don't actually
 * implement 'noexec' protections.
 *
 * Once enabled, the system is (therefore) unable to provide a fully
 * ABI-compliant execution environment, though practically speaking,
 * most everything works.  The exceptions are generally some interpreters
 * and debuggers that create executable code on the stack and jump
 * into it (without explicitly mprotecting the address range to include
 * PROT_EXEC).
 *
 * One important class of applications that are disabled are those
 * that have been transformed into malicious agents using one of the
 * numerous "buffer overflow" attacks.  See 4007890.
 */
int noexec_user_stack = 0;
int noexec_user_stack_log = 1;
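
/*
 * Example: on Solaris/illumos these patchable variables are typically set
 * from /etc/system, e.g.
 *
 *	set noexec_user_stack = 1
 *	set noexec_user_stack_log = 1
 */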
int	segvn_lpg_disable = 0;
uint_t	segvn_maxpgszc = 0;

ulong_t	segvn_vmpss_clrszc_cnt;
ulong_t	segvn_vmpss_clrszc_err;
ulong_t	segvn_fltvnpages_clrszc_cnt;
ulong_t	segvn_fltvnpages_clrszc_err;
ulong_t	segvn_setpgsz_align_err;
ulong_t	segvn_setpgsz_anon_align_err;
ulong_t	segvn_setpgsz_getattr_err;
ulong_t	segvn_setpgsz_eof_err;
ulong_t	segvn_faultvnmpss_align_err1;
ulong_t	segvn_faultvnmpss_align_err2;
ulong_t	segvn_faultvnmpss_align_err3;
ulong_t	segvn_faultvnmpss_align_err4;
ulong_t	segvn_faultvnmpss_align_err5;
ulong_t	segvn_vmpss_pageio_deadlk_err;

int	segvn_use_regions = 1;
/*
 * Segvn supports text replication optimization for NUMA platforms. Text
 * replica's are represented by anon maps (amp). There's one amp per text file
 * region per lgroup. A process chooses the amp for each of its text mappings
 * based on the lgroup assignment of its main thread (t_tid = 1). All
 * processes that want a replica on a particular lgroup for the same text file
 * mapping share the same amp. amp's are looked up in svntr_hashtab hash table
 * with vp,off,size,szc used as a key. Text replication segments are read only
 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by
 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode
 * pages. Replication amp is assigned to a segment when it gets its first
 * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread
 * rechecks periodically if the process still maps an amp local to the main
 * thread. If not async thread forces process to remap to an amp in the new
 * home lgroup of the main thread. Current text replication implementation
 * only provides the benefit to workloads that do most of their work in the
 * main thread of a process or all the threads of a process run in the same
 * lgroup. To extend text replication benefit to different types of
 * multithreaded workloads further work would be needed in the hat layer to
 * allow the same virtual address in the same hat to simultaneously map
 * different physical addresses (i.e. page table replication would be needed
 * for x86).
 *
 * amp pages are used instead of vnode pages as long as segment has a very
 * simple life cycle.  It's created via segvn_create(), handles S_EXEC
 * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
 * happens such as protection is changed, real COW fault happens, pagesize is
 * changed, MC_LOCK is requested or segment is partially unmapped we turn off
 * text replication by converting the segment back to vnode only segment
 * (unmap segment's address range and set svd->amp to NULL).
 *
 * The original file can be changed after amp is inserted into
 * svntr_hashtab. Processes that are launched after the file is already
 * changed can't use the replica's created prior to the file change. To
 * implement this functionality hash entries are timestamped. Replica's can
 * only be used if current file modification time is the same as the timestamp
 * saved when hash entry was created. However just timestamps alone are not
 * sufficient to detect file modification via mmap(MAP_SHARED) mappings. We
 * deal with file changes via MAP_SHARED mappings differently. When writable
 * MAP_SHARED mappings are created to vnodes marked as executable we mark all
 * existing replica's for this vnode as not usable for future text
 * mappings. And we don't create new replica's for files that currently have
 * potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is
 * true).
 */
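
/*
 * Conceptual sketch of the lookup described above (identifiers here are
 * assumptions for illustration, not lifted from this file): replicas are
 * found by hashing the vnode and walking the bucket for a matching
 * <vp, off, size, szc> tuple, roughly
 *
 *	svntr_bucket_t *b = &svntr_hashtab[SVNTR_HASH_FUNC(vp)];
 *	for (svntrp = b->tr_head; svntrp != NULL; svntrp = svntrp->tr_next)
 *		if (svntrp->tr_vp == vp && svntrp->tr_off == off && ...)
 *			break;
 */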
#define	SEGVN_TEXTREPL_MAXBYTES_FACTOR	(20)
size_t	segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
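
/*
 * Worked example (hypothetical machine): with 8GB of physical memory and
 * the default factor of 20, segvn_init() computes
 * segvn_textrepl_max_bytes = ptob(physmem) / 20, i.e. roughly 410MB of
 * memory allowed for text replicas.
 */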
static ulong_t			svntr_hashtab_sz = 512;
static svntr_bucket_t		*svntr_hashtab = NULL;
static struct kmem_cache	*svntr_cache;
static svntr_stats_t		*segvn_textrepl_stats;
static ksema_t			segvn_trasync_sem;

int	segvn_disable_textrepl = 1;
size_t	textrepl_size_thresh = (size_t)-1;
size_t	segvn_textrepl_bytes = 0;
size_t	segvn_textrepl_max_bytes = 0;
clock_t	segvn_update_textrepl_interval = 0;
int	segvn_update_tr_time = 10;
int	segvn_disable_textrepl_update = 0;
static void segvn_textrepl(struct seg *);
static void segvn_textunrepl(struct seg *, int);
static void segvn_inval_trcache(vnode_t *);
static void segvn_trasync_thread(void);
static void segvn_trupdate_wakeup(void *);
static void segvn_trupdate(void);
static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
    ulong_t);
/*
 * Initialize segvn data structures
 */
void
segvn_init(void)
{
	uint_t maxszc;
	uint_t szc;
	size_t pgsz;

	segvn_cache = kmem_cache_create("segvn_cache",
	    sizeof (struct segvn_data), 0,
	    segvn_cache_constructor, segvn_cache_destructor, NULL,
	    NULL, NULL, 0);

	if (segvn_lpg_disable == 0) {
		szc = maxszc = page_num_pagesizes() - 1;
		if (maxszc == 0) {
			segvn_lpg_disable = 1;
		}
		if (page_get_pagesize(0) != PAGESIZE) {
			panic("segvn_init: bad szc 0");
			/*NOTREACHED*/
		}
		while (szc != 0) {
			pgsz = page_get_pagesize(szc);
			if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
				panic("segvn_init: bad szc %d", szc);
				/*NOTREACHED*/
			}
			szc--;
		}
		if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
			segvn_maxpgszc = maxszc;
	}

	if (segvn_maxpgszc) {
		segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
		    (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
		    KM_SLEEP);
	}

	for (szc = 1; szc <= segvn_maxpgszc; szc++) {
		char	str[32];

		(void) sprintf(str, "segvn_szc_cache%d", szc);
		segvn_szc_cache[szc] = kmem_cache_create(str,
		    page_get_pagecnt(szc) * sizeof (page_t *), 0,
		    NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
	}

	if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
		segvn_use_regions = 0;

	/*
	 * For now shared regions and text replication segvn support
	 * are mutually exclusive. This is acceptable because
	 * currently significant benefit from text replication was
	 * only observed on AMD64 NUMA platforms (due to relatively
	 * small L2$ size) and currently we don't support shared
	 * regions on x86.
	 */
	if (segvn_use_regions && !segvn_disable_textrepl) {
		segvn_disable_textrepl = 1;
	}

	if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
	    !segvn_disable_textrepl) {
		ulong_t i;
		size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);

		svntr_cache = kmem_cache_create("svntr_cache",
		    sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
		    NULL, NULL, NULL, 0);
		svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
		for (i = 0; i < svntr_hashtab_sz; i++) {
			mutex_init(&svntr_hashtab[i].tr_lock, NULL,
			    MUTEX_DEFAULT, NULL);
		}
		segvn_textrepl_max_bytes = ptob(physmem) /
		    segvn_textrepl_max_bytes_factor;
		segvn_textrepl_stats = kmem_zalloc(NCPU *
		    sizeof (svntr_stats_t), KM_SLEEP);
		sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
		(void) thread_create(NULL, 0, segvn_trasync_thread,
		    NULL, 0, &p0, TS_RUN, minclsyspri);
	}

	if (!ISP2(segvn_pglock_comb_balign) ||
	    segvn_pglock_comb_balign < PAGESIZE) {
		segvn_pglock_comb_balign = 1UL << 16; /* 64K */
	}
	segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1;
	segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign);
}
#define	SEGVN_PAGEIO	((void *)0x1)
#define	SEGVN_NOPAGEIO	((void *)0x2)
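
/*
 * Usage sketch (the consumer shown is an assumption based on surrounding
 * code, not lifted verbatim from this file): once segvn_setvnode_mpss()
 * classifies a vnode, fault paths can test the cached value instead of
 * re-probing the filesystem, e.g.
 *
 *	if (vp->v_mpssdata == SEGVN_NOPAGEIO)
 *		... fall back to PAGESIZE-based I/O ...
 */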
static void
segvn_setvnode_mpss(vnode_t *vp)
{
	int err;

	ASSERT(vp->v_mpssdata == NULL ||
	    vp->v_mpssdata == SEGVN_PAGEIO ||
	    vp->v_mpssdata == SEGVN_NOPAGEIO);

	if (vp->v_mpssdata == NULL) {
		if (vn_vmpss_usepageio(vp)) {
			err = fop_pageio(vp, NULL,
			    0, 0, 0, CRED(), NULL);
		} else {
			err = ENOSYS;
		}
		/*
		 * set v_mpssdata just once per vnode life
		 * so that it never changes.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_mpssdata == NULL) {
			if (err == EINVAL) {
				vp->v_mpssdata = SEGVN_PAGEIO;
			} else {
				vp->v_mpssdata = SEGVN_NOPAGEIO;
			}
		}
		mutex_exit(&vp->v_lock);
	}
}
int
segvn_create(struct seg **segpp, void *argsp)
{
	struct seg *seg = *segpp;
	extern lgrp_mem_policy_t lgrp_mem_default_policy;
	struct segvn_crargs *a = (struct segvn_crargs *)argsp;
	struct segvn_data *svd;
	size_t swresv = 0;
	struct cred *cred;
	struct anon_map *amp;
	int error = 0;
	size_t pgsz;
	lgrp_mem_policy_t mpolicy = lgrp_mem_default_policy;
	int use_rgn = 0;
	int trok = 0;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));

	if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
		panic("segvn_create type");
		/*NOTREACHED*/
	}

	/*
	 * Check arguments.  If a shared anon structure is given then
	 * it is illegal to also specify a vp.
	 */
	if (a->amp != NULL && a->vp != NULL) {
		panic("segvn_create anon_map");
		/*NOTREACHED*/
	}

	if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
	    a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
	    segvn_use_regions) {
		use_rgn = 1;
	}

	/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
	if (a->type == MAP_SHARED)
		a->flags &= ~MAP_NORESERVE;

	if (a->szc != 0) {
		if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
		    (a->amp != NULL && a->type == MAP_PRIVATE) ||
		    (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
			a->szc = 0;
		} else {
			if (a->szc > segvn_maxpgszc)
				a->szc = segvn_maxpgszc;
			pgsz = page_get_pagesize(a->szc);
			if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
			    !IS_P2ALIGNED(seg->s_size, pgsz)) {
				a->szc = 0;
			} else if (a->vp != NULL) {
				if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
					/*
					 * paranoid check.
					 * hat_page_demote() is not supported
					 * on swapfs pages.
					 */
					a->szc = 0;
				} else if (map_addr_vacalign_check(seg->s_base,
				    a->offset & PAGEMASK)) {
					a->szc = 0;
				}
			} else if (a->amp != NULL) {
				pgcnt_t anum = btopr(a->offset);
				pgcnt_t pgcnt = page_get_pagecnt(a->szc);
				if (!IS_P2ALIGNED(anum, pgcnt)) {
					a->szc = 0;
				}
			}
		}
	}

	/*
	 * If segment may need private pages, reserve them now.
	 */
	if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
		if (anon_resv_zone(seg->s_size,
		    seg->s_as->a_proc->p_zone) == 0)
			return (EAGAIN);
		swresv = seg->s_size;
	}

	/*
	 * Reserve any mapping structures that may be required.
	 *
	 * Don't do it for segments that may use regions. It's currently a
	 * noop in the hat implementations anyway.
	 */
	if (!use_rgn) {
		hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
	}

	if (a->cred) {
		cred = a->cred;
		crhold(cred);
	} else {
		crhold(cred = CRED());
	}

	/* Inform the vnode of the new mapping */
	if (a->vp != NULL) {
		error = fop_addmap(a->vp, a->offset & PAGEMASK,
		    seg->s_as, seg->s_base, seg->s_size, a->prot,
		    a->maxprot, a->type, cred, NULL);
		if (error) {
			if (swresv != 0) {
				anon_unresv_zone(swresv,
				    seg->s_as->a_proc->p_zone);
			}
			crfree(cred);
			if (!use_rgn) {
				hat_unload(seg->s_as->a_hat, seg->s_base,
				    seg->s_size, HAT_UNLOAD_UNMAP);
			}
			return (error);
		}
		/*
		 * svntr_hashtab will be NULL if we support shared regions.
		 */
		trok = ((a->flags & MAP_TEXT) &&
		    (seg->s_size > textrepl_size_thresh ||
		    (a->flags & _MAP_TEXTREPL)) &&
		    lgrp_optimizations() && svntr_hashtab != NULL &&
		    a->type == MAP_PRIVATE && swresv == 0 &&
		    !(a->flags & MAP_NORESERVE) &&
		    seg->s_as != &kas && a->vp->v_type == VREG);

		ASSERT(!trok || !use_rgn);
	}

	/*
	 * MAP_NORESERVE mappings don't count towards the VSZ of a process
	 * until we fault the pages in.
	 */
	if ((a->vp == NULL || a->vp->v_type != VREG) &&
	    a->flags & MAP_NORESERVE) {
		seg->s_as->a_resvsize -= seg->s_size;
	}

	/*
	 * If more than one segment in the address space, and they're adjacent
	 * virtually, try to concatenate them.  Don't concatenate if an
	 * explicit anon_map structure was supplied (e.g., SystemV shared
	 * memory) or if we'll use text replication for this segment.
	 */
	if (a->amp == NULL && !use_rgn && !trok) {
		struct seg *pseg, *nseg;
		struct segvn_data *psvd, *nsvd;
		lgrp_mem_policy_t ppolicy, npolicy;
		uint_t	lgrp_mem_policy_flags = 0;

		/*
		 * Memory policy flags (lgrp_mem_policy_flags) is valid when
		 * extending stack/heap segments.
		 */
		if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
		    !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
			lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
		} else {
			/*
			 * Get policy when not extending it from another
			 * segment
			 */
			mpolicy = lgrp_mem_policy_default(seg->s_size,
			    a->type);
		}

		/*
		 * First, try to concatenate the previous and new segments
		 */
		pseg = AS_SEGPREV(seg->s_as, seg);
		if (pseg != NULL &&
		    pseg->s_base + pseg->s_size == seg->s_base &&
		    pseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from previous segment.
			 * When extension is specified (e.g. for heap) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			psvd = (struct segvn_data *)pseg->s_data;
			ppolicy = psvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_UP) {
				if (ppolicy != lgrp_mem_default_policy) {
					mpolicy = ppolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    pseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == ppolicy &&
			    (pseg->s_size + seg->s_size <=
			    segvn_comb_thrshld || psvd->amp == NULL) &&
			    segvn_extend_prev(pseg, seg, a, swresv) == 0) {
				/*
				 * success! now try to concatenate
				 * with following seg
				 */
				crfree(cred);
				nseg = AS_SEGNEXT(pseg->s_as, pseg);
				if (nseg != NULL && nseg != pseg &&
				    nseg->s_ops == &segvn_ops &&
				    pseg->s_base + pseg->s_size ==
				    nseg->s_base)
					(void) segvn_concat(pseg, nseg, 0);
				ASSERT(pseg->s_szc == 0 ||
				    (a->szc == pseg->s_szc &&
				    IS_P2ALIGNED(pseg->s_base, pgsz) &&
				    IS_P2ALIGNED(pseg->s_size, pgsz)));
				/*
				 * Communicate out the newly concatenated
				 * segment as part of the result.
				 */
				*segpp = pseg;
				return (0);
			}
		}

		/*
		 * Failed, so try to concatenate with following seg
		 */
		nseg = AS_SEGNEXT(seg->s_as, seg);
		if (nseg != NULL &&
		    seg->s_base + seg->s_size == nseg->s_base &&
		    nseg->s_ops == &segvn_ops) {
			/*
			 * Get memory allocation policy from next segment.
			 * When extension is specified (e.g. for stack) apply
			 * this policy to the new segment regardless of the
			 * outcome of segment concatenation.  Extension occurs
			 * for non-default policy otherwise default policy is
			 * used and is based on extended segment size.
			 */
			nsvd = (struct segvn_data *)nseg->s_data;
			npolicy = nsvd->policy_info.mem_policy;
			if (lgrp_mem_policy_flags ==
			    LGRP_MP_FLAG_EXTEND_DOWN) {
				if (npolicy != lgrp_mem_default_policy) {
					mpolicy = npolicy;
				} else {
					mpolicy = lgrp_mem_policy_default(
					    nseg->s_size + seg->s_size,
					    a->type);
				}
			}

			if (mpolicy == npolicy &&
			    segvn_extend_next(seg, nseg, a, swresv) == 0) {
				crfree(cred);
				ASSERT(nseg->s_szc == 0 ||
				    (a->szc == nseg->s_szc &&
				    IS_P2ALIGNED(nseg->s_base, pgsz) &&
				    IS_P2ALIGNED(nseg->s_size, pgsz)));
				/*
				 * Communicate out the newly concatenated
				 * segment as part of the result.
				 */
				*segpp = nseg;
				return (0);
			}
		}
	}

	if (a->vp != NULL) {
		VN_HOLD(a->vp);
		if (a->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, a->vp);
	}
	svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	seg->s_ops = &segvn_ops;
	seg->s_data = (void *)svd;
	seg->s_szc = a->szc;

	svd->seg = seg;
	svd->vp = a->vp;
	/*
	 * Anonymous mappings have no backing file so the offset is
	 * meaningless.
	 */
	svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
	svd->prot = a->prot;
	svd->maxprot = a->maxprot;
	svd->pageprot = 0;
	svd->type = a->type;
	svd->vpage = NULL;
	svd->cred = cred;
	svd->advice = MADV_NORMAL;
	svd->pageadvice = 0;
	svd->flags = (ushort_t)a->flags;
	svd->softlockcnt = 0;
	svd->softlockcnt_sbase = 0;
	svd->softlockcnt_send = 0;
	svd->svn_inz = 0;
	svd->rcookie = HAT_INVALID_REGION_COOKIE;
	svd->pageswap = 0;

	if (a->szc != 0 && a->vp != NULL) {
		segvn_setvnode_mpss(a->vp);
	}
	if (svd->type == MAP_SHARED && svd->vp != NULL &&
	    (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
		segvn_inval_trcache(svd->vp);
	}

	amp = a->amp;
	if ((svd->amp = amp) == NULL) {
		svd->anon_index = 0;
		if (svd->type == MAP_SHARED) {
			svd->swresv = 0;
			/*
			 * Shared mappings to a vp need no other setup.
			 * If we have a shared mapping to an anon_map object
			 * which hasn't been allocated yet,  allocate the
			 * struct now so that it will be properly shared
			 * by remembering the swap reservation there.
			 */
			if (a->vp == NULL) {
				svd->amp = anonmap_alloc(seg->s_size, swresv,
				    ANON_SLEEP);
				svd->amp->a_szc = seg->s_szc;
			}
		} else {
			/*
			 * Private mapping (with or without a vp).
			 * Allocate anon_map when needed.
			 */
			svd->swresv = swresv;
		}
	} else {
		pgcnt_t anon_num;

		/*
		 * Mapping to an existing anon_map structure without a vp.
		 * For now we will insure that the segment size isn't larger
		 * than the size - offset gives us.  Later on we may wish to
		 * have the anon array dynamically allocated itself so that
		 * we don't always have to allocate all the anon pointer slots.
		 * This of course involves adding extra code to check that we
		 * aren't trying to use an anon pointer slot beyond the end
		 * of the currently allocated anon array.
		 */
		if ((amp->size - a->offset) < seg->s_size) {
			panic("segvn_create anon_map size");
			/*NOTREACHED*/
		}

		anon_num = btopr(a->offset);

		if (a->type == MAP_SHARED) {
			/*
			 * SHARED mapping to a given anon_map.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			if (a->szc > amp->a_szc) {
				amp->a_szc = a->szc;
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
			svd->anon_index = anon_num;
			svd->swresv = 0;
		} else {
			/*
			 * PRIVATE mapping to a given anon_map.
			 * Make sure that all the needed anon
			 * structures are created (so that we will
			 * share the underlying pages if nothing
			 * is written by this mapping) and then
			 * duplicate the anon array as is done
			 * when a privately mapped segment is dup'ed.
			 */
			struct anon *ap;
			caddr_t addr;
			caddr_t eaddr;
			ulong_t	anon_idx;
			int hat_flag = HAT_LOAD;

			if (svd->flags & MAP_TEXT) {
				hat_flag |= HAT_LOAD_TEXT;
			}

			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
			svd->amp->a_szc = seg->s_szc;
			svd->anon_index = 0;
			svd->swresv = swresv;

			/*
			 * Prevent 2 threads from allocating anon
			 * slots simultaneously.
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			eaddr = seg->s_base + seg->s_size;

			for (anon_idx = anon_num, addr = seg->s_base;
			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
				page_t *pp;

				if ((ap = anon_get_ptr(amp->ahp,
				    anon_idx)) != NULL)
					continue;

				/*
				 * Allocate the anon struct now.
				 * Might as well load up translation
				 * to the page while we're at it...
				 */
				pp = anon_zero(seg, addr, &ap, cred);
				if (ap == NULL || pp == NULL) {
					panic("segvn_create anon_zero");
					/*NOTREACHED*/
				}

				/*
				 * Re-acquire the anon_map lock and
				 * initialize the anon array entry.
				 */
				ASSERT(anon_get_ptr(amp->ahp,
				    anon_idx) == NULL);
				(void) anon_set_ptr(amp->ahp, anon_idx, ap,
				    ANON_SLEEP);

				ASSERT(seg->s_szc == 0);
				ASSERT(!IS_VMODSORT(pp->p_vnode));

				ASSERT(use_rgn == 0);
				hat_memload(seg->s_as->a_hat, addr, pp,
				    svd->prot & ~PROT_WRITE, hat_flag);

				page_unlock(pp);
			}
			ASSERT(seg->s_szc == 0);
			anon_dup(amp->ahp, anon_num, svd->amp->ahp,
			    0, seg->s_size);
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Set default memory allocation policy for segment
	 *
	 * Always set policy for private memory at least for initialization
	 * even if this is a shared memory segment
	 */
	(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);

	if (svd->type == MAP_SHARED)
		(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
		    svd->vp, svd->offset, seg->s_size);

	if (use_rgn) {
		ASSERT(!trok);
		ASSERT(svd->amp == NULL);
		svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
		    seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
		    (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
		    HAT_REGION_TEXT);
	}

	ASSERT(!trok || !(svd->prot & PROT_WRITE));
	svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;

	return (0);
}
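
/*
 * Usage note (call site sketched under assumptions, not from this file):
 * because a successful create may merge the new mapping into an adjacent
 * segment, callers must reload their pointer through *segpp afterwards:
 *
 *	struct seg *seg = seg_alloc(as, addr, len);
 *	error = segvn_create(&seg, zfod_argsp);
 *
 * where on return "seg" may name the merged neighbor, not the original.
 */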
/*
 * Concatenate two existing segments, if possible.
 * Return 0 on success, -1 if two segments are not compatible
 * or -2 on memory allocation failure.
 * If amp_cat == 1 then try and concat segments with anon maps
 */
static int
segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
{
	struct segvn_data *svd1 = seg1->s_data;
	struct segvn_data *svd2 = seg2->s_data;
	struct anon_map *amp1 = svd1->amp;
	struct anon_map *amp2 = svd2->amp;
	struct vpage *vpage1 = svd1->vpage;
	struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
	size_t size, nvpsize;
	pgcnt_t npages1, npages2;

	ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
	ASSERT(AS_WRITE_HELD(seg1->s_as));
	ASSERT(seg1->s_ops == seg2->s_ops);

	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) ||
	    HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
		return (-1);
	}

	/* both segments exist, try to merge them */
#define	incompat(x)	(svd1->x != svd2->x)
	if (incompat(vp) || incompat(maxprot) ||
	    (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
	    (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
	    incompat(type) || incompat(cred) || incompat(flags) ||
	    seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
	    (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0)
		return (-1);
#undef incompat

	/*
	 * vp == NULL implies zfod, offset doesn't matter
	 */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != svd2->offset) {
		return (-1);
	}

	/*
	 * Don't concatenate if either segment uses text replication.
	 */
	if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	/*
	 * Fail early if we're not supposed to concatenate
	 * segments with non NULL amp.
	 */
	if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
		return (-1);
	}

	if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
		if (amp1 != amp2) {
			return (-1);
		}
		if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
		    svd2->anon_index) {
			return (-1);
		}
		ASSERT(amp1 == NULL || amp1->refcnt >= 2);
	}

	/*
	 * If either seg has vpages, create a new merged vpage array.
	 */
	if (vpage1 != NULL || vpage2 != NULL) {
		struct vpage *vp, *evp;

		npages1 = seg_pages(seg1);
		npages2 = seg_pages(seg2);
		nvpsize = vpgtob(npages1 + npages2);

		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
			return (-2);
		}

		if (vpage1 != NULL) {
			bcopy(vpage1, nvpage, vpgtob(npages1));
		} else {
			evp = nvpage + npages1;
			for (vp = nvpage; vp < evp; vp++) {
				VPP_SETPROT(vp, svd1->prot);
				VPP_SETADVICE(vp, svd1->advice);
			}
		}

		if (vpage2 != NULL) {
			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
		} else {
			evp = nvpage + npages1 + npages2;
			for (vp = nvpage + npages1; vp < evp; vp++) {
				VPP_SETPROT(vp, svd2->prot);
				VPP_SETADVICE(vp, svd2->advice);
			}
		}

		if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) {
			ASSERT(svd1->swresv == seg1->s_size);
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			evp = nvpage + npages1;
			for (vp = nvpage; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}

		if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) {
			ASSERT(svd2->swresv == seg2->s_size);
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			vp = nvpage + npages1;
			evp = vp + npages2;
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT((vpage1 != NULL || vpage2 != NULL) ||
	    (svd1->pageswap == 0 && svd2->pageswap == 0));

	/*
	 * If either segment has private pages, create a new merged anon
	 * array. If mergeing shared anon segments just decrement anon map's
	 * refcnt.
	 */
	if (amp1 != NULL && svd1->type == MAP_SHARED) {
		ASSERT(amp1 == amp2 && svd1->vp == NULL);
		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		ASSERT(amp1->refcnt >= 2);
		amp1->refcnt--;
		ANON_LOCK_EXIT(&amp1->a_rwlock);
		svd2->amp = NULL;
	} else if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize;

		ASSERT(svd1->type == MAP_PRIVATE);

		asize = seg1->s_size + seg2->s_size;
		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
			 */
			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
			ASSERT(amp1->refcnt == 1);
			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp1->a_rwlock);
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp2 != NULL) {
			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
			ASSERT(amp2->refcnt == 1);
			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
			    nahp, btop(seg1->s_size), btop(seg2->s_size),
			    ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				if (amp1 != NULL) {
					ANON_LOCK_EXIT(&amp1->a_rwlock);
				}
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp1 != NULL) {
			namp = amp1;
			anon_release(amp1->ahp, btop(amp1->size));
		}
		if (amp2 != NULL) {
			if (namp == NULL) {
				ASSERT(amp1 == NULL);
				namp = amp2;
				anon_release(amp2->ahp, btop(amp2->size));
			} else {
				amp2->refcnt--;
				ASSERT(amp2->refcnt == 0);
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				anonmap_free(amp2);
			}
			svd2->amp = NULL; /* needed for seg_free */
		}
		namp->ahp = nahp;
		namp->size = asize;
		svd1->amp = namp;
		svd1->anon_index = 0;
		ANON_LOCK_EXIT(&namp->a_rwlock);
	}

	/*
	 * Now free the old vpage structures.
	 */
	if (nvpage != NULL) {
		if (vpage1 != NULL) {
			kmem_free(vpage1, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			svd2->vpage = NULL;
			kmem_free(vpage2, vpgtob(npages2));
		}
		if (svd2->pageprot) {
			svd1->pageprot = 1;
		}
		if (svd2->pageadvice) {
			svd1->pageadvice = 1;
		}
		if (svd2->pageswap) {
			svd1->pageswap = 1;
		}
		svd1->vpage = nvpage;
	}

	/* all looks ok, merge segments */
	svd1->swresv += svd2->swresv;
	svd2->swresv = 0;	/* so seg_free doesn't release swap space */
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	return (0);
}
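
/*
 * Example call site (see segvn_create() above): after segvn_extend_prev()
 * succeeds, a follow-up merge with the next segment is attempted as
 *
 *	(void) segvn_concat(pseg, nseg, 0);
 *
 * where amp_cat == 0 forbids merging segments that carry anon maps.
 */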
/*
 * Extend the previous segment (seg1) to include the
 * new segment (seg2 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_prev(struct seg *seg1, struct seg *seg2, struct segvn_crargs *a,
    size_t swresv)
{
	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
	size_t size;
	struct anon_map *amp1;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as));

	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
		return (-1);
	}

	/* second segment is new, try to extend first */
	/* XXX - should also check cred */
	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
	    svd1->type != a->type || svd1->flags != a->flags ||
	    seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
		return (-1);

	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
		return (-1);

	if (svd1->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	amp1 = svd1->amp;
	if (amp1) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 *
		 * Can't grow a MAP_SHARED segment with an anonmap because
		 * there may be existing anon slots where we want to extend
		 * the segment and we wouldn't know what to do with them
		 * (e.g., for tmpfs right thing is to just leave them there,
		 * for /dev/zero they should be cleared out).
		 */
		if (svd1->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		if (amp1->refcnt > 1) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		amp1->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp1->a_rwlock);
	}
	if (svd1->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL)
			return (-1);
		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
		svd1->vpage = new_vpage;

		vp = new_vpage + seg_pages(seg1);
		evp = vp + seg_pages(seg2);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
		if (svd1->pageswap && swresv) {
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(swresv == seg2->s_size);
			vp = new_vpage + seg_pages(seg1);
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT(svd1->vpage != NULL || svd1->pageswap == 0);
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	svd1->swresv += swresv;
	if (svd1->pageprot && (a->prot & PROT_WRITE) &&
	    svd1->type == MAP_SHARED && svd1->vp != NULL &&
	    (svd1->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
		segvn_inval_trcache(svd1->vp);
	}
	return (0);
}
/*
 * Extend the next segment (seg2) to include the
 * new segment (seg1 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_next(struct seg *seg1, struct seg *seg2, struct segvn_crargs *a,
    size_t swresv)
{
	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
	size_t size;
	struct anon_map *amp2;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as));

	if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
		return (-1);
	}

	/* first segment is new, try to extend second */
	/* XXX - should also check cred */
	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
	    svd2->type != a->type || svd2->flags != a->flags ||
	    seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
		return (-1);
	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd2->vp != NULL &&
	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
		return (-1);

	if (svd2->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	amp2 = svd2->amp;
	if (amp2) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared.  This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 *
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 */
		if (svd2->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
		if (amp2->refcnt > 1) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
		    btop(seg2->s_size), btop(seg1->s_size),
		    ANON_NOSLEEP | ANON_GROWDOWN);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		amp2->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp2->a_rwlock);
	}
	if (svd2->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL) {
			/* Not merging segments so adjust anon_index back */
			if (amp2)
				svd2->anon_index += seg_pages(seg1);
			return (-1);
		}
		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
		    vpgtob(seg_pages(seg2)));
		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
		svd2->vpage = new_vpage;

		vp = new_vpage;
		evp = vp + seg_pages(seg1);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
		if (svd2->pageswap && swresv) {
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			ASSERT(swresv == seg1->s_size);
			vp = new_vpage;
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT(svd2->vpage != NULL || svd2->pageswap == 0);
	size = seg1->s_size;
	seg_free(seg1);
	seg2->s_size += size;
	seg2->s_base -= size;
	svd2->offset -= size;
	svd2->swresv += swresv;
	if (svd2->pageprot && (a->prot & PROT_WRITE) &&
	    svd2->type == MAP_SHARED && svd2->vp != NULL &&
	    (svd2->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
		segvn_inval_trcache(svd2->vp);
	}
	return (0);
}
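
/*
 * Worked example (hypothetical values): growing a 64K stack segment based
 * at 0xFFFF0000 downward over a new 16K seg1 leaves s_base = 0xFFFEC000
 * and s_size = 0x14000, with svd2->offset reduced by 0x4000 so that
 * offset + (addr - s_base) still names the same backing bytes.
 */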
/*
 * Duplicate all the pages in the segment. This may break COW sharing for a
 * given page. If the page is marked with inherit zero set, then instead of
 * duplicating the page, we zero the page.
 */
static int
segvn_dup_pages(struct seg *seg, struct seg *newseg)
{
	int error;
	uint_t prot, vpprot;
	page_t *pp;
	struct anon *ap, *newap;
	struct vpage *vpp;
	size_t i;
	caddr_t addr;

	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *newsvd = (struct segvn_data *)newseg->s_data;
	ulong_t old_idx = svd->anon_index;
	ulong_t new_idx = 0;

	i = btopr(seg->s_size);
	addr = seg->s_base;

	/*
	 * XXX break cow sharing using PAGESIZE
	 * pages. They will be relocated into larger
	 * pages at fault time.
	 */
	while (i-- > 0) {
		if ((ap = anon_get_ptr(svd->amp->ahp, old_idx)) != NULL) {
			vpp = &svd->vpage[seg_page(seg, addr)];

			/*
			 * prot need not be computed below 'cause anon_private
			 * is going to ignore it anyway as child doesn't inherit
			 * pagelock from parent.
			 */
			prot = svd->pageprot ? VPP_PROT(vpp) : svd->prot;

			/*
			 * Check whether we should zero this or dup it.
			 */
			if (svd->svn_inz == SEGVN_INZ_ALL ||
			    (svd->svn_inz == SEGVN_INZ_VPP &&
			    VPP_ISINHZERO(vpp))) {
				pp = anon_zero(newseg, addr, &newap,
				    newsvd->cred);
			} else {
				page_t *anon_pl[1+1];

				error = anon_getpage(&ap, &vpprot, anon_pl,
				    PAGESIZE, seg, addr, S_READ, svd->cred);
				if (error != 0)
					return (error);

				pp = anon_private(&newap, newseg, addr, prot,
				    anon_pl[0], 0, newsvd->cred);
			}
			if (pp == NULL) {
				return (ENOMEM);
			}
			(void) anon_set_ptr(newsvd->amp->ahp, new_idx, newap,
			    ANON_SLEEP);
			page_unlock(pp);
		}
		addr += PAGESIZE;
		old_idx++;
		new_idx++;
	}

	return (0);
}
static int
segvn_dup(struct seg *seg, struct seg *newseg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *newsvd;
	pgcnt_t npages = seg_pages(seg);
	int error = 0;
	size_t len;
	struct anon_map *amp;

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
	ASSERT(newseg->s_as->a_proc->p_parent == curproc);

	/*
	 * If segment has anon reserved, reserve more for the new seg.
	 * For a MAP_NORESERVE segment swresv will be a count of all the
	 * allocated anon slots; thus we reserve for the child as many slots
	 * as the parent has allocated. This semantic prevents the child or
	 * parent from dieing during a copy-on-write fault caused by trying
	 * to write a shared pre-existing anon page.
	 */
	if ((len = svd->swresv) != 0) {
		if (anon_resv(svd->swresv) == 0)
			return (ENOMEM);
	}

	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);

	newseg->s_ops = &segvn_ops;
	newseg->s_data = (void *)newsvd;
	newseg->s_szc = seg->s_szc;

	newsvd->seg = newseg;
	if ((newsvd->vp = svd->vp) != NULL) {
		VN_HOLD(svd->vp);
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, svd->vp);
	}
	newsvd->offset = svd->offset;
	newsvd->prot = svd->prot;
	newsvd->maxprot = svd->maxprot;
	newsvd->pageprot = svd->pageprot;
	newsvd->type = svd->type;
	newsvd->cred = svd->cred;
	crhold(newsvd->cred);
	newsvd->advice = svd->advice;
	newsvd->pageadvice = svd->pageadvice;
	newsvd->svn_inz = svd->svn_inz;
	newsvd->swresv = svd->swresv;
	newsvd->pageswap = svd->pageswap;
	newsvd->flags = svd->flags;
	newsvd->softlockcnt = 0;
	newsvd->softlockcnt_sbase = 0;
	newsvd->softlockcnt_send = 0;
	newsvd->policy_info = svd->policy_info;
	newsvd->rcookie = HAT_INVALID_REGION_COOKIE;

	if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
		/*
		 * Not attaching to a shared anon object.
		 */
		ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
		    svd->tr_state == SEGVN_TR_OFF);
		if (svd->tr_state == SEGVN_TR_ON) {
			ASSERT(newsvd->vp != NULL && amp != NULL);
			newsvd->tr_state = SEGVN_TR_INIT;
		} else {
			newsvd->tr_state = svd->tr_state;
		}
		newsvd->amp = NULL;
		newsvd->anon_index = 0;
	} else {
		/* regions for now are only used on pure vnode segments */
		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		newsvd->tr_state = SEGVN_TR_OFF;
		if (svd->type == MAP_SHARED) {
			ASSERT(svd->svn_inz == SEGVN_INZ_NONE);
			newsvd->amp = amp;
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			amp->refcnt++;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			newsvd->anon_index = svd->anon_index;
		} else {
			int reclaim = 1;

			/*
			 * Allocate and initialize new anon_map structure.
			 */
			newsvd->amp = anonmap_alloc(newseg->s_size, 0,
			    ANON_SLEEP);
			newsvd->amp->a_szc = newseg->s_szc;
			newsvd->anon_index = 0;
			ASSERT(svd->svn_inz == SEGVN_INZ_NONE ||
			    svd->svn_inz == SEGVN_INZ_ALL ||
			    svd->svn_inz == SEGVN_INZ_VPP);

			/*
			 * We don't have to acquire the anon_map lock
			 * for the new segment (since it belongs to an
			 * address space that is still not associated
			 * with any process), or the segment in the old
			 * address space (since all threads in it
			 * are stopped while duplicating the address space).
			 */

			/*
			 * The goal of the following code is to make sure that
			 * softlocked pages do not end up as copy on write
			 * pages.  This would cause problems where one
			 * thread writes to a page that is COW and a different
			 * thread in the same process has softlocked it.  The
			 * softlock lock would move away from this process
			 * because the write would cause this process to get
			 * a copy (without the softlock).
			 *
			 * The strategy here is to just break the
			 * sharing on pages that could possibly be
			 * softlocked.
			 *
			 * In addition, if any pages have been marked that they
			 * should be inherited as zero, then we immediately go
			 * ahead and break COW and zero them. In the case of a
			 * softlocked page that should be inherited zero, we
			 * break COW and just get a zero page.
			 */
retry:
			if (svd->softlockcnt ||
			    svd->svn_inz != SEGVN_INZ_NONE) {
				/*
				 * The softlock count might be non zero
				 * because some pages are still stuck in the
				 * cache for lazy reclaim. Flush the cache
				 * now. This should drop the count to zero.
				 * [or there is really I/O going on to these
				 * pages]. Note, we have the writers lock so
				 * nothing gets inserted during the flush.
				 */
				if (svd->softlockcnt && reclaim == 1) {
					segvn_purge(seg);
					reclaim = 0;
					goto retry;
				}

				error = segvn_dup_pages(seg, newseg);
				if (error != 0) {
					newsvd->vpage = NULL;
					goto out;
				}
			} else {	/* common case */
				if (seg->s_szc != 0) {
					/*
					 * If at least one of anon slots of a
					 * large page exists then make sure
					 * all anon slots of a large page
					 * exist to avoid partial cow sharing
					 * of a large page in the future.
					 */
					anon_dup_fill_holes(amp->ahp,
					    svd->anon_index, newsvd->amp->ahp,
					    0, seg->s_size, seg->s_szc,
					    svd->vp != NULL);
				} else {
					anon_dup(amp->ahp, svd->anon_index,
					    newsvd->amp->ahp, 0, seg->s_size);
				}

				hat_clrattr(seg->s_as->a_hat, seg->s_base,
				    seg->s_size, PROT_WRITE);
			}
		}
	}
	/*
	 * If necessary, create a vpage structure for the new segment.
	 * Do not copy any page lock indications.
	 */
	if (svd->vpage != NULL) {
		uint_t i;
		struct vpage *ovp = svd->vpage;
		struct vpage *nvp;

		nvp = newsvd->vpage =
		    kmem_alloc(vpgtob(npages), KM_SLEEP);
		for (i = 0; i < npages; i++) {
			*nvp = *ovp++;
			VPP_CLRPPLOCK(nvp++);
		}
	} else
		newsvd->vpage = NULL;

	/* Inform the vnode of the new mapping */
	if (newsvd->vp != NULL) {
		error = fop_addmap(newsvd->vp, (offset_t)newsvd->offset,
		    newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
		    newsvd->maxprot, newsvd->type, newsvd->cred, NULL);
	}
out:
	if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
		ASSERT(newsvd->amp == NULL);
		ASSERT(newsvd->tr_state == SEGVN_TR_OFF);
		newsvd->rcookie = svd->rcookie;
		hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
	}
	return (error);
}
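
/*
 * Context note (not from this file): segvn_dup() is reached through the
 * .dup entry of segvn_ops when fork() duplicates an address space via
 * as_dup(), which walks the parent's segments and dispatches the seg op
 * for each one.
 */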
/*
 * callback function to invoke free_vp_pages() for only those pages actually
 * processed by the HAT when a shared region is destroyed.
 */
extern int free_pages;

static void
segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
    size_t r_size, void *r_obj, uoff_t r_objoff)
{
	uoff_t off;
	size_t len;
	vnode_t *vp = (vnode_t *)r_obj;

	ASSERT(eaddr > saddr);
	ASSERT(saddr >= r_saddr);
	ASSERT(saddr < r_saddr + r_size);
	ASSERT(eaddr > r_saddr);
	ASSERT(eaddr <= r_saddr + r_size);
	ASSERT(vp != NULL);

	if (!free_pages) {
		return;
	}

	len = eaddr - saddr;
	off = (saddr - r_saddr) + r_objoff;
	free_vp_pages(&vp->v_object, off, len);
}
/*
 * callback function used by segvn_unmap to invoke free_vp_pages() for only
 * those pages actually processed by the HAT
 */
static void
segvn_hat_unload_callback(hat_callback_t *cb)
{
	struct seg *seg = cb->hcb_data;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	size_t len;
	uoff_t off;

	ASSERT(svd->vp != NULL);
	ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
	ASSERT(cb->hcb_start_addr >= seg->s_base);

	len = cb->hcb_end_addr - cb->hcb_start_addr;
	off = cb->hcb_start_addr - seg->s_base;
	free_vp_pages(&svd->vp->v_object, svd->offset + off, len);
}
/*
 * This function determines the number of bytes of swap reserved by
 * a segment for which per-page accounting is present. It is used to
 * calculate the correct value of a segvn_data's swresv.
 */
static size_t
segvn_count_swap_by_vpages(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vp, *evp;
	size_t nswappages = 0;

	ASSERT(svd->pageswap);
	ASSERT(svd->vpage != NULL);

	evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];

	for (vp = svd->vpage; vp < evp; vp++) {
		if (VPP_ISSWAPRES(vp))
			nswappages++;
	}

	return (nswappages << PAGESHIFT);
}
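
/*
 * Worked example: if exactly three of a segment's vpages have
 * VPP_ISSWAPRES set, the function returns 3 << PAGESHIFT, i.e.
 * 12288 bytes with 4K pages.
 */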
1839 segvn_unmap(struct seg
*seg
, caddr_t addr
, size_t len
)
1841 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
1842 struct segvn_data
*nsvd
;
1844 struct anon_map
*amp
;
1845 pgcnt_t opages
; /* old segment size in pages */
1846 pgcnt_t npages
; /* new segment size in pages */
1847 pgcnt_t dpages
; /* pages being deleted (unmapped) */
1848 hat_callback_t callback
; /* used for free_vp_pages() */
1849 hat_callback_t
*cbp
= NULL
;
1856 * We don't need any segment level locks for "segvn" data
1857 * since the address space is "write" locked.
1859 ASSERT(seg
->s_as
&& AS_WRITE_HELD(seg
->s_as
));
1862 * Fail the unmap if pages are SOFTLOCKed through this mapping.
1863 * softlockcnt is protected from change by the as write lock.
1866 if (svd
->softlockcnt
> 0) {
1867 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
1870 * If this is shared segment non 0 softlockcnt
1871 * means locked pages are still in use.
1873 if (svd
->type
== MAP_SHARED
) {
1878 * since we do have the writers lock nobody can fill
1879 * the cache during the purge. The flush either succeeds
1880 * or we still have pending I/Os.
1891 * Check for bad sizes
1893 if (addr
< seg
->s_base
|| addr
+ len
> seg
->s_base
+ seg
->s_size
||
1894 (len
& PAGEOFFSET
) || ((uintptr_t)addr
& PAGEOFFSET
)) {
1895 panic("segvn_unmap");
1899 if (seg
->s_szc
!= 0) {
1900 size_t pgsz
= page_get_pagesize(seg
->s_szc
);
1902 if (!IS_P2ALIGNED(addr
, pgsz
) || !IS_P2ALIGNED(len
, pgsz
)) {
1903 ASSERT(seg
->s_base
!= addr
|| seg
->s_size
!= len
);
1904 if (HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
)) {
1905 ASSERT(svd
->amp
== NULL
);
1906 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
1907 hat_leave_region(seg
->s_as
->a_hat
,
1908 svd
->rcookie
, HAT_REGION_TEXT
);
1909 svd
->rcookie
= HAT_INVALID_REGION_COOKIE
;
1911 * could pass a flag to segvn_demote_range()
1912 * below to tell it not to do any unloads but
1913 * this case is rare enough to not bother for
1916 } else if (svd
->tr_state
== SEGVN_TR_INIT
) {
1917 svd
->tr_state
= SEGVN_TR_OFF
;
1918 } else if (svd
->tr_state
== SEGVN_TR_ON
) {
1919 ASSERT(svd
->amp
!= NULL
);
1920 segvn_textunrepl(seg
, 1);
1921 ASSERT(svd
->amp
== NULL
);
1922 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
1924 VM_STAT_ADD(segvnvmstats
.demoterange
[0]);
1925 err
= segvn_demote_range(seg
, addr
, len
, SDR_END
, 0);
1933 /* Inform the vnode of the unmapping. */
1937 error
= fop_delmap(svd
->vp
,
1938 (offset_t
)svd
->offset
+ (uintptr_t)(addr
- seg
->s_base
),
1939 seg
->s_as
, addr
, len
, svd
->prot
, svd
->maxprot
,
1940 svd
->type
, svd
->cred
, NULL
);
1942 if (error
== EAGAIN
)
1947 * Remove any page locks set through this mapping.
1948 * If text replication is not off no page locks could have been
1949 * established via this mapping.
1951 if (svd
->tr_state
== SEGVN_TR_OFF
) {
1952 (void) segvn_lockop(seg
, addr
, len
, 0, MC_UNLOCK
, NULL
, 0);
1955 if (HAT_IS_REGION_COOKIE_VALID(svd
->rcookie
)) {
1956 ASSERT(svd
->amp
== NULL
);
1957 ASSERT(svd
->tr_state
== SEGVN_TR_OFF
);
1958 ASSERT(svd
->type
== MAP_PRIVATE
);
1959 hat_leave_region(seg
->s_as
->a_hat
, svd
->rcookie
,
1961 svd
->rcookie
= HAT_INVALID_REGION_COOKIE
;
1962 } else if (svd
->tr_state
== SEGVN_TR_ON
) {
1963 ASSERT(svd
->amp
!= NULL
);
1964 ASSERT(svd
->pageprot
== 0 && !(svd
->prot
& PROT_WRITE
));
1965 segvn_textunrepl(seg
, 1);
1966 ASSERT(svd
->amp
== NULL
&& svd
->tr_state
== SEGVN_TR_OFF
);
1968 if (svd
->tr_state
!= SEGVN_TR_OFF
) {
1969 ASSERT(svd
->tr_state
== SEGVN_TR_INIT
);
1970 svd
->tr_state
= SEGVN_TR_OFF
;
1973 * Unload any hardware translations in the range to be taken
1974 * out. Use a callback to invoke free_vp_pages() effectively.
1976 if (svd
->vp
!= NULL
&& free_pages
!= 0) {
1977 callback
.hcb_data
= seg
;
1978 callback
.hcb_function
= segvn_hat_unload_callback
;
1981 hat_unload_callback(seg
->s_as
->a_hat
, addr
, len
,
1982 HAT_UNLOAD_UNMAP
, cbp
);
1984 if (svd
->type
== MAP_SHARED
&& svd
->vp
!= NULL
&&
1985 (svd
->vp
->v_flag
& VVMEXEC
) &&
1986 ((svd
->prot
& PROT_WRITE
) || svd
->pageprot
)) {
1987 segvn_inval_trcache(svd
->vp
);
1992 * Check for entire segment
1994 if (addr
== seg
->s_base
&& len
== seg
->s_size
) {
1999 opages
= seg_pages(seg
);
2001 npages
= opages
- dpages
;
2003 ASSERT(amp
== NULL
|| amp
->a_szc
>= seg
->s_szc
);
2006 * Check for beginning of segment
2008 if (addr
== seg
->s_base
) {
2009 if (svd
->vpage
!= NULL
) {
2011 struct vpage
*ovpage
;
2013 ovpage
= svd
->vpage
; /* keep pointer to vpage */
2015 nbytes
= vpgtob(npages
);
2016 svd
->vpage
= kmem_alloc(nbytes
, KM_SLEEP
);
2017 bcopy(&ovpage
[dpages
], svd
->vpage
, nbytes
);
2019 /* free up old vpage */
2020 kmem_free(ovpage
, vpgtob(opages
));
2023 ANON_LOCK_ENTER(&
->a_rwlock
, RW_WRITER
);
2024 if (amp
->refcnt
== 1 || svd
->type
== MAP_PRIVATE
) {
2026 * Shared anon map is no longer in use. Before
2027 * freeing its pages purge all entries from
2028 * pcache that belong to this amp.
2030 if (svd
->type
== MAP_SHARED
) {
2031 ASSERT(amp
->refcnt
== 1);
2032 ASSERT(svd
->softlockcnt
== 0);
2036 * Free up now unused parts of anon_map array.
2038 if (amp
->a_szc
== seg
->s_szc
) {
2039 if (seg
->s_szc
!= 0) {
2040 anon_free_pages(amp
->ahp
,
2041 svd
->anon_index
, len
,
2049 ASSERT(svd
->type
== MAP_SHARED
);
2050 ASSERT(amp
->a_szc
> seg
->s_szc
);
2051 anon_shmap_free_pages(amp
,
2052 svd
->anon_index
, len
);
2056 * Unreserve swap space for the
2057 * unmapped chunk of this segment in
2058 * case it's MAP_SHARED
2060 if (svd
->type
== MAP_SHARED
) {
2061 anon_unresv_zone(len
,
2062 seg
->s_as
->a_proc
->p_zone
);
2066 ANON_LOCK_EXIT(&
->a_rwlock
);
2067 svd
->anon_index
+= dpages
;
2069 if (svd
->vp
!= NULL
)
2076 if (svd
->flags
& MAP_NORESERVE
) {
2078 oswresv
= svd
->swresv
;
2080 svd
->swresv
= ptob(anon_pages(amp
->ahp
,
2081 svd
->anon_index
, npages
));
2082 anon_unresv_zone(oswresv
- svd
->swresv
,
2083 seg
->s_as
->a_proc
->p_zone
);
2084 if (SEG_IS_PARTIAL_RESV(seg
))
2085 seg
->s_as
->a_resvsize
-= oswresv
-
2090 if (svd
->pageswap
) {
2091 oswresv
= svd
->swresv
;
2093 segvn_count_swap_by_vpages(seg
);
2094 ASSERT(oswresv
>= svd
->swresv
);
2095 unlen
= oswresv
- svd
->swresv
;
2098 ASSERT(svd
->swresv
== seg
->s_size
);
2101 anon_unresv_zone(unlen
,
2102 seg
->s_as
->a_proc
->p_zone
);
	/*
	 * Check for end of segment
	 */
	if (addr + len == seg->s_base + seg->s_size) {
		if (svd->vpage != NULL) {
			size_t nbytes;
			struct vpage *ovpage;

			ovpage = svd->vpage;	/* keep pointer to vpage */

			nbytes = vpgtob(npages);
			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
			bcopy(ovpage, svd->vpage, nbytes);

			/* free up old vpage */
			kmem_free(ovpage, vpgtob(opages));
		}
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
				/*
				 * Free up now unused parts of anon_map array.
				 */
				ulong_t an_idx = svd->anon_index + npages;

				/*
				 * Shared anon map is no longer in use. Before
				 * freeing its pages purge all entries from
				 * pcache that belong to this amp.
				 */
				if (svd->type == MAP_SHARED) {
					ASSERT(amp->refcnt == 1);
					ASSERT(svd->softlockcnt == 0);
					anonmap_purge(amp);
				}

				if (amp->a_szc == seg->s_szc) {
					if (seg->s_szc != 0) {
						anon_free_pages(amp->ahp,
						    an_idx, len,
						    seg->s_szc);
					} else {
						anon_free(amp->ahp, an_idx,
						    len);
					}
				} else {
					ASSERT(svd->type == MAP_SHARED);
					ASSERT(amp->a_szc > seg->s_szc);
					anon_shmap_free_pages(amp,
					    an_idx, len);
				}

				/*
				 * Unreserve swap space for the
				 * unmapped chunk of this segment in
				 * case it's MAP_SHARED
				 */
				if (svd->type == MAP_SHARED) {
					anon_unresv_zone(len,
					    seg->s_as->a_proc->p_zone);
					amp->swresv -= len;
				}
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}

		seg->s_size -= len;

		if (svd->swresv) {
			if (svd->flags & MAP_NORESERVE) {
				ASSERT(amp);
				oswresv = svd->swresv;
				svd->swresv = ptob(anon_pages(amp->ahp,
				    svd->anon_index, npages));
				anon_unresv_zone(oswresv - svd->swresv,
				    seg->s_as->a_proc->p_zone);
				if (SEG_IS_PARTIAL_RESV(seg))
					seg->s_as->a_resvsize -= oswresv -
					    svd->swresv;
			} else {
				size_t unlen;

				if (svd->pageswap) {
					oswresv = svd->swresv;
					svd->swresv =
					    segvn_count_swap_by_vpages(seg);
					ASSERT(oswresv >= svd->swresv);
					unlen = oswresv - svd->swresv;
				} else {
					svd->swresv -= len;
					ASSERT(svd->swresv == seg->s_size);
					unlen = len;
				}
				anon_unresv_zone(unlen,
				    seg->s_as->a_proc->p_zone);
			}
		}

		return (0);
	}
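
	/*
	 * The end-of-segment case above mirrors the beginning-of-segment
	 * case, except that the surviving vpage entries are copied from the
	 * front of the old array and the anon range being freed starts at
	 * svd->anon_index + npages instead of at svd->anon_index.
	 */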
	/*
	 * The section to go is in the middle of the segment,
	 * have to make it into two segments.  nseg is made for
	 * the high end while seg is cut down at the low end.
	 */
	nbase = addr + len;				/* new seg base */
	nsize = (seg->s_base + seg->s_size) - nbase;	/* new seg size */
	seg->s_size = addr - seg->s_base;		/* shrink old seg */
	nseg = seg_alloc(seg->s_as, nbase, nsize);
	if (nseg == NULL) {
		panic("segvn_unmap seg_alloc");
	}
	nseg->s_ops = seg->s_ops;
	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
	nseg->s_data = (void *)nsvd;
	nseg->s_szc = seg->s_szc;
	*nsvd = *svd;
	nsvd->seg = nseg;
	nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
	nsvd->swresv = 0;
	nsvd->softlockcnt = 0;
	nsvd->softlockcnt_sbase = 0;
	nsvd->softlockcnt_send = 0;
	nsvd->svn_inz = svd->svn_inz;
	ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);

	if (svd->vp != NULL) {
		VN_HOLD(nsvd->vp);
		if (nsvd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, nsvd->vp);
	}
	crhold(svd->cred);

	if (svd->vpage == NULL) {
		nsvd->vpage = NULL;
	} else {
		/* need to split vpage into two arrays */
		size_t nbytes;
		struct vpage *ovpage;

		ovpage = svd->vpage;		/* keep pointer to vpage */

		npages = seg_pages(seg);	/* seg has shrunk */
		nbytes = vpgtob(npages);
		svd->vpage = kmem_alloc(nbytes, KM_SLEEP);

		bcopy(ovpage, svd->vpage, nbytes);

		npages = seg_pages(nseg);
		nbytes = vpgtob(npages);
		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);

		bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);

		/* free up old vpage */
		kmem_free(ovpage, vpgtob(opages));
	}

	if (amp == NULL) {
		nsvd->amp = NULL;
		nsvd->anon_index = 0;
	} else {
		/*
		 * Need to create a new anon map for the new segment.
		 * We'll also allocate a new smaller array for the old
		 * smaller segment to save space.
		 */
		opages = btop((uintptr_t)(addr - seg->s_base));
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
			/*
			 * Free up now unused parts of anon_map array.
			 */
			ulong_t an_idx = svd->anon_index + opages;

			/*
			 * Shared anon map is no longer in use. Before
			 * freeing its pages purge all entries from
			 * pcache that belong to this amp.
			 */
			if (svd->type == MAP_SHARED) {
				ASSERT(amp->refcnt == 1);
				ASSERT(svd->softlockcnt == 0);
				anonmap_purge(amp);
			}

			if (amp->a_szc == seg->s_szc) {
				if (seg->s_szc != 0) {
					anon_free_pages(amp->ahp, an_idx, len,
					    seg->s_szc);
				} else {
					anon_free(amp->ahp, an_idx,
					    len);
				}
			} else {
				ASSERT(svd->type == MAP_SHARED);
				ASSERT(amp->a_szc > seg->s_szc);
				anon_shmap_free_pages(amp, an_idx, len);
			}

			/*
			 * Unreserve swap space for the
			 * unmapped chunk of this segment in
			 * case it's MAP_SHARED
			 */
			if (svd->type == MAP_SHARED) {
				anon_unresv_zone(len,
				    seg->s_as->a_proc->p_zone);
				amp->swresv -= len;
			}
		}
		nsvd->anon_index = svd->anon_index +
		    btop((uintptr_t)(nseg->s_base - seg->s_base));
		if (svd->type == MAP_SHARED) {
			amp->refcnt++;
			nsvd->amp = amp;
		} else {
			struct anon_map *namp;
			struct anon_hdr *nahp;

			ASSERT(svd->type == MAP_PRIVATE);
			nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
			namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
			namp->a_szc = seg->s_szc;
			(void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
			    0, btop(seg->s_size), ANON_SLEEP);
			(void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
			    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
			anon_release(amp->ahp, btop(amp->size));
			svd->anon_index = 0;
			nsvd->anon_index = 0;
			amp->ahp = nahp;
			amp->size = seg->s_size;
			nsvd->amp = namp;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
	}
	if (svd->swresv) {
		if (svd->flags & MAP_NORESERVE) {
			ASSERT(amp);
			oswresv = svd->swresv;
			svd->swresv = ptob(anon_pages(amp->ahp,
			    svd->anon_index, btop(seg->s_size)));
			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
			    nsvd->anon_index, btop(nseg->s_size)));
			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
			anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv),
			    seg->s_as->a_proc->p_zone);
			if (SEG_IS_PARTIAL_RESV(seg))
				seg->s_as->a_resvsize -= oswresv -
				    (svd->swresv + nsvd->swresv);
		} else {
			size_t unlen;

			if (svd->pageswap) {
				oswresv = svd->swresv;
				svd->swresv = segvn_count_swap_by_vpages(seg);
				nsvd->swresv = segvn_count_swap_by_vpages(nseg);
				ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
				unlen = oswresv - (svd->swresv + nsvd->swresv);
			} else {
				if (seg->s_size + nseg->s_size + len !=
				    svd->swresv) {
					panic("segvn_unmap: cannot split "
					    "swap reservation");
				}
				svd->swresv = seg->s_size;
				nsvd->swresv = nseg->s_size;
				unlen = len;
			}
			anon_unresv_zone(unlen,
			    seg->s_as->a_proc->p_zone);
		}
	}

	return (0);			/* I'm glad that's all over with! */
}
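
/*
 * To recap the split case above: seg is shrunk to cover [s_base, addr),
 * nseg is created for [addr + len, old end), and for MAP_PRIVATE mappings
 * the old anon_map is carved into two new anon arrays so that each segment
 * references only its own slots.
 */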
static void
segvn_free(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	pgcnt_t npages = seg_pages(seg);
	struct anon_map *amp;
	size_t len;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
	ASSERT(svd->tr_state == SEGVN_TR_OFF);

	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);

	/*
	 * Be sure to unlock pages. XXX Why do things get free'ed instead
	 * of unmapped?
	 */
	(void) segvn_lockop(seg, seg->s_base, seg->s_size,
	    0, MC_UNLOCK, NULL, 0);

	/*
	 * Deallocate the vpage and anon pointers if necessary and possible.
	 */
	if (svd->vpage != NULL) {
		kmem_free(svd->vpage, vpgtob(npages));
		svd->vpage = NULL;
	}
	if ((amp = svd->amp) != NULL) {
		/*
		 * If there are no more references to this anon_map
		 * structure, then deallocate the structure after freeing
		 * up all the anon slot pointers that we can.
		 */
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		ASSERT(amp->a_szc >= seg->s_szc);
		if (--amp->refcnt == 0) {
			if (svd->type == MAP_PRIVATE) {
				/*
				 * Private - we only need to anon_free
				 * the part that this segment refers to.
				 */
				if (seg->s_szc != 0) {
					anon_free_pages(amp->ahp,
					    svd->anon_index, seg->s_size,
					    seg->s_szc);
				} else {
					anon_free(amp->ahp, svd->anon_index,
					    seg->s_size);
				}
			} else {
				/*
				 * Shared anon map is no longer in use. Before
				 * freeing its pages purge all entries from
				 * pcache that belong to this amp.
				 */
				ASSERT(svd->softlockcnt == 0);
				anonmap_purge(amp);

				/*
				 * Shared - anon_free the entire
				 * anon_map's worth of stuff and
				 * release any swap reservation.
				 */
				if (amp->a_szc != 0) {
					anon_shmap_free_pages(amp, 0,
					    amp->size);
				} else {
					anon_free(amp->ahp, 0, amp->size);
				}
				if ((len = amp->swresv) != 0) {
					anon_unresv_zone(len,
					    seg->s_as->a_proc->p_zone);
				}
			}
			svd->amp = NULL;
			ANON_LOCK_EXIT(&amp->a_rwlock);
			anonmap_free(amp);
		} else if (svd->type == MAP_PRIVATE) {
			/*
			 * We had a private mapping which still has
			 * a held anon_map so just free up all the
			 * anon slot pointers that we were using.
			 */
			if (seg->s_szc != 0) {
				anon_free_pages(amp->ahp, svd->anon_index,
				    seg->s_size, seg->s_szc);
			} else {
				anon_free(amp->ahp, svd->anon_index,
				    seg->s_size);
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}

	/*
	 * Release swap reservation.
	 */
	if ((len = svd->swresv) != 0) {
		anon_unresv_zone(svd->swresv,
		    seg->s_as->a_proc->p_zone);
		if (SEG_IS_PARTIAL_RESV(seg))
			seg->s_as->a_resvsize -= svd->swresv;
		svd->swresv = 0;
	}
	/*
	 * Release claim on vnode, credentials, and finally free the
	 * private data.
	 */
	if (svd->vp != NULL) {
		if (svd->type == MAP_SHARED)
			lgrp_shm_policy_fini(NULL, svd->vp);
		VN_RELE(svd->vp);
		svd->vp = NULL;
	}
	crfree(svd->cred);
	svd->pageprot = 0;
	svd->pageadvice = 0;
	svd->pageswap = 0;
	svd->cred = NULL;

	/*
	 * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's
	 * still working with this segment without holding as lock (in case
	 * it's called by pcache async thread).
	 */
	ASSERT(svd->softlockcnt == 0);
	mutex_enter(&svd->segfree_syncmtx);
	mutex_exit(&svd->segfree_syncmtx);

	seg->s_data = NULL;
	kmem_cache_free(segvn_cache, svd);
}
/*
 * Do a F_SOFTUNLOCK call over the range requested.  The range must have
 * already been F_SOFTLOCK'ed.
 * Caller must always match addr and len of a softunlock with a previous
 * softlock with exactly the same addr and len.
 */
static void
segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	page_t *pp;
	caddr_t adr;
	struct vnode *vp;
	uoff_t offset;
	ulong_t anon_index = 0;
	struct anon_map *amp;
	struct anon *ap = NULL;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));

	if ((amp = svd->amp) != NULL)
		anon_index = svd->anon_index + seg_page(seg, addr);

	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie);
	} else {
		hat_unlock(seg->s_as->a_hat, addr, len);
	}
	for (adr = addr; adr < addr + len; adr += PAGESIZE) {
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			if ((ap = anon_get_ptr(amp->ahp, anon_index++))
			    != NULL) {
				swap_xlate(ap, &vp, &offset);
			} else {
				vp = svd->vp;
				offset = svd->offset +
				    (uintptr_t)(adr - seg->s_base);
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			vp = svd->vp;
			offset = svd->offset +
			    (uintptr_t)(adr - seg->s_base);
		}

		/*
		 * Use page_find() instead of page_lookup() to
		 * find the page since we know that it is locked.
		 */
		pp = page_find(&vp->v_object, offset);
		if (pp == NULL) {
			panic(
			    "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
			    (void *)adr, (void *)ap, (void *)vp, offset);
			/*NOTREACHED*/
		}

		if (rw == S_WRITE) {
			hat_setrefmod(pp);
			if (seg->s_as->a_vbits)
				hat_setstat(seg->s_as, adr, PAGESIZE,
				    P_REF | P_MOD);
		} else if (rw != S_OTHER) {
			hat_setref(pp);
			if (seg->s_as->a_vbits)
				hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
		}
		page_unlock(pp);
	}
	ASSERT(svd->softlockcnt >= btop(len));
	if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) {
		/*
		 * All SOFTLOCKS are gone. Wakeup any waiting
		 * unmappers so they can try again to unmap.
		 * Check for waiters first without the mutex
		 * held so we don't always grab the mutex on
		 * softunlocks.
		 */
		if (AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}
}
#define	PAGE_HANDLED	((page_t *)-1)
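
/*
 * PAGE_HANDLED is a sentinel stored into a pl[] slot once segvn_faultpage()
 * has consumed that page, so segvn_pagelist_rele() skips it during cleanup.
 */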
/*
 * Release all the pages in the NULL terminated ppp list
 * which haven't already been converted to PAGE_HANDLED.
 */
static void
segvn_pagelist_rele(page_t **ppp)
{
	for (; *ppp != NULL; ppp++) {
		if (*ppp != PAGE_HANDLED)
			page_unlock(*ppp);
	}
}

static int stealcow = 1;

/*
 * Workaround for viking chip bug.  See bug id 1220902.
 * To fix this down in pagefault() would require importing so
 * much of the as and segvn code as to be unmaintainable.
 */
int enable_mbit_wa = 0;
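
/*
 * When enable_mbit_wa is nonzero the fault paths below simulate the
 * modify bit in software: a write fault eagerly marks the page modified,
 * while a read fault on a not-yet-modified page withholds PROT_WRITE so
 * that the first store refaults, e.g.:
 *
 *	if (rw == S_WRITE)
 *		hat_setmod(pp);
 *	else if (!hat_ismod(pp))
 *		prot &= ~PROT_WRITE;
 */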
/*
 * Handles all the dirty work of getting the right
 * anonymous pages and loading up the translations.
 * This routine is called only from segvn_fault()
 * when looping over the range of addresses requested.
 *
 * The basic algorithm here is:
 *	If this is an anon_zero case
 *		Call anon_zero to allocate page
 *		Load up translation
 *		Return
 *	endif
 *	If this is an anon page
 *		Use anon_getpage to get the page
 *	else
 *		Find page in pl[] list passed in
 *	endif
 *	If not a copy-on-write case
 *		Load up the translation to the page
 *		return
 *	endif
 *	Call anon_private to handle cow
 *	Load up (writable) translation to new page
 */
static faultcode_t
segvn_faultpage(
	struct hat *hat,		/* the hat to use for mapping */
	struct seg *seg,		/* seg_vn of interest */
	caddr_t addr,			/* address in as */
	uoff_t off,			/* offset in vp */
	struct vpage *vpage,		/* pointer to vpage for vp, off */
	page_t *pl[],			/* object source page pointer */
	uint_t vpprot,			/* access allowed to object pages */
	enum fault_type type,		/* type of fault */
	enum seg_rw rw,			/* type of access at fault */
	int brkcow)			/* we may need to break cow */
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	page_t *pp, **ppp;
	uint_t pageflags = 0;
	page_t *anon_pl[1 + 1];
	page_t *opp = NULL;		/* original page */
	uint_t prot;
	int err;
	int cow;
	int claim;
	int steal = 0;
	ulong_t anon_index = 0;
	struct anon *ap, *oldap;
	struct anon_map *amp;
	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
	int anon_lock = 0;
	anon_sync_obj_t cookie;

	if (svd->flags & MAP_TEXT) {
		hat_flag |= HAT_LOAD_TEXT;
	}

	ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
	ASSERT(seg->s_szc == 0);
	ASSERT(svd->tr_state != SEGVN_TR_INIT);

	/*
	 * Initialize protection value for this page.
	 * If we have per page protection values check it now.
	 */
	if (svd->pageprot) {
		uint_t protchk;

		switch (rw) {
		case S_READ:
			protchk = PROT_READ;
			break;
		case S_WRITE:
			protchk = PROT_WRITE;
			break;
		case S_EXEC:
			protchk = PROT_EXEC;
			break;
		case S_OTHER:
		default:
			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
			break;
		}

		prot = VPP_PROT(vpage);
		if ((prot & protchk) == 0)
			return (FC_PROT);	/* illegal access type */
	} else {
		prot = svd->prot;
	}

	if (type == F_SOFTLOCK) {
		atomic_inc_ulong((ulong_t *)&svd->softlockcnt);
	}

	/*
	 * Always acquire the anon array lock to prevent 2 threads from
	 * allocating separate anon slots for the same "addr".
	 */
	if ((amp = svd->amp) != NULL) {
		ASSERT(RW_READ_HELD(&amp->a_rwlock));
		anon_index = svd->anon_index + seg_page(seg, addr);
		anon_array_enter(amp, anon_index, &cookie);
		anon_lock = 1;
	}

	if (svd->vp == NULL && amp != NULL) {
		if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) {
			/*
			 * Allocate a (normally) writable anonymous page of
			 * zeroes. If no advance reservations, reserve now.
			 */
			if (svd->flags & MAP_NORESERVE) {
				if (anon_resv_zone(ptob(1),
				    seg->s_as->a_proc->p_zone)) {
					atomic_add_long(&svd->swresv, ptob(1));
					atomic_add_long(&seg->s_as->a_resvsize,
					    ptob(1));
				} else {
					err = ENOMEM;
					goto out;
				}
			}
			if ((pp = anon_zero(seg, addr, &ap,
			    svd->cred)) == NULL) {
				err = ENOMEM;
				goto out;	/* out of swap space */
			}
			/*
			 * Re-acquire the anon_map lock and
			 * initialize the anon array entry.
			 */
			(void) anon_set_ptr(amp->ahp, anon_index, ap,
			    ANON_SLEEP);

			ASSERT(pp->p_szc == 0);

			/*
			 * Handle pages that have been marked for migration
			 */
			if (lgrp_optimizations())
				page_migrate(seg, addr, &pp, 1);

			if (enable_mbit_wa) {
				if (rw == S_WRITE)
					hat_setmod(pp);
				else if (!hat_ismod(pp))
					prot &= ~PROT_WRITE;
			}
			/*
			 * If AS_PAGLCK is set in a_flags (via memcntl(2)
			 * with MC_LOCKAS, MCL_FUTURE) and this is a
			 * MAP_NORESERVE segment, we may need to
			 * permanently lock the page as it is being faulted
			 * for the first time. The following text applies
			 * only to MAP_NORESERVE segments:
			 *
			 * As per memcntl(2), if this segment was created
			 * after MCL_FUTURE was applied (a "future"
			 * segment), its pages must be locked.  If this
			 * segment existed at MCL_FUTURE application (a
			 * "past" segment), the interface is unclear.
			 *
			 * We decide to lock only if vpage is present:
			 *
			 * - "future" segments will have a vpage array (see
			 *    as_map), and so will be locked as required
			 *
			 * - "past" segments may not have a vpage array,
			 *    depending on whether events (such as
			 *    mprotect) have occurred. Locking if vpage
			 *    exists will preserve legacy behavior.  Not
			 *    locking if vpage is absent, will not break
			 *    the interface or legacy behavior.  Note that
			 *    allocating vpage here if it's absent requires
			 *    upgrading the segvn reader lock, the cost of
			 *    which does not seem worthwhile.
			 *
			 * Usually testing and setting VPP_ISPPLOCK and
			 * VPP_SETPPLOCK requires holding the segvn lock as
			 * writer, but in this case all readers are
			 * serializing on the anon array lock.
			 */
			if (AS_ISPGLCK(seg->s_as) && vpage != NULL &&
			    (svd->flags & MAP_NORESERVE) &&
			    !VPP_ISPPLOCK(vpage)) {
				proc_t *p = seg->s_as->a_proc;
				ASSERT(svd->type == MAP_PRIVATE);
				mutex_enter(&p->p_lock);
				if (rctl_incr_locked_mem(p, NULL, PAGESIZE,
				    1) == 0) {
					claim = VPP_PROT(vpage) & PROT_WRITE;
					if (page_pp_lock(pp, claim, 0)) {
						VPP_SETPPLOCK(vpage);
					} else {
						rctl_decr_locked_mem(p, NULL,
						    PAGESIZE, 1);
					}
				}
				mutex_exit(&p->p_lock);
			}

			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
			hat_memload(hat, addr, pp, prot, hat_flag);

			if (!(hat_flag & HAT_LOAD_LOCK))
				page_unlock(pp);

			anon_array_exit(&cookie);
			return (0);
		}
	}

	/*
	 * Obtain the page structure via anon_getpage() if it is
	 * a private copy of an object (the result of a previous
	 * copy-on-write).
	 */
	if (amp != NULL) {
		if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) {
			err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE,
			    seg, addr, rw, svd->cred);
			if (err)
				goto out;

			if (svd->type == MAP_SHARED) {
				/*
				 * If this is a shared mapping to an
				 * anon_map, then ignore the write
				 * permissions returned by anon_getpage().
				 * They apply to the private mappings
				 * of this anon_map.
				 */
				vpprot |= PROT_WRITE;
			}
			opp = anon_pl[0];
		}
	}

	/*
	 * Search the pl[] list passed in if it is from the
	 * original object (i.e., not a private copy).
	 */
	if (opp == NULL) {
		ASSERT(svd->vp != NULL);
		/*
		 * Find original page.  We must be bringing it in
		 * from the list in pl[].
		 */
		for (ppp = pl; (opp = *ppp) != NULL; ppp++) {
			if (opp == PAGE_HANDLED)
				continue;
			VERIFY(opp->p_object == &svd->vp->v_object); /* XXX */
			ASSERT(opp->p_vnode == svd->vp); /* XXX */
			if (opp->p_offset == off)
				break;
		}
		if (opp == NULL) {
			panic("segvn_faultpage not found");
		}
		*ppp = PAGE_HANDLED;
	}

	ASSERT(PAGE_LOCKED(opp));

	/*
	 * The fault is treated as a copy-on-write fault if a
	 * write occurs on a private segment and the object
	 * page (i.e., mapping) is write protected.  We assume
	 * that fatal protection checks have already been made.
	 */
	if (brkcow) {
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		cow = !(vpprot & PROT_WRITE);
	} else if (svd->tr_state == SEGVN_TR_ON) {
		/*
		 * If we are doing text replication COW on first touch.
		 */
		ASSERT(amp != NULL);
		ASSERT(svd->vp != NULL);
		ASSERT(rw != S_WRITE);
		cow = (ap == NULL);
	} else {
		cow = 0;
	}

	/*
	 * If not a copy-on-write case load the translation
	 * and return.
	 */
	if (cow == 0) {
		/*
		 * Handle pages that have been marked for migration
		 */
		if (lgrp_optimizations())
			page_migrate(seg, addr, &opp, 1);

		if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) {
			if (rw == S_WRITE)
				hat_setmod(opp);
			else if (rw != S_OTHER && !hat_ismod(opp))
				prot &= ~PROT_WRITE;
		}

		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
		    (!svd->pageprot && svd->prot == (prot & vpprot)));
		ASSERT(amp == NULL ||
		    svd->rcookie == HAT_INVALID_REGION_COOKIE);
		hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag,
		    svd->rcookie);

		if (!(hat_flag & HAT_LOAD_LOCK))
			page_unlock(opp);

		if (anon_lock) {
			anon_array_exit(&cookie);
		}
		return (0);
	}

	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);

	hat_setref(opp);

	ASSERT(amp != NULL && anon_lock);

	/*
	 * Steal the page only if it isn't a private page
	 * since stealing a private page is not worth the effort.
	 */
	if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL)
		steal = 1;

	/*
	 * Steal the original page if the following conditions are true:
	 *
	 * We are low on memory, the page is not private, page is not large,
	 * not shared, not modified, not `locked' or if we have it `locked'
	 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies
	 * that the page is not shared) and if it doesn't have any
	 * translations. page_struct_lock isn't needed to look at p_cowcnt
	 * and p_lckcnt because we first get exclusive lock on page.
	 */
	(void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);

	if (stealcow && freemem < minfree && steal && opp->p_szc == 0 &&
	    page_tryupgrade(opp) && !hat_ismod(opp) &&
	    ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) ||
	    (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 &&
	    vpage != NULL && VPP_ISPPLOCK(vpage)))) {
		/*
		 * Check if this page has other translations
		 * after unloading our translation.
		 */
		if (hat_page_is_mapped(opp)) {
			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
			hat_unload(seg->s_as->a_hat, addr, PAGESIZE,
			    HAT_UNLOAD);
		}

		/*
		 * hat_unload() might sync back someone else's recent
		 * modification, so check again.
		 */
		if (!hat_ismod(opp) && !hat_page_is_mapped(opp))
			pageflags |= STEAL_PAGE;
	}

	/*
	 * If we have a vpage pointer, see if it indicates that we have
	 * ``locked'' the page we map -- if so, tell anon_private to
	 * transfer the locking resource to the new page.
	 *
	 * See Statement at the beginning of segvn_lockop regarding
	 * the way lockcnts/cowcnts are handled during COW.
	 */
	if (vpage != NULL && VPP_ISPPLOCK(vpage))
		pageflags |= LOCK_PAGE;

	/*
	 * Allocate a private page and perform the copy.
	 * For MAP_NORESERVE reserve swap space now, unless this
	 * is a cow fault on an existing anon page in which case
	 * MAP_NORESERVE will have made advance reservations.
	 */
	if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) {
		if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) {
			atomic_add_long(&svd->swresv, ptob(1));
			atomic_add_long(&seg->s_as->a_resvsize, ptob(1));
		} else {
			page_unlock(opp);
			err = ENOMEM;
			goto out;
		}
	}
	oldap = ap;
	pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred);
	if (pp == NULL) {
		err = ENOMEM;	/* out of swap space */
		goto out;
	}

	/*
	 * If we copied away from an anonymous page, then
	 * we are one step closer to freeing up an anon slot.
	 *
	 * NOTE:  The original anon slot must be released while
	 * holding the "anon_map" lock.  This is necessary to prevent
	 * other threads from obtaining a pointer to the anon slot
	 * which may be freed if its "refcnt" is 1.
	 */
	if (oldap != NULL)
		anon_decref(oldap);

	(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);

	/*
	 * Handle pages that have been marked for migration
	 */
	if (lgrp_optimizations())
		page_migrate(seg, addr, &pp, 1);

	ASSERT(pp->p_szc == 0);

	ASSERT(!IS_VMODSORT(pp->p_vnode));
	if (enable_mbit_wa) {
		if (rw == S_WRITE)
			hat_setmod(pp);
		else if (!hat_ismod(pp))
			prot &= ~PROT_WRITE;
	}

	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
	hat_memload(hat, addr, pp, prot, hat_flag);

	if (!(hat_flag & HAT_LOAD_LOCK))
		page_unlock(pp);

	ASSERT(anon_lock);
	anon_array_exit(&cookie);
	return (0);
out:
	if (anon_lock)
		anon_array_exit(&cookie);

	if (type == F_SOFTLOCK) {
		atomic_dec_ulong((ulong_t *)&svd->softlockcnt);
	}
	return (FC_MAKE_ERR(err));
}
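
/*
 * Note that the out: path above undoes the softlockcnt increment taken at
 * entry for F_SOFTLOCK faults, so the count stays balanced on failure.
 */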
/*
 * relocate a bunch of smaller targ pages into one large repl page. all targ
 * pages must be complete pages smaller than replacement pages.
 * it's assumed that no page's szc can change since they are all PAGESIZE or
 * complete large pages locked SHARED.
 */
static void
segvn_relocate_pages(page_t **targ, page_t *replacement)
{
	page_t *pp;
	pgcnt_t repl_npgs, curnpgs;
	pgcnt_t i;
	uint_t repl_szc = replacement->p_szc;
	page_t *first_repl = replacement;
	page_t *repl;
	spgcnt_t npgs;

	VM_STAT_ADD(segvnvmstats.relocatepages[0]);

	ASSERT(repl_szc != 0);
	npgs = repl_npgs = page_get_pagecnt(repl_szc);

	i = 0;
	while (repl_npgs) {
		spgcnt_t nreloc;
		int err;
		ASSERT(replacement != NULL);
		pp = targ[i];
		ASSERT(pp->p_szc < repl_szc);
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISFREE(pp));
		curnpgs = page_get_pagecnt(pp->p_szc);
		if (curnpgs == 1) {
			VM_STAT_ADD(segvnvmstats.relocatepages[1]);
			repl = replacement;
			page_sub(&replacement, repl);
			ASSERT(PAGE_EXCL(repl));
			ASSERT(!PP_ISFREE(repl));
			ASSERT(repl->p_szc == repl_szc);
		} else {
			page_t *repl_savepp;
			int j;
			VM_STAT_ADD(segvnvmstats.relocatepages[2]);
			repl_savepp = replacement;
			for (j = 0; j < curnpgs; j++) {
				repl = replacement;
				page_sub(&replacement, repl);
				ASSERT(PAGE_EXCL(repl));
				ASSERT(!PP_ISFREE(repl));
				ASSERT(repl->p_szc == repl_szc);
				ASSERT(page_pptonum(targ[i + j]) ==
				    page_pptonum(targ[i]) + j);
			}
			repl = repl_savepp;
			ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs));
		}
		err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL);
		if (err || nreloc != curnpgs) {
			panic("segvn_relocate_pages: "
			    "page_relocate failed err=%d curnpgs=%ld "
			    "nreloc=%ld", err, curnpgs, nreloc);
		}
		ASSERT(curnpgs <= repl_npgs);
		repl_npgs -= curnpgs;
		i += curnpgs;
	}
	ASSERT(replacement == NULL);

	repl = first_repl;
	repl_npgs = npgs;
	for (i = 0; i < repl_npgs; i++) {
		ASSERT(PAGE_EXCL(repl));
		ASSERT(!PP_ISFREE(repl));
		targ[i] = repl;
		page_downgrade(targ[i]);
		repl++;
	}
}
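
/*
 * On return from segvn_relocate_pages() the targ[] array refers to the
 * constituent pages of the replacement large page, each downgraded from
 * the EXCL lock held during relocation back to a SHARED lock.
 */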
/*
 * Check if all pages in ppa array are complete smaller than szc pages and
 * their roots will still be aligned relative to their current size if the
 * entire ppa array is relocated into one szc page. If these conditions are
 * not met return 0.
 *
 * If all pages are properly aligned attempt to upgrade their locks
 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0.
 * upgrdfail was set to 0 by caller.
 *
 * Return 1 if all pages are aligned and locked exclusively.
 *
 * If all pages in ppa array happen to be physically contiguous to make one
 * szc page and all exclusive locks are successfully obtained promote the page
 * size to szc and set *pszc to szc. Return 1 with pages locked shared.
 */
static int
segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc)
{
	page_t *pp;
	pfn_t pfn;
	pgcnt_t totnpgs = page_get_pagecnt(szc);
	pfn_t first_pfn;
	int contig = 1;
	pgcnt_t i;
	pgcnt_t j;
	uint_t curszc;
	pgcnt_t curnpgs;
	int root = 0;

	ASSERT(szc > 0);

	VM_STAT_ADD(segvnvmstats.fullszcpages[0]);

	for (i = 0; i < totnpgs; i++) {
		pp = ppa[i];
		ASSERT(PAGE_SHARED(pp));
		ASSERT(!PP_ISFREE(pp));
		pfn = page_pptonum(pp);
		if (i == 0) {
			if (!IS_P2ALIGNED(pfn, totnpgs)) {
				contig = 0;
			} else {
				first_pfn = pfn;
			}
		} else if (contig && pfn != first_pfn + i) {
			contig = 0;
		}
		if (pp->p_szc == 0) {
			if (root) {
				VM_STAT_ADD(segvnvmstats.fullszcpages[1]);
				return (0);
			}
		} else if (!root) {
			if ((curszc = pp->p_szc) >= szc) {
				VM_STAT_ADD(segvnvmstats.fullszcpages[2]);
				return (0);
			}
			if (curszc == 0) {
				/*
				 * p_szc changed means we don't have all pages
				 * locked. return failure.
				 */
				VM_STAT_ADD(segvnvmstats.fullszcpages[3]);
				return (0);
			}
			curnpgs = page_get_pagecnt(curszc);
			if (!IS_P2ALIGNED(pfn, curnpgs) ||
			    !IS_P2ALIGNED(i, curnpgs)) {
				VM_STAT_ADD(segvnvmstats.fullszcpages[4]);
				return (0);
			}
			root = 1;
		} else {
			ASSERT(i > 0);
			VM_STAT_ADD(segvnvmstats.fullszcpages[5]);
			if (pp->p_szc != curszc) {
				VM_STAT_ADD(segvnvmstats.fullszcpages[6]);
				return (0);
			}
			if (pfn - 1 != page_pptonum(ppa[i - 1])) {
				panic("segvn_full_szcpages: "
				    "large page not physically contiguous");
			}
			if (P2PHASE(pfn, curnpgs) == curnpgs - 1) {
				root = 0;
			}
		}
	}

	for (i = 0; i < totnpgs; i++) {
		ASSERT(ppa[i]->p_szc < szc);
		if (!page_tryupgrade(ppa[i])) {
			for (j = 0; j < i; j++) {
				page_downgrade(ppa[j]);
			}
			*pszc = ppa[i]->p_szc;
			*upgrdfail = 1;
			VM_STAT_ADD(segvnvmstats.fullszcpages[7]);
			return (0);
		}
	}

	/*
	 * When a page is put on a free cachelist its szc is set to 0.  if
	 * file system reclaimed pages from cachelist targ pages will be
	 * physically contiguous with 0 p_szc.  in this case just upgrade szc
	 * of targ pages without any relocations.
	 * To avoid any hat issues with previous small mappings
	 * hat_pageunload() the target pages first.
	 */
	if (contig) {
		VM_STAT_ADD(segvnvmstats.fullszcpages[8]);
		for (i = 0; i < totnpgs; i++) {
			(void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD);
		}
		for (i = 0; i < totnpgs; i++) {
			ppa[i]->p_szc = szc;
		}
		for (i = 0; i < totnpgs; i++) {
			ASSERT(PAGE_EXCL(ppa[i]));
			page_downgrade(ppa[i]);
		}
		if (pszc != NULL) {
			*pszc = szc;
		}
	}
	VM_STAT_ADD(segvnvmstats.fullszcpages[9]);
	return (1);
}
/*
 * Create physically contiguous pages for [vp, off] - [vp, off +
 * page_size(szc)) range and for private segment return them in ppa array.
 * Pages are created either via IO or relocations.
 *
 * Return 1 on success and 0 on failure.
 *
 * If physically contiguous pages already exist for this range return 1 without
 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa
 * array wasn't filled. In this case caller fills ppa array via fop_getpage().
 */
static int
segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, uoff_t off,
    uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc,
    int *downsize)
{
	page_t *pplist = *ppplist;
	size_t pgsz = page_get_pagesize(szc);
	pgcnt_t pages = btop(pgsz);
	ulong_t start_off = off;
	uoff_t eoff = off + pgsz;
	spgcnt_t nreloc;
	uoff_t io_off = off;
	size_t io_len;
	page_t *io_pplist = NULL;
	page_t *done_pplist = NULL;
	pgcnt_t pgidx = 0;
	page_t *pp;
	page_t *newpp;
	page_t *targpp;
	int io_err = 0;
	int i;
	pfn_t pfn;
	ulong_t ppages;
	page_t *targ_pplist = NULL;
	page_t *repl_pplist = NULL;
	page_t *tmp_pplist;
	int nios = 0;
	uint_t pszc;
	struct vattr va;

	VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]);

	ASSERT(szc != 0);
	ASSERT(pplist->p_szc == szc);

	/*
	 * downsize will be set to 1 only if we fail to lock pages. this will
	 * allow subsequent faults to try to relocate the page again. If we
	 * fail due to misalignment don't downsize and let the caller map the
	 * whole region with small mappings to avoid more faults into the area
	 * where we can't get large pages anyway.
	 */
	*downsize = 0;

	while (off < eoff) {
		newpp = pplist;
		ASSERT(newpp != NULL);
		ASSERT(PAGE_EXCL(newpp));
		ASSERT(!PP_ISFREE(newpp));
		/*
		 * we pass NULL for nrelocp to page_lookup_create()
		 * so that it doesn't relocate. We relocate here
		 * later only after we make sure we can lock all
		 * pages in the range we handle and they are all
		 * aligned.
		 */
		pp = page_lookup_create(&vp->v_object, off, SE_SHARED, newpp,
		    NULL, 0);
		ASSERT(pp != NULL);
		ASSERT(!PP_ISFREE(pp));
		VERIFY(pp->p_object == &vp->v_object);
		ASSERT(pp->p_vnode == vp);
		ASSERT(pp->p_offset == off);
		if (pp == newpp) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]);
			page_sub(&pplist, pp);
			ASSERT(PAGE_EXCL(pp));
			ASSERT(page_iolock_assert(pp));
			page_list_concat(&io_pplist, &pp);
			off += PAGESIZE;
			continue;
		}
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]);
		pfn = page_pptonum(pp);
		pszc = pp->p_szc;
		if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL &&
		    IS_P2ALIGNED(pfn, pages)) {
			ASSERT(repl_pplist == NULL);
			ASSERT(done_pplist == NULL);
			ASSERT(pplist == *ppplist);
			page_unlock(pp);
			page_free_replacement_page(pplist);
			page_create_putback(pages);
			*ppplist = NULL;
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]);
			return (1);
		}
		if (pszc >= szc) {
			page_unlock(pp);
			segvn_faultvnmpss_align_err1++;
			goto out;
		}
		ppages = page_get_pagecnt(pszc);
		if (!IS_P2ALIGNED(pfn, ppages)) {
			ASSERT(pszc != 0);
			/*
			 * sizing down to pszc won't help.
			 */
			page_unlock(pp);
			segvn_faultvnmpss_align_err2++;
			goto out;
		}
		pfn = page_pptonum(newpp);
		if (!IS_P2ALIGNED(pfn, ppages)) {
			ASSERT(pszc != 0);
			/*
			 * sizing down to pszc won't help.
			 */
			page_unlock(pp);
			segvn_faultvnmpss_align_err3++;
			goto out;
		}
		if (!PAGE_EXCL(pp)) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]);
			page_unlock(pp);
			*downsize = 1;
			*ret_pszc = pp->p_szc;
			goto out;
		}
		targpp = pp;
		if (io_pplist != NULL) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]);
			io_len = off - io_off;
			/*
			 * Some file systems like NFS don't check EOF
			 * conditions in fop_pageio(). Check it here
			 * now that pages are locked SE_EXCL. Any file
			 * truncation will wait until the pages are
			 * unlocked so no need to worry that file will
			 * be truncated after we check its size here.
			 * XXX fix NFS to remove this check.
			 */
			va.va_mask = VATTR_SIZE;
			if (fop_getattr(vp, &va, ATTR_HINT, svd->cred, NULL)) {
				VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]);
				page_unlock(targpp);
				goto out;
			}
			if (btopr(va.va_size) < btopr(io_off + io_len)) {
				VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]);
				*downsize = 1;
				*ret_pszc = 0;
				page_unlock(targpp);
				goto out;
			}
			io_err = fop_pageio(vp, io_pplist, io_off, io_len,
			    B_READ, svd->cred, NULL);
			if (io_err) {
				VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]);
				page_unlock(targpp);
				if (io_err == EDEADLK) {
					segvn_vmpss_pageio_deadlk_err++;
				}
				goto out;
			}
			nios++;
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]);
			while (io_pplist != NULL) {
				pp = io_pplist;
				page_sub(&io_pplist, pp);
				ASSERT(page_iolock_assert(pp));
				page_io_unlock(pp);
				pgidx = (pp->p_offset - start_off) >>
				    PAGESHIFT;
				ASSERT(pgidx < pages);
				ppa[pgidx] = pp;
				page_list_concat(&done_pplist, &pp);
			}
		}
		pp = targpp;
		ASSERT(PAGE_EXCL(pp));
		ASSERT(pp->p_szc <= pszc);
		if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]);
			page_unlock(pp);
			*downsize = 1;
			*ret_pszc = pp->p_szc;
			goto out;
		}
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]);
		/*
		 * page szc could have changed before the entire group was
		 * locked. reread page szc.
		 */
		pszc = pp->p_szc;
		ppages = page_get_pagecnt(pszc);

		/* link just the roots */
		page_list_concat(&targ_pplist, &pp);
		page_sub(&pplist, newpp);
		page_list_concat(&repl_pplist, &newpp);
		off += PAGESIZE;
		while (--ppages != 0) {
			newpp = pplist;
			page_sub(&pplist, newpp);
			off += PAGESIZE;
		}
		io_off = off;
	}
	if (io_pplist != NULL) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]);
		io_len = eoff - io_off;
		va.va_mask = VATTR_SIZE;
		if (fop_getattr(vp, &va, ATTR_HINT, svd->cred, NULL) != 0) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]);
			goto out;
		}
		if (btopr(va.va_size) < btopr(io_off + io_len)) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]);
			*downsize = 1;
			*ret_pszc = 0;
			goto out;
		}
		io_err = fop_pageio(vp, io_pplist, io_off, io_len,
		    B_READ, svd->cred, NULL);
		if (io_err) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]);
			if (io_err == EDEADLK) {
				segvn_vmpss_pageio_deadlk_err++;
			}
			goto out;
		}
		nios++;
		while (io_pplist != NULL) {
			pp = io_pplist;
			page_sub(&io_pplist, pp);
			ASSERT(page_iolock_assert(pp));
			page_io_unlock(pp);
			pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
			ASSERT(pgidx < pages);
			ppa[pgidx] = pp;
		}
	}
	/*
	 * we're now bound to succeed or panic.
	 * remove pages from done_pplist. it's not needed anymore.
	 */
	while (done_pplist != NULL) {
		pp = done_pplist;
		page_sub(&done_pplist, pp);
	}
	VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]);
	ASSERT(pplist == NULL);
	*ppplist = NULL;
	while (targ_pplist != NULL) {
		int ret;
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]);
		ASSERT(repl_pplist);
		pp = targ_pplist;
		page_sub(&targ_pplist, pp);
		pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
		newpp = repl_pplist;
		page_sub(&repl_pplist, newpp);
#ifdef DEBUG
		pfn = page_pptonum(pp);
		pszc = pp->p_szc;
		ppages = page_get_pagecnt(pszc);
		ASSERT(IS_P2ALIGNED(pfn, ppages));
		pfn = page_pptonum(newpp);
		ASSERT(IS_P2ALIGNED(pfn, ppages));
		ASSERT(P2PHASE(pfn, pages) == pgidx);
#endif
		nreloc = 0;
		ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL);
		if (ret != 0 || nreloc == 0) {
			panic("segvn_fill_vp_pages: "
			    "page_relocate failed");
		}
		pp = newpp;
		while (nreloc-- != 0) {
			ASSERT(PAGE_EXCL(pp));
			VERIFY(pp->p_object == &vp->v_object);
			ASSERT(pp->p_vnode == vp);
			ASSERT(pgidx ==
			    ((pp->p_offset - start_off) >> PAGESHIFT));
			ppa[pgidx++] = pp;
			pp++;
		}
	}

	if (svd->type == MAP_PRIVATE) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]);
		for (i = 0; i < pages; i++) {
			ASSERT(ppa[i] != NULL);
			ASSERT(PAGE_EXCL(ppa[i]));
			VERIFY(ppa[i]->p_object == &vp->v_object);
			ASSERT(ppa[i]->p_vnode == vp);
			ASSERT(ppa[i]->p_offset ==
			    start_off + (i << PAGESHIFT));
			page_downgrade(ppa[i]);
		}
	} else {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]);
		/*
		 * the caller will still call fop_getpage() for shared segments
		 * to check FS write permissions. For private segments we map
		 * file read only anyway.  so no fop_getpage is needed.
		 */
		for (i = 0; i < pages; i++) {
			ASSERT(ppa[i] != NULL);
			ASSERT(PAGE_EXCL(ppa[i]));
			VERIFY(ppa[i]->p_object == &vp->v_object);
			ASSERT(ppa[i]->p_vnode == vp);
			ASSERT(ppa[i]->p_offset ==
			    start_off + (i << PAGESHIFT));
			page_unlock(ppa[i]);
		}
		ppa[0] = NULL;
	}

	return (1);
out:
	/*
	 * Do the cleanup. Unlock target pages we didn't relocate. They are
	 * linked on targ_pplist by root pages. reassemble unused replacement
	 * and io pages back to pplist.
	 */
	if (io_pplist != NULL) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]);
		pp = io_pplist;
		do {
			VERIFY(pp->p_object == &vp->v_object);
			ASSERT(pp->p_vnode == vp);
			ASSERT(pp->p_offset == io_off);
			ASSERT(page_iolock_assert(pp));
			page_io_unlock(pp);
			page_hashout(pp, false);
			io_off += PAGESIZE;
		} while ((pp = pp->p_next) != io_pplist);
		page_list_concat(&io_pplist, &pplist);
		pplist = io_pplist;
	}
	tmp_pplist = NULL;
	while (targ_pplist != NULL) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]);
		pp = targ_pplist;
		ASSERT(PAGE_EXCL(pp));
		page_sub(&targ_pplist, pp);

		pszc = pp->p_szc;
		ppages = page_get_pagecnt(pszc);
		ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));

		if (pszc != 0) {
			group_page_unlock(pp);
		}
		page_unlock(pp);

		pp = repl_pplist;
		ASSERT(pp != NULL);
		ASSERT(PAGE_EXCL(pp));
		ASSERT(pp->p_szc == szc);
		page_sub(&repl_pplist, pp);

		ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));

		/* relink replacement page */
		page_list_concat(&tmp_pplist, &pp);
		while (--ppages != 0) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]);
			pp++;
			ASSERT(PAGE_EXCL(pp));
			ASSERT(pp->p_szc == szc);
			page_list_concat(&tmp_pplist, &pp);
		}
	}
	if (tmp_pplist != NULL) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]);
		page_list_concat(&tmp_pplist, &pplist);
		pplist = tmp_pplist;
	}
	/*
	 * at this point all pages are either on done_pplist or
	 * pplist. They can't be all on done_pplist otherwise
	 * we'd've been done.
	 */
	ASSERT(pplist != NULL);
	if (nios != 0) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]);
		pp = pplist;
		do {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]);
			ASSERT(pp->p_szc == szc);
			ASSERT(PAGE_EXCL(pp));
			VERIFY(pp->p_object != &vp->v_object);
			ASSERT(pp->p_vnode != vp);
			pp->p_szc = 0;
		} while ((pp = pp->p_next) != pplist);

		pp = done_pplist;
		do {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]);
			ASSERT(pp->p_szc == szc);
			ASSERT(PAGE_EXCL(pp));
			VERIFY(pp->p_object == &vp->v_object);
			ASSERT(pp->p_vnode == vp);
			pp->p_szc = 0;
		} while ((pp = pp->p_next) != done_pplist);

		while (pplist != NULL) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]);
			pp = pplist;
			page_sub(&pplist, pp);
			page_free(pp, 0);
		}

		while (done_pplist != NULL) {
			VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]);
			pp = done_pplist;
			page_sub(&done_pplist, pp);
			page_unlock(pp);
		}
		*ppplist = NULL;
		return (0);
	}
	ASSERT(pplist == *ppplist);
	if (io_err) {
		VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]);
		/*
		 * don't downsize on io error.
		 * see if vop_getpage succeeds.
		 * pplist may still be used in this case
		 * for relocations.
		 */
		return (0);
	}
	VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]);
	page_free_replacement_page(pplist);
	page_create_putback(pages);
	*ppplist = NULL;
	return (0);
}
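
/*
 * In the failure path above *downsize is set only when page locks could
 * not be obtained (a transient condition worth retrying at a smaller
 * size); alignment failures deliberately leave it clear so the caller
 * maps the range with small pages instead of faulting here again.
 */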
int segvn_anypgsz = 0;

#define	SEGVN_RESTORE_SOFTLOCK_VP(type, pages)				\
	if ((type) == F_SOFTLOCK) {					\
		atomic_add_long((ulong_t *)&(svd)->softlockcnt,		\
		    -(pages));						\
	}

#define	SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot)		\
	if (IS_VMODSORT((ppa)[0]->p_vnode)) {				\
		if ((rw) == S_WRITE) {					\
			for (i = 0; i < (pages); i++) {			\
				VERIFY((ppa)[i]->p_object ==		\
				    (ppa)[0]->p_object);		\
				ASSERT((ppa)[i]->p_vnode ==		\
				    (ppa)[0]->p_vnode);			\
				hat_setmod((ppa)[i]);			\
			}						\
		} else if ((rw) != S_OTHER &&				\
		    ((prot) & (vpprot) & PROT_WRITE)) {			\
			for (i = 0; i < (pages); i++) {			\
				VERIFY((ppa)[i]->p_object ==		\
				    (ppa)[0]->p_object);		\
				ASSERT((ppa)[i]->p_vnode ==		\
				    (ppa)[0]->p_vnode);			\
				if (!hat_ismod((ppa)[i])) {		\
					prot &= ~PROT_WRITE;		\
					break;				\
				}					\
			}						\
		}							\
	}

#define	SEGVN_VMSTAT_FLTVNPAGES(idx)					\
	VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)])
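
/*
 * SEGVN_UPDATE_MODBITS applies to a whole page array the same
 * IS_VMODSORT mod-bit bookkeeping that segvn_faultpage() performs on a
 * single page: writes mark every page modified, while a would-be
 * writable read mapping loses PROT_WRITE if any page is still unmodified.
 */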
3797 segvn_fault_vnodepages(struct hat
*hat
, struct seg
*seg
, caddr_t lpgaddr
,
3798 caddr_t lpgeaddr
, enum fault_type type
, enum seg_rw rw
, caddr_t addr
,
3799 caddr_t eaddr
, int brkcow
)
3801 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
3802 struct anon_map
*amp
= svd
->amp
;
3803 uchar_t segtype
= svd
->type
;
3804 uint_t szc
= seg
->s_szc
;
3805 size_t pgsz
= page_get_pagesize(szc
);
3806 size_t maxpgsz
= pgsz
;
3807 pgcnt_t pages
= btop(pgsz
);
3808 pgcnt_t maxpages
= pages
;
3809 size_t ppasize
= (pages
+ 1) * sizeof (page_t
*);
3810 caddr_t a
= lpgaddr
;
3811 caddr_t maxlpgeaddr
= lpgeaddr
;
3812 uoff_t off
= svd
->offset
+ (uintptr_t)(a
- seg
->s_base
);
3813 ulong_t aindx
= svd
->anon_index
+ seg_page(seg
, a
);
3814 struct vpage
*vpage
= (svd
->vpage
!= NULL
) ?
3815 &svd
->vpage
[seg_page(seg
, a
)] : NULL
;
3816 vnode_t
*vp
= svd
->vp
;
3821 faultcode_t err
= 0;
3823 int vop_size_err
= 0;
3824 uint_t protchk
, prot
, vpprot
;
3826 int hat_flag
= (type
== F_SOFTLOCK
) ? HAT_LOAD_LOCK
: HAT_LOAD
;
3827 anon_sync_obj_t an_cookie
;
3829 int alloc_failed
= 0;
3836 int segvn_anypgsz_vnode
= 0; /* for now map vnode with 2 page sizes */
3837 int tron
= (svd
->tr_state
== SEGVN_TR_ON
);
3841 ASSERT(brkcow
== 0 || amp
!= NULL
);
3842 ASSERT(tron
== 0 || amp
!= NULL
);
3843 ASSERT(enable_mbit_wa
== 0); /* no mbit simulations with large pages */
3844 ASSERT(!(svd
->flags
& MAP_NORESERVE
));
3845 ASSERT(type
!= F_SOFTUNLOCK
);
3846 ASSERT(IS_P2ALIGNED(a
, maxpgsz
));
3847 ASSERT(amp
== NULL
|| IS_P2ALIGNED(aindx
, maxpages
));
3848 ASSERT(SEGVN_LOCK_HELD(seg
->s_as
, &svd
->lock
));
3849 ASSERT(seg
->s_szc
< NBBY
* sizeof (int));
3850 ASSERT(type
!= F_SOFTLOCK
|| lpgeaddr
- a
== maxpgsz
);
3851 ASSERT(svd
->tr_state
!= SEGVN_TR_INIT
);
3853 VM_STAT_COND_ADD(type
== F_SOFTLOCK
, segvnvmstats
.fltvnpages
[0]);
3854 VM_STAT_COND_ADD(type
!= F_SOFTLOCK
, segvnvmstats
.fltvnpages
[1]);
3856 if (svd
->flags
& MAP_TEXT
) {
3857 hat_flag
|= HAT_LOAD_TEXT
;
3860 if (svd
->pageprot
) {
3863 protchk
= PROT_READ
;
3866 protchk
= PROT_WRITE
;
3869 protchk
= PROT_EXEC
;
3873 protchk
= PROT_READ
| PROT_WRITE
| PROT_EXEC
;
3878 /* caller has already done segment level protection check. */
3881 if (rw
== S_WRITE
&& segtype
== MAP_PRIVATE
) {
3882 SEGVN_VMSTAT_FLTVNPAGES(2);
3888 ppa
= kmem_alloc(ppasize
, KM_SLEEP
);
3890 VM_STAT_COND_ADD(amp
!= NULL
, segvnvmstats
.fltvnpages
[3]);
3894 for (; a
< lpgeaddr
; a
+= pgsz
, off
+= pgsz
, aindx
+= pages
) {
3896 while (szc
< seg
->s_szc
) {
3899 tszc
= segvn_anypgsz_vnode
? szc
+ 1 :
3901 ppgsz
= page_get_pagesize(tszc
);
3902 if (!IS_P2ALIGNED(a
, ppgsz
) ||
3903 ((alloc_failed
>> tszc
) & 0x1)) {
3906 SEGVN_VMSTAT_FLTVNPAGES(4);
3910 e
= P2ROUNDUP((uintptr_t)eaddr
, pgsz
);
3911 lpgeaddr
= (caddr_t
)e
;
3916 if (IS_P2ALIGNED(a
, maxpgsz
) && amp
!= NULL
) {
3917 ASSERT(IS_P2ALIGNED(aindx
, maxpages
));
3918 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
3919 anon_array_enter(amp
, aindx
, &an_cookie
);
3920 if (anon_get_ptr(amp
->ahp
, aindx
) != NULL
) {
3921 SEGVN_VMSTAT_FLTVNPAGES(5);
3922 ASSERT(anon_pages(amp
->ahp
, aindx
,
3923 maxpages
) == maxpages
);
3924 anon_array_exit(&an_cookie
);
3925 ANON_LOCK_EXIT(&
->a_rwlock
);
3926 err
= segvn_fault_anonpages(hat
, seg
,
3927 a
, a
+ maxpgsz
, type
, rw
,
3929 MIN(a
+ maxpgsz
, eaddr
), brkcow
);
3931 SEGVN_VMSTAT_FLTVNPAGES(6);
3934 if (szc
< seg
->s_szc
) {
3938 lpgeaddr
= maxlpgeaddr
;
3942 ASSERT(anon_pages(amp
->ahp
, aindx
,
3944 SEGVN_VMSTAT_FLTVNPAGES(7);
3945 anon_array_exit(&an_cookie
);
3946 ANON_LOCK_EXIT(&
->a_rwlock
);
3949 ASSERT(!brkcow
|| IS_P2ALIGNED(a
, maxpgsz
));
3950 ASSERT(!tron
|| IS_P2ALIGNED(a
, maxpgsz
));
3952 if (svd
->pageprot
!= 0 && IS_P2ALIGNED(a
, maxpgsz
)) {
3953 ASSERT(vpage
!= NULL
);
3954 prot
= VPP_PROT(vpage
);
3955 ASSERT(sameprot(seg
, a
, maxpgsz
));
3956 if ((prot
& protchk
) == 0) {
3957 SEGVN_VMSTAT_FLTVNPAGES(8);
3962 if (type
== F_SOFTLOCK
) {
3963 atomic_add_long((ulong_t
*)&svd
->softlockcnt
,
3970 if (!brkcow
&& !tron
&& szc
&&
3971 !page_exists_physcontig(&vp
->v_object
, off
, szc
,
3972 segtype
== MAP_PRIVATE
? ppa
: NULL
)) {
3973 SEGVN_VMSTAT_FLTVNPAGES(9);
3974 if (page_alloc_pages(&vp
->v_object
, seg
, a
,
3975 &pplist
, NULL
, szc
, 0, 0) &&
3976 type
!= F_SOFTLOCK
) {
3977 SEGVN_VMSTAT_FLTVNPAGES(10);
3980 alloc_failed
|= (1 << szc
);
3983 if (pplist
!= NULL
&&
3984 vp
->v_mpssdata
== SEGVN_PAGEIO
) {
3986 SEGVN_VMSTAT_FLTVNPAGES(11);
3987 physcontig
= segvn_fill_vp_pages(svd
,
3988 vp
, off
, szc
, ppa
, &pplist
,
3990 ASSERT(!physcontig
|| pplist
== NULL
);
3991 if (!physcontig
&& downsize
&&
3992 type
!= F_SOFTLOCK
) {
3993 ASSERT(pplist
== NULL
);
3994 SEGVN_VMSTAT_FLTVNPAGES(12);
3998 ASSERT(!physcontig
||
3999 segtype
== MAP_PRIVATE
||
4001 if (physcontig
&& ppa
[0] == NULL
) {
4005 } else if (!brkcow
&& !tron
&& szc
&& ppa
[0] != NULL
) {
4006 SEGVN_VMSTAT_FLTVNPAGES(13);
4007 ASSERT(segtype
== MAP_PRIVATE
);
4012 SEGVN_VMSTAT_FLTVNPAGES(14);
4014 ierr
= fop_getpage(vp
, (offset_t
)off
, pgsz
,
4015 &vpprot
, ppa
, pgsz
, seg
, a
, arw
,
4019 for (i
= 0; i
< pages
; i
++) {
4020 ASSERT(PAGE_LOCKED(ppa
[i
]));
4021 ASSERT(!PP_ISFREE(ppa
[i
]));
4022 VERIFY(ppa
[i
]->p_object
==
4024 ASSERT(ppa
[i
]->p_vnode
== vp
);
4025 ASSERT(ppa
[i
]->p_offset
==
4026 off
+ (i
<< PAGESHIFT
));
4030 if (segtype
== MAP_PRIVATE
) {
4031 SEGVN_VMSTAT_FLTVNPAGES(15);
4032 vpprot
&= ~PROT_WRITE
;
4035 ASSERT(segtype
== MAP_PRIVATE
);
4036 SEGVN_VMSTAT_FLTVNPAGES(16);
4037 vpprot
= PROT_ALL
& ~PROT_WRITE
;
4042 SEGVN_VMSTAT_FLTVNPAGES(17);
4043 if (pplist
!= NULL
) {
4044 SEGVN_VMSTAT_FLTVNPAGES(18);
4045 page_free_replacement_page(pplist
);
4046 page_create_putback(pages
);
4048 SEGVN_RESTORE_SOFTLOCK_VP(type
, pages
);
4049 if (a
+ pgsz
<= eaddr
) {
4050 SEGVN_VMSTAT_FLTVNPAGES(19);
4051 err
= FC_MAKE_ERR(ierr
);
4054 va
.va_mask
= VATTR_SIZE
;
4055 if (fop_getattr(vp
, &va
, 0, svd
->cred
, NULL
)) {
4056 SEGVN_VMSTAT_FLTVNPAGES(20);
4057 err
= FC_MAKE_ERR(EIO
);
4060 if (btopr(va
.va_size
) >= btopr(off
+ pgsz
)) {
4061 SEGVN_VMSTAT_FLTVNPAGES(21);
4062 err
= FC_MAKE_ERR(ierr
);
4065 if (btopr(va
.va_size
) <
4066 btopr(off
+ (eaddr
- a
))) {
4067 SEGVN_VMSTAT_FLTVNPAGES(22);
4068 err
= FC_MAKE_ERR(ierr
);
4071 if (brkcow
|| tron
|| type
== F_SOFTLOCK
) {
4072 /* can't reduce map area */
4073 SEGVN_VMSTAT_FLTVNPAGES(23);
4077 SEGVN_VMSTAT_FLTVNPAGES(24);
4085 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
4086 anon_array_enter(amp
, aindx
, &an_cookie
);
4089 anon_get_ptr(amp
->ahp
, aindx
) != NULL
) {
4090 ulong_t taindx
= P2ALIGN(aindx
, maxpages
);
4092 SEGVN_VMSTAT_FLTVNPAGES(25);
4093 ASSERT(anon_pages(amp
->ahp
, taindx
,
4094 maxpages
) == maxpages
);
4095 for (i
= 0; i
< pages
; i
++) {
4096 page_unlock(ppa
[i
]);
4098 anon_array_exit(&an_cookie
);
4099 ANON_LOCK_EXIT(&
->a_rwlock
);
4100 if (pplist
!= NULL
) {
4101 page_free_replacement_page(pplist
);
4102 page_create_putback(pages
);
4104 SEGVN_RESTORE_SOFTLOCK_VP(type
, pages
);
4105 if (szc
< seg
->s_szc
) {
4106 SEGVN_VMSTAT_FLTVNPAGES(26);
4108 * For private segments SOFTLOCK
4109 * either always breaks cow (any rw
4110 * type except S_READ_NOCOW) or
4111 * address space is locked as writer
4112 * (S_READ_NOCOW case) and anon slots
4113 * can't show up on second check.
4114 * Therefore if we are here for
4115 * SOFTLOCK case it must be a cow
4116 * break but cow break never reduces
4117 * szc. text replication (tron) in
4118 * this case works as cow break.
4119 * Thus the assert below.
4121 ASSERT(!brkcow
&& !tron
&&
4122 type
!= F_SOFTLOCK
);
4127 ASSERT(IS_P2ALIGNED(a
, maxpgsz
));
4132 ulong_t taindx
= P2ALIGN(aindx
, maxpages
);
4133 ASSERT(!anon_pages(amp
->ahp
, taindx
, maxpages
));
4137 if (brkcow
|| tron
) {
4138 ASSERT(amp
!= NULL
);
4139 ASSERT(pplist
== NULL
);
4140 ASSERT(szc
== seg
->s_szc
);
4141 ASSERT(IS_P2ALIGNED(a
, maxpgsz
));
4142 ASSERT(IS_P2ALIGNED(aindx
, maxpages
));
4143 SEGVN_VMSTAT_FLTVNPAGES(27);
4144 ierr
= anon_map_privatepages(amp
, aindx
, szc
,
4145 seg
, a
, prot
, ppa
, vpage
, segvn_anypgsz
,
4146 tron
? PG_LOCAL
: 0, svd
->cred
);
4148 SEGVN_VMSTAT_FLTVNPAGES(28);
4149 anon_array_exit(&an_cookie
);
4150 ANON_LOCK_EXIT(&
->a_rwlock
);
4151 SEGVN_RESTORE_SOFTLOCK_VP(type
, pages
);
4152 err
= FC_MAKE_ERR(ierr
);
4156 ASSERT(!IS_VMODSORT(ppa
[0]->p_vnode
));
4158 * p_szc can't be changed for locked
4161 ASSERT(svd
->rcookie
==
4162 HAT_INVALID_REGION_COOKIE
);
4163 hat_memload_array(hat
, a
, pgsz
, ppa
, prot
,
4166 if (!(hat_flag
& HAT_LOAD_LOCK
)) {
4167 SEGVN_VMSTAT_FLTVNPAGES(29);
4168 for (i
= 0; i
< pages
; i
++) {
4169 page_unlock(ppa
[i
]);
4172 anon_array_exit(&an_cookie
);
4173 ANON_LOCK_EXIT(&
->a_rwlock
);
4177 ASSERT(svd
->rcookie
== HAT_INVALID_REGION_COOKIE
||
4178 (!svd
->pageprot
&& svd
->prot
== (prot
& vpprot
)));
4180 pfn
= page_pptonum(ppa
[0]);
4182 * hat_page_demote() needs an SE_EXCL lock on one of
4183 * constituent page_t's and it decreases root's p_szc
4184 * last. This means if root's p_szc is equal szc and
4185 * all its constituent pages are locked
4186 * hat_page_demote() that could have changed p_szc to
4187 * szc is already done and no new have page_demote()
4188 * can start for this large page.
4192 * we need to make sure same mapping size is used for
4193 * the same address range if there's a possibility the
4194 * adddress is already mapped because hat layer panics
4195 * when translation is loaded for the range already
4196 * mapped with a different page size. We achieve it
4197 * by always using largest page size possible subject
4198 * to the constraints of page size, segment page size
4199 * and page alignment. Since mappings are invalidated
4200 * when those constraints change and make it
4201 * impossible to use previously used mapping size no
4202 * mapping size conflicts should happen.
4206 if ((pszc
= ppa
[0]->p_szc
) == szc
&&
4207 IS_P2ALIGNED(pfn
, pages
)) {
4209 SEGVN_VMSTAT_FLTVNPAGES(30);
4211 for (i
= 0; i
< pages
; i
++) {
4212 ASSERT(PAGE_LOCKED(ppa
[i
]));
4213 ASSERT(!PP_ISFREE(ppa
[i
]));
4214 ASSERT(page_pptonum(ppa
[i
]) ==
4216 ASSERT(ppa
[i
]->p_szc
== szc
);
4217 VERIFY(ppa
[i
]->p_object
== &vp
->v_object
);
4218 ASSERT(ppa
[i
]->p_vnode
== vp
);
4219 ASSERT(ppa
[i
]->p_offset
==
4220 off
+ (i
<< PAGESHIFT
));
4224 * All pages are of szc we need and they are
4225 * all locked so they can't change szc. load
4228 * if page got promoted since last check
4229 * we don't need pplist.
4231 if (pplist
!= NULL
) {
4232 page_free_replacement_page(pplist
);
4233 page_create_putback(pages
);
4235 if (PP_ISMIGRATE(ppa
[0])) {
4236 page_migrate(seg
, a
, ppa
, pages
);
4238 SEGVN_UPDATE_MODBITS(ppa
, pages
, rw
,
4240 hat_memload_array_region(hat
, a
, pgsz
,
4241 ppa
, prot
& vpprot
, hat_flag
,
4244 if (!(hat_flag
& HAT_LOAD_LOCK
)) {
4245 for (i
= 0; i
< pages
; i
++) {
4246 page_unlock(ppa
[i
]);
4250 anon_array_exit(&an_cookie
);
4251 ANON_LOCK_EXIT(&
->a_rwlock
);
4257 * See if upsize is possible.
4259 if (pszc
> szc
&& szc
< seg
->s_szc
&&
4260 (segvn_anypgsz_vnode
|| pszc
>= seg
->s_szc
)) {
4262 uint_t pszc1
= MIN(pszc
, seg
->s_szc
);
4263 ppgsz
= page_get_pagesize(pszc1
);
4264 ppages
= btop(ppgsz
);
4265 aphase
= btop(P2PHASE((uintptr_t)a
, ppgsz
));
4267 ASSERT(type
!= F_SOFTLOCK
);
4269 SEGVN_VMSTAT_FLTVNPAGES(31);
4270 if (aphase
!= P2PHASE(pfn
, ppages
)) {
4271 segvn_faultvnmpss_align_err4
++;
4273 SEGVN_VMSTAT_FLTVNPAGES(32);
4274 if (pplist
!= NULL
) {
4275 page_t
*pl
= pplist
;
4276 page_free_replacement_page(pl
);
4277 page_create_putback(pages
);
4279 for (i
= 0; i
< pages
; i
++) {
4280 page_unlock(ppa
[i
]);
4283 anon_array_exit(&an_cookie
);
4284 ANON_LOCK_EXIT(&
->a_rwlock
);
4293 * check if we should use smallest mapping size.
4298 !IS_P2ALIGNED(pfn
, pages
)) ||
4300 !segvn_full_szcpages(ppa
, szc
, &upgrdfail
,
4303 if (upgrdfail
&& type
!= F_SOFTLOCK
) {
4305 * segvn_full_szcpages failed to lock
4306 * all pages EXCL. Size down.
4310 SEGVN_VMSTAT_FLTVNPAGES(33);
4312 if (pplist
!= NULL
) {
4313 page_t
*pl
= pplist
;
4314 page_free_replacement_page(pl
);
4315 page_create_putback(pages
);
4318 for (i
= 0; i
< pages
; i
++) {
4319 page_unlock(ppa
[i
]);
4322 anon_array_exit(&an_cookie
);
4323 ANON_LOCK_EXIT(&
->a_rwlock
);
4328 if (szc
!= 0 && !upgrdfail
) {
4329 segvn_faultvnmpss_align_err5
++;
4331 SEGVN_VMSTAT_FLTVNPAGES(34);
4332 if (pplist
!= NULL
) {
4333 page_free_replacement_page(pplist
);
4334 page_create_putback(pages
);
4336 SEGVN_UPDATE_MODBITS(ppa
, pages
, rw
,
4338 if (upgrdfail
&& segvn_anypgsz_vnode
) {
4340 hat_memload_array_region(hat
, a
, pgsz
,
4341 ppa
, prot
& vpprot
, hat_flag
,
4344 for (i
= 0; i
< pages
; i
++) {
4345 hat_memload_region(hat
,
4346 a
+ (i
<< PAGESHIFT
),
4347 ppa
[i
], prot
& vpprot
,
4348 hat_flag
, svd
->rcookie
);
4351 if (!(hat_flag
& HAT_LOAD_LOCK
)) {
4352 for (i
= 0; i
< pages
; i
++) {
4353 page_unlock(ppa
[i
]);
4357 anon_array_exit(&an_cookie
);
4358 ANON_LOCK_EXIT(&
->a_rwlock
);
4365 * segvn_full_szcpages() upgraded pages szc.
4367 ASSERT(pszc
== ppa
[0]->p_szc
);
4368 ASSERT(IS_P2ALIGNED(pfn
, pages
));
4374 SEGVN_VMSTAT_FLTVNPAGES(35);
4376 * p_szc of ppa[0] can change since we haven't
4377 * locked all constituent pages. Call
4378 * page_lock_szc() to prevent szc changes.
4379 * This should be a rare case that happens when
4380 * multiple segments use a different page size
4381 * to map the same file offsets.
4383 szcmtx
= page_szc_lock(ppa
[0]);
4384 pszc
= ppa
[0]->p_szc
;
4385 ASSERT(szcmtx
!= NULL
|| pszc
== 0);
4386 ASSERT(ppa
[0]->p_szc
<= pszc
);
4388 SEGVN_VMSTAT_FLTVNPAGES(36);
4389 if (szcmtx
!= NULL
) {
4394 if (pplist
!= NULL
) {
4396 * page got promoted since last check.
4397 * we don't need preaalocated large
					SEGVN_VMSTAT_FLTVNPAGES(37);
					page_free_replacement_page(pplist);
					page_create_putback(pages);
				}
				SEGVN_UPDATE_MODBITS(ppa, pages, rw,
				    prot, vpprot);
				hat_memload_array_region(hat, a, pgsz, ppa,
				    prot & vpprot, hat_flag, svd->rcookie);
				if (!(hat_flag & HAT_LOAD_LOCK)) {
					for (i = 0; i < pages; i++) {
						page_unlock(ppa[i]);
					}
				}
				if (amp != NULL) {
					anon_array_exit(&an_cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
				}
				goto next;
			}

			/*
			 * if page got demoted since last check
			 * we could have not allocated larger page.
			 * retry the same address next time.
			 */
			if (pplist == NULL &&
			    page_alloc_pages(&vp->v_object, seg, a, &pplist,
			    NULL, szc, 0, 0) && type != F_SOFTLOCK) {
				SEGVN_VMSTAT_FLTVNPAGES(38);
				for (i = 0; i < pages; i++) {
					page_unlock(ppa[i]);
				}
				if (amp != NULL) {
					anon_array_exit(&an_cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
				}
				ierr = -1;
				alloc_failed |= (1 << szc);
				break;
			}

			SEGVN_VMSTAT_FLTVNPAGES(39);

			if (pplist != NULL) {
				segvn_relocate_pages(ppa, pplist);
			} else {
				ASSERT(type == F_SOFTLOCK);
				SEGVN_VMSTAT_FLTVNPAGES(40);
			}

			SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);

			if (pplist == NULL && segvn_anypgsz_vnode == 0) {
				ASSERT(type == F_SOFTLOCK);
				for (i = 0; i < pages; i++) {
					ASSERT(ppa[i]->p_szc < szc);
					hat_memload_region(hat,
					    a + (i << PAGESHIFT),
					    ppa[i], prot & vpprot, hat_flag,
					    svd->rcookie);
				}
			} else {
				ASSERT(pplist != NULL || type == F_SOFTLOCK);
				hat_memload_array_region(hat, a, pgsz, ppa,
				    prot & vpprot, hat_flag, svd->rcookie);
			}
			if (!(hat_flag & HAT_LOAD_LOCK)) {
				for (i = 0; i < pages; i++) {
					ASSERT(PAGE_SHARED(ppa[i]));
					page_unlock(ppa[i]);
				}
			}
			if (amp != NULL) {
				anon_array_exit(&an_cookie);
				ANON_LOCK_EXIT(&amp->a_rwlock);
			}

		next:
			if (vpage != NULL) {
				vpage += pages;
			}
			adjszc_chk = 1;
		}
		if (a == lpgeaddr)
			break;
		ASSERT(a < lpgeaddr);

		ASSERT(!brkcow && !tron && type != F_SOFTLOCK);

		/*
		 * ierr == -1 means we failed to map with a large page.
		 * (either due to allocation/relocation failures or
		 * misalignment with other mappings to this file.)
		 *
		 * ierr == -2 means some other thread allocated a large page
		 * after we gave up to map with a large page.  retry with
		 * larger mapping.
		 */
		ASSERT(ierr == -1 || ierr == -2);
		ASSERT(ierr == -2 || szc != 0);
		ASSERT(ierr == -1 || szc < seg->s_szc);
		if (ierr == -2) {
			SEGVN_VMSTAT_FLTVNPAGES(41);
			ASSERT(pszc > szc && pszc <= seg->s_szc);
			szc = pszc;
		} else if (segvn_anypgsz_vnode) {
			SEGVN_VMSTAT_FLTVNPAGES(42);
			szc--;
		} else {
			SEGVN_VMSTAT_FLTVNPAGES(43);
			/*
			 * other process created pszc large page.
			 * but we still have to drop to 0 szc.
			 */
			szc = 0;
		}

		pgsz = page_get_pagesize(szc);
		pages = btop(pgsz);
		if (ierr == -2) {
			/*
			 * Size up case. Note lpgaddr may only be needed for
			 * softlock case so we don't adjust it here.
			 */
			a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
			ASSERT(a >= lpgaddr);
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
			off = svd->offset + (uintptr_t)(a - seg->s_base);
			aindx = svd->anon_index + seg_page(seg, a);
			vpage = (svd->vpage != NULL) ?
			    &svd->vpage[seg_page(seg, a)] : NULL;
		} else {
			/*
			 * Size down case. Note lpgaddr may only be needed for
			 * softlock case so we don't adjust it here.
			 */
			ASSERT(IS_P2ALIGNED(a, pgsz));
			ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
			ASSERT(a < lpgeaddr);
			if (a < addr) {
				SEGVN_VMSTAT_FLTVNPAGES(44);
				/*
				 * The beginning of the large page region can
				 * be pulled to the right to make a smaller
				 * region. We haven't yet faulted a single
				 * page.
				 */
				a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
				ASSERT(a >= lpgaddr);
				off = svd->offset +
				    (uintptr_t)(a - seg->s_base);
				aindx = svd->anon_index + seg_page(seg, a);
				vpage = (svd->vpage != NULL) ?
				    &svd->vpage[seg_page(seg, a)] : NULL;
			}
		}
	}
	kmem_free(ppa, ppasize);
	if (!err && !vop_size_err) {
		SEGVN_VMSTAT_FLTVNPAGES(45);
		return (0);
	}
	if (type == F_SOFTLOCK && a > lpgaddr) {
		SEGVN_VMSTAT_FLTVNPAGES(46);
		segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
	}
	if (!vop_size_err) {
		SEGVN_VMSTAT_FLTVNPAGES(47);
		return (err);
	}
	ASSERT(brkcow || tron || type == F_SOFTLOCK);
	/*
	 * Large page end is mapped beyond the end of file and it's a cow
	 * fault (can be a text replication induced cow) or softlock so we can't
	 * reduce the map area.  For now just demote the segment. This should
	 * really only happen if the end of the file changed after the mapping
	 * was established since when large page segments are created we make
	 * sure they don't extend beyond the end of the file.
	 */
	SEGVN_VMSTAT_FLTVNPAGES(48);

	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
	err = 0;
	if (seg->s_szc != 0) {
		segvn_fltvnpages_clrszc_cnt++;
		ASSERT(svd->softlockcnt == 0);
		err = segvn_clrszc(seg);
		if (err != 0) {
			segvn_fltvnpages_clrszc_err++;
		}
	}
	ASSERT(err || seg->s_szc == 0);
	SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock);
	/* segvn_fault will do its job as if szc had been zero to begin with */
	return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err));
}
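/*
 * Annotation added for clarity (not from the original source): the
 * IE_RETRY protocol above works as follows.  The function downgrades the
 * segment to s_szc == 0 under the writer lock, then returns IE_RETRY
 * while still holding the lock as reader.  segvn_fault() responds by
 * dropping the lock and restarting from its "top" label, so the retried
 * fault is handled by the ordinary PAGESIZE path.
 */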
/*
 * This routine will attempt to fault in one large page.
 * It will use smaller pages if that fails.
 * It should only be called for pure anonymous segments.
 */
static faultcode_t
segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
    caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
    caddr_t eaddr, int brkcow)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon_map *amp = svd->amp;
	uchar_t segtype = svd->type;
	uint_t szc = seg->s_szc;
	size_t pgsz = page_get_pagesize(szc);
	size_t maxpgsz = pgsz;
	pgcnt_t pages = btop(pgsz);
	uint_t ppaszc = szc;
	caddr_t a = lpgaddr;
	ulong_t aindx = svd->anon_index + seg_page(seg, a);
	struct vpage *vpage = (svd->vpage != NULL) ?
	    &svd->vpage[seg_page(seg, a)] : NULL;
	page_t **ppa;
	uint_t ppa_szc;
	faultcode_t err;
	int ierr;
	uint_t protchk, prot, vpprot;
	ulong_t i;
	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
	anon_sync_obj_t cookie;
	int adjszc_chk;
	int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0;

	ASSERT(amp != NULL);
	ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
	ASSERT(!(svd->flags & MAP_NORESERVE));
	ASSERT(type != F_SOFTUNLOCK);
	ASSERT(IS_P2ALIGNED(a, maxpgsz));
	ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF);
	ASSERT(svd->tr_state != SEGVN_TR_INIT);

	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));

	VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]);
	VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]);

	if (svd->flags & MAP_TEXT) {
		hat_flag |= HAT_LOAD_TEXT;
	}

	if (svd->pageprot) {
		switch (rw) {
		case S_READ:
			protchk = PROT_READ;
			break;
		case S_WRITE:
			protchk = PROT_WRITE;
			break;
		case S_EXEC:
			protchk = PROT_EXEC;
			break;
		case S_OTHER:
		default:
			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
			break;
		}
		VM_STAT_ADD(segvnvmstats.fltanpages[2]);
	} else {
		prot = svd->prot;
		/* caller has already done segment level protection check. */
	}

	ppa = kmem_cache_alloc(segvn_szc_cache[ppaszc], KM_SLEEP);
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (;;) {
		adjszc_chk = 0;
		for (; a < lpgeaddr; a += pgsz, aindx += pages) {
			if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
				VM_STAT_ADD(segvnvmstats.fltanpages[3]);
				ASSERT(vpage != NULL);
				prot = VPP_PROT(vpage);
				ASSERT(sameprot(seg, a, maxpgsz));
				if ((prot & protchk) == 0) {
					err = FC_PROT;
					goto error;
				}
			}
			if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) &&
			    pgsz < maxpgsz) {
				ASSERT(a > lpgaddr);
				szc = seg->s_szc;
				pgsz = maxpgsz;
				pages = btop(pgsz);
				ASSERT(IS_P2ALIGNED(aindx, pages));
				lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr,
				    pgsz);
			}
			if (type == F_SOFTLOCK) {
				atomic_add_long((ulong_t *)&svd->softlockcnt,
				    pages);
			}
			anon_array_enter(amp, aindx, &cookie);
			ppa_szc = (uint_t)-1;
			ierr = anon_map_getpages(amp, aindx, szc, seg, a,
			    prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow,
			    segvn_anypgsz, pgflags, svd->cred);
			if (ierr != 0) {
				anon_array_exit(&cookie);
				VM_STAT_ADD(segvnvmstats.fltanpages[4]);
				if (type == F_SOFTLOCK) {
					atomic_add_long(
					    (ulong_t *)&svd->softlockcnt,
					    -pages);
				}
				if (ierr > 0) {
					VM_STAT_ADD(segvnvmstats.fltanpages[6]);
					err = FC_MAKE_ERR(ierr);
					goto error;
				}
				break;
			}

			ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));

			ASSERT(segtype == MAP_SHARED ||
			    ppa[0]->p_szc <= szc);
			ASSERT(segtype == MAP_PRIVATE ||
			    ppa[0]->p_szc >= szc);

			/*
			 * Handle pages that have been marked for migration
			 */
			if (lgrp_optimizations())
				page_migrate(seg, a, ppa, pages);

			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);

			if (segtype == MAP_SHARED) {
				vpprot |= PROT_WRITE;
			}

			hat_memload_array(hat, a, pgsz, ppa,
			    prot & vpprot, hat_flag);

			if (hat_flag & HAT_LOAD_LOCK) {
				VM_STAT_ADD(segvnvmstats.fltanpages[7]);
			} else {
				VM_STAT_ADD(segvnvmstats.fltanpages[8]);
				for (i = 0; i < pages; i++)
					page_unlock(ppa[i]);
			}
			if (vpage != NULL)
				vpage += pages;

			anon_array_exit(&cookie);
			adjszc_chk = 1;
		}
		if (a == lpgeaddr)
			break;
		ASSERT(a < lpgeaddr);
		/*
		 * ierr == -1 means we failed to allocate a large page.
		 * so do a size down operation.
		 *
		 * ierr == -2 means some other process that privately shares
		 * pages with this process has allocated a larger page and we
		 * need to retry with larger pages. So do a size up
		 * operation. This relies on the fact that large pages are
		 * never partially shared i.e. if we share any constituent
		 * page of a large page with another process we must share the
		 * entire large page. Note this cannot happen for SOFTLOCK
		 * case, unless current address (a) is at the beginning of the
		 * next page size boundary because the other process couldn't
		 * have relocated locked pages.
		 */
		ASSERT(ierr == -1 || ierr == -2);

		if (segvn_anypgsz) {
			ASSERT(ierr == -2 || szc != 0);
			ASSERT(ierr == -1 || szc < seg->s_szc);
			szc = (ierr == -1) ? szc - 1 : szc + 1;
		} else {
			/*
			 * For non COW faults and segvn_anypgsz == 0
			 * we need to be careful not to loop forever
			 * if existing page is found with szc other
			 * than 0 or seg->s_szc. This could be due
			 * to page relocations on behalf of DR or
			 * more likely large page creation. For this
			 * case simply re-size to existing page's szc
			 * if returned by anon_map_getpages().
			 */
			if (ppa_szc == (uint_t)-1) {
				szc = (ierr == -1) ? 0 : seg->s_szc;
			} else {
				ASSERT(ppa_szc <= seg->s_szc);
				ASSERT(ierr == -2 || ppa_szc < szc);
				ASSERT(ierr == -1 || ppa_szc > szc);
				szc = ppa_szc;
			}
		}

		pgsz = page_get_pagesize(szc);
		pages = btop(pgsz);
		ASSERT(type != F_SOFTLOCK || ierr == -1 ||
		    (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
		if (type == F_SOFTLOCK) {
			/*
			 * For softlocks we cannot reduce the fault area
			 * (calculated based on the largest page size for this
			 * segment) for size down and a is already next
			 * page size aligned as asserted above for size
			 * ups. Therefore just continue in case of softlock.
			 */
			VM_STAT_ADD(segvnvmstats.fltanpages[9]);
			continue; /* keep lint happy */
		} else if (ierr == -2) {
			/*
			 * Size up case. Note lpgaddr may only be needed for
			 * softlock case so we don't adjust it here.
			 */
			VM_STAT_ADD(segvnvmstats.fltanpages[10]);
			a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
			ASSERT(a >= lpgaddr);
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
			aindx = svd->anon_index + seg_page(seg, a);
			vpage = (svd->vpage != NULL) ?
			    &svd->vpage[seg_page(seg, a)] : NULL;
		} else {
			/*
			 * Size down case. Note lpgaddr may only be needed for
			 * softlock case so we don't adjust it here.
			 */
			VM_STAT_ADD(segvnvmstats.fltanpages[11]);
			ASSERT(IS_P2ALIGNED(a, pgsz));
			ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
			ASSERT(a < lpgeaddr);
			if (a < addr) {
				/*
				 * The beginning of the large page region can
				 * be pulled to the right to make a smaller
				 * region. We haven't yet faulted a single
				 * page.
				 */
				VM_STAT_ADD(segvnvmstats.fltanpages[12]);
				a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
				ASSERT(a >= lpgaddr);
				aindx = svd->anon_index + seg_page(seg, a);
				vpage = (svd->vpage != NULL) ?
				    &svd->vpage[seg_page(seg, a)] : NULL;
			}
		}
	}
	VM_STAT_ADD(segvnvmstats.fltanpages[13]);
	ANON_LOCK_EXIT(&amp->a_rwlock);
	kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
	return (0);
error:
	VM_STAT_ADD(segvnvmstats.fltanpages[14]);
	ANON_LOCK_EXIT(&amp->a_rwlock);
	kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
	if (type == F_SOFTLOCK && a > lpgaddr) {
		VM_STAT_ADD(segvnvmstats.fltanpages[15]);
		segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
	}
	return (err);
}
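/*
 * Annotation added for clarity (not from the original source): a worked
 * example of the "size up" adjustment above, assuming a hypothetical 2MB
 * large page (pgsz == 0x200000) and a current fault address
 * a == 0x10234000:
 *
 *	a        = (caddr_t)P2ALIGN(0x10234000, 0x200000) = 0x10200000
 *	lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 0x200000)
 *
 * i.e. the retry window is widened to whole large-page boundaries, and
 * aindx/vpage are recomputed for the new starting address.
 */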
int fltadvice = 1;	/* set to free behind pages for sequential access */

/*
 * This routine is called via a machine specific fault handling routine.
 * It is also called by software routines wishing to lock or unlock
 * a range of addresses.
 *
 * Here is the basic algorithm:
 *	If unlocking
 *		Call segvn_softunlock
 *		Return
 *	endif
 *	Checking and set up work
 *	If we will need some non-anonymous pages
 *		Call fop_getpage over the range of non-anonymous pages
 *	endif
 *	Loop over all addresses requested
 *		Call segvn_faultpage passing in page list
 *		    to load up translations and handle anonymous pages
 *	endloop
 *	Load up translation to any additional pages in page list not
 *	    already handled that fit into this segment
 */
static faultcode_t
segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
    enum fault_type type, enum seg_rw rw)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	page_t **plp, **ppp, *pp;
	uoff_t off;
	caddr_t a;
	struct vpage *vpage;
	uint_t vpprot, prot;
	int err;
	page_t *pl[FAULT_TMP_PAGES_NUM + 1];
	size_t plsz, pl_alloc_sz;
	size_t page;
	ulong_t anon_index;
	struct anon_map *amp;
	int dogetpage = 0;
	caddr_t lpgaddr, lpgeaddr;
	size_t pgsz;
	anon_sync_obj_t cookie;
	int brkcow = BREAK_COW_SHARE(rw, type, svd->type);

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
	ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE);

	/*
	 * First handle the easy stuff
	 */
	if (type == F_SOFTUNLOCK) {
		if (rw == S_READ_NOCOW) {
			rw = S_READ;
			ASSERT(AS_WRITE_HELD(seg->s_as));
		}
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
		pgsz = (seg->s_szc == 0) ? PAGESIZE :
		    page_get_pagesize(seg->s_szc);
		VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]);
		CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
		segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw);
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (0);
	}

	ASSERT(svd->tr_state == SEGVN_TR_OFF ||
	    !HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	if (brkcow == 0) {
		if (svd->tr_state == SEGVN_TR_INIT) {
			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
			if (svd->tr_state == SEGVN_TR_INIT) {
				ASSERT(svd->vp != NULL && svd->amp == NULL);
				ASSERT(svd->flags & MAP_TEXT);
				ASSERT(svd->type == MAP_PRIVATE);
				segvn_textrepl(seg);
				ASSERT(svd->tr_state != SEGVN_TR_INIT);
				ASSERT(svd->tr_state != SEGVN_TR_ON ||
				    svd->amp != NULL);
			}
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		}
	} else if (svd->tr_state != SEGVN_TR_OFF) {
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);

		if (rw == S_WRITE && svd->tr_state != SEGVN_TR_OFF) {
			ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE));
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (FC_PROT);
		}

		if (svd->tr_state == SEGVN_TR_ON) {
			ASSERT(svd->vp != NULL && svd->amp != NULL);
			segvn_textunrepl(seg, 0);
			ASSERT(svd->amp == NULL &&
			    svd->tr_state == SEGVN_TR_OFF);
		} else if (svd->tr_state != SEGVN_TR_OFF) {
			svd->tr_state = SEGVN_TR_OFF;
		}
		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	}

top:
	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);

	/*
	 * If we have the same protections for the entire segment,
	 * ensure that the access being attempted is legitimate.
	 */
	if (svd->pageprot == 0) {
		uint_t protchk;

		switch (rw) {
		case S_READ:
		case S_READ_NOCOW:
			protchk = PROT_READ;
			break;
		case S_WRITE:
			protchk = PROT_WRITE;
			break;
		case S_EXEC:
			protchk = PROT_EXEC;
			break;
		case S_OTHER:
		default:
			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
			break;
		}

		if ((svd->prot & protchk) == 0) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (FC_PROT);	/* illegal access type */
		}
	}

	if (brkcow && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
		/* this must be SOFTLOCK S_READ fault */
		ASSERT(svd->amp == NULL);
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
		if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
			/*
			 * this must be the first ever non S_READ_NOCOW
			 * softlock for this segment.
			 */
			ASSERT(svd->softlockcnt == 0);
			hat_leave_region(seg->s_as->a_hat, svd->rcookie,
			    HAT_REGION_TEXT);
			svd->rcookie = HAT_INVALID_REGION_COOKIE;
		}
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		goto top;
	}

	/*
	 * We can't allow the long term use of softlocks for vmpss segments,
	 * because in some file truncation cases we should be able to demote
	 * the segment, which requires that there are no softlocks.  The
	 * only case where it's ok to allow a SOFTLOCK fault against a vmpss
	 * segment is S_READ_NOCOW, where the caller holds the address space
	 * locked as writer and calls softunlock before dropping the as lock.
	 * S_READ_NOCOW is used by /proc to read memory from another user.
	 *
	 * Another deadlock between SOFTLOCK and file truncation can happen
	 * because segvn_fault_vnodepages() calls the FS one pagesize at
	 * a time. A second fop_getpage() call by segvn_fault_vnodepages()
	 * can cause a deadlock because the first set of page_t's remain
	 * locked SE_SHARED. To avoid this, we demote segments on a first
	 * SOFTLOCK if they have a length greater than the segment's
	 * page size.
	 *
	 * So for now, we only avoid demoting a segment on a SOFTLOCK when
	 * the access type is S_READ_NOCOW and the fault length is less than
	 * or equal to the segment's page size. While this is quite restrictive,
	 * it should be the most common case of SOFTLOCK against a vmpss
	 * segment.
	 *
	 * For S_READ_NOCOW, it's safe not to do a copy on write because the
	 * caller makes sure no COW will be caused by another thread for a
	 * softlocked page.
	 */
	if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) {
		int demote = 0;

		if (rw != S_READ_NOCOW) {
			demote = 1;
		}
		if (!demote && len > PAGESIZE) {
			pgsz = page_get_pagesize(seg->s_szc);
			CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr,
			    lpgeaddr);
			if (lpgeaddr - lpgaddr > pgsz) {
				demote = 1;
			}
		}

		ASSERT(demote || AS_WRITE_HELD(seg->s_as));

		if (demote) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
			if (seg->s_szc != 0) {
				segvn_vmpss_clrszc_cnt++;
				ASSERT(svd->softlockcnt == 0);
				err = segvn_clrszc(seg);
				if (err) {
					segvn_vmpss_clrszc_err++;
					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
					return (FC_MAKE_ERR(err));
				}
			}
			ASSERT(seg->s_szc == 0);
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			goto top;
		}
	}

	/*
	 * Check to see if we need to allocate an anon_map structure.
	 */
	if (svd->amp == NULL && (svd->vp == NULL || brkcow)) {
		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
		/*
		 * Drop the "read" lock on the segment and acquire
		 * the "write" version since we have to allocate the
		 * anon_map.
		 */
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);

		if (svd->amp == NULL) {
			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
			svd->amp->a_szc = seg->s_szc;
		}
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

		/*
		 * Start all over again since segment protections
		 * may have changed after we dropped the "read" lock.
		 */
		goto top;
	}

	/*
	 * S_READ_NOCOW vs S_READ distinction was
	 * only needed for the code above. After
	 * that we treat it as S_READ.
	 */
	if (rw == S_READ_NOCOW) {
		ASSERT(type == F_SOFTLOCK);
		ASSERT(AS_WRITE_HELD(seg->s_as));
		rw = S_READ;
	}

	amp = svd->amp;

	/*
	 * MADV_SEQUENTIAL work is ignored for large page segments.
	 */
	if (seg->s_szc != 0) {
		pgsz = page_get_pagesize(seg->s_szc);
		ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
		CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
		if (svd->vp == NULL) {
			err = segvn_fault_anonpages(hat, seg, lpgaddr,
			    lpgeaddr, type, rw, addr, addr + len, brkcow);
		} else {
			err = segvn_fault_vnodepages(hat, seg, lpgaddr,
			    lpgeaddr, type, rw, addr, addr + len, brkcow);
			if (err == IE_RETRY) {
				ASSERT(seg->s_szc == 0);
				ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
				goto top;
			}
		}
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (err);
	}

	page = seg_page(seg, addr);
	if (amp != NULL) {
		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
		anon_index = svd->anon_index + page;

		if (type == F_PROT && rw == S_READ &&
		    svd->tr_state == SEGVN_TR_OFF &&
		    svd->type == MAP_PRIVATE && svd->pageprot == 0) {
			size_t index = anon_index;
			struct anon *ap;

			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			/*
			 * The fast path could apply to S_WRITE also, except
			 * that the protection fault could be caused by lazy
			 * tlb flush when ro->rw. In this case, the pte is
			 * RW already. But RO in the other cpu's tlb causes
			 * the fault. Since hat_chgprot won't do anything if
			 * pte doesn't change, we may end up faulting
			 * indefinitely until the RO tlb entry gets replaced.
			 */
			for (a = addr; a < addr + len; a += PAGESIZE, index++) {
				anon_array_enter(amp, index, &cookie);
				ap = anon_get_ptr(amp->ahp, index);
				anon_array_exit(&cookie);
				if ((ap == NULL) || (ap->an_refcnt != 1)) {
					ANON_LOCK_EXIT(&amp->a_rwlock);
					goto slow;
				}
			}
			hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot);
			ANON_LOCK_EXIT(&amp->a_rwlock);
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (0);
		}
	}
slow:

	if (svd->vpage == NULL)
		vpage = NULL;
	else
		vpage = &svd->vpage[page];

	off = svd->offset + (uintptr_t)(addr - seg->s_base);

	/*
	 * If MADV_SEQUENTIAL has been set for the particular page we
	 * are faulting on, free behind all pages in the segment and put
	 * them on the free list.
	 */
	if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) {
		struct vpage *vpp;
		ulong_t fanon_index;
		size_t fpage;
		uoff_t pgoff, fpgoff;
		struct vnode *fvp;
		struct anon *fap = NULL;

		if (svd->advice == MADV_SEQUENTIAL ||
		    (svd->pageadvice &&
		    VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) {
			pgoff = off - PAGESIZE;
			fpage = page - 1;
			if (vpage != NULL)
				vpp = &svd->vpage[fpage];
			if (amp != NULL)
				fanon_index = svd->anon_index + fpage;

			while (pgoff > svd->offset) {
				if (svd->advice != MADV_SEQUENTIAL &&
				    (!svd->pageadvice || (vpage &&
				    VPP_ADVICE(vpp) != MADV_SEQUENTIAL)))
					break;

				/*
				 * If this is an anon page, we must find the
				 * correct <vp, offset> for it
				 */
				fap = NULL;
				if (amp != NULL) {
					ANON_LOCK_ENTER(&amp->a_rwlock,
					    RW_READER);
					anon_array_enter(amp, fanon_index,
					    &cookie);
					fap = anon_get_ptr(amp->ahp,
					    fanon_index);
					if (fap != NULL) {
						swap_xlate(fap, &fvp, &fpgoff);
					} else {
						fpgoff = pgoff;
						fvp = svd->vp;
					}
					anon_array_exit(&cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
				} else {
					fpgoff = pgoff;
					fvp = svd->vp;
				}
				if (fvp == NULL)
					break;	/* XXX */
				/*
				 * Skip pages that are free or have an
				 * "exclusive" lock.
				 */
				pp = page_lookup_nowait(&fvp->v_object,
				    fpgoff, SE_SHARED);
				if (pp == NULL)
					break;
				/*
				 * We don't need the page_struct_lock to test
				 * as this is only advisory; even if we
				 * acquire it someone might race in and lock
				 * the page after we unlock and before the
				 * PUTPAGE, then fop_putpage will do nothing.
				 */
				if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
					/*
					 * Hold the vnode before releasing
					 * the page lock to prevent it from
					 * being freed and re-used by some
					 * other thread.
					 */
					VN_HOLD(fvp);
					page_unlock(pp);
					/*
					 * We should build a page list
					 * to kluster putpages XXX
					 */
					(void) fop_putpage(fvp,
					    (offset_t)fpgoff, PAGESIZE,
					    (B_DONTNEED|B_FREE|B_ASYNC),
					    svd->cred, NULL);
					VN_RELE(fvp);
				} else {
					/*
					 * XXX - Should the loop terminate if
					 * the page is `locked'?
					 */
					page_unlock(pp);
				}
				--vpp;
				--fanon_index;
				pgoff -= PAGESIZE;
			}
		}
	}

	plp = pl;
	plp[0] = PAGE_HANDLED;		/* Mark the page list as empty */
	pl_alloc_sz = 0;

	/*
	 * See if we need to call fop_getpage for
	 * *any* of the range being faulted on.
	 * We can skip all of this work if there
	 * was no original vnode.
	 */
	if (svd->vp != NULL) {
		uoff_t vp_off;
		size_t vp_len;
		struct anon *ap;
		vnode_t *vp;

		vp_off = off;
		vp_len = len;

		if (amp == NULL)
			dogetpage = 1;
		else {
			/*
			 * Only acquire reader lock to prevent amp->ahp
			 * from being changed. It's ok to miss pages,
			 * hence we don't do anon_array_enter
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			ap = anon_get_ptr(amp->ahp, anon_index);

			if (len <= PAGESIZE)
				/* inline non_anon() */
				dogetpage = (ap == NULL);
			else
				dogetpage = non_anon(amp->ahp, anon_index,
				    &vp_off, &vp_len);
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}

		if (dogetpage) {
			enum seg_rw arw;
			struct as *as = seg->s_as;

			if (len > FAULT_TMP_PAGES_SZ) {
				/*
				 * Page list won't fit in local array,
				 * allocate one of the needed size.
				 */
				pl_alloc_sz =
				    (btop(len) + 1) * sizeof (page_t *);
				plp = kmem_alloc(pl_alloc_sz, KM_SLEEP);
				plp[0] = NULL;
				plsz = len;
			} else if (rw == S_WRITE && svd->type == MAP_PRIVATE ||
			    svd->tr_state == SEGVN_TR_ON || rw == S_OTHER ||
			    (((size_t)(addr + PAGESIZE) <
			    (size_t)(seg->s_base + seg->s_size)) &&
			    hat_probe(as->a_hat, addr + PAGESIZE))) {
				/*
				 * Ask fop_getpage to return the exact number
				 * of pages if
				 * (a) this is a COW fault, or
				 * (b) this is a software fault, or
				 * (c) next page is already mapped.
				 */
				plsz = len;
			} else {
				/*
				 * Ask fop_getpage to return adjacent pages
				 * within the segment.
				 */
				plsz = MIN((size_t)FAULT_TMP_PAGES_SZ, (size_t)
				    ((seg->s_base + seg->s_size) - addr));
				ASSERT((addr + plsz) <=
				    (seg->s_base + seg->s_size));
			}

			/*
			 * Need to get some non-anonymous pages.
			 * We need to make only one call to GETPAGE to do
			 * this to prevent certain deadlocking conditions
			 * when we are doing locking. In this case
			 * non_anon() should have picked up the smallest
			 * range which includes all the non-anonymous
			 * pages in the requested range. We have to
			 * be careful regarding which rw flag to pass in
			 * because on a private mapping, the underlying
			 * object is never allowed to be written.
			 */
			if (rw == S_WRITE && svd->type == MAP_PRIVATE) {
				arw = S_READ;
			} else {
				arw = rw;
			}
			vp = svd->vp;
			err = fop_getpage(vp, (offset_t)vp_off, vp_len,
			    &vpprot, plp, plsz, seg, addr + (vp_off - off), arw,
			    svd->cred, NULL);
			if (err) {
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
				segvn_pagelist_rele(plp);
				if (pl_alloc_sz)
					kmem_free(plp, pl_alloc_sz);
				return (FC_MAKE_ERR(err));
			}
			if (svd->type == MAP_PRIVATE)
				vpprot &= ~PROT_WRITE;
		}
	}

	/*
	 * N.B. at this time the plp array has all the needed non-anon
	 * pages in addition to (possibly) having some adjacent pages.
	 */

	/*
	 * Always acquire the anon_array_lock to prevent
	 * 2 threads from allocating separate anon slots for
	 * the same "addr".
	 *
	 * If this is a copy-on-write fault and we don't already
	 * have the anon_array_lock, acquire it to prevent the
	 * fault routine from handling multiple copy-on-write faults
	 * on the same "addr" in the same address space.
	 *
	 * Only one thread should deal with the fault since after
	 * it is handled, the other threads can acquire a translation
	 * to the newly created private page. This prevents two or
	 * more threads from creating different private pages for the
	 * same fault.
	 *
	 * We grab "serialization" lock here if this is a MAP_PRIVATE segment
	 * to prevent deadlock between this thread and another thread
	 * which has soft-locked this page and wants to acquire serial_lock.
	 *
	 * The fix for bug 4026339 becomes unnecessary when using the
	 * locking scheme with per amp rwlock and a global set of hash
	 * lock, anon_array_lock. If we steal a vnode page when low
	 * on memory and upgrade the page lock through page_rename,
	 * then the page is PAGE_HANDLED, nothing needs to be done
	 * for this page after returning from segvn_faultpage.
	 *
	 * But really, the page lock should be downgraded after
	 * the stolen page is page_rename'd.
	 */
	if (amp != NULL)
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);

	/*
	 * Ok, now loop over the address range and handle faults
	 */
	for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) {
		err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot,
		    type, rw, brkcow);
		if (err) {
			if (amp != NULL)
				ANON_LOCK_EXIT(&amp->a_rwlock);
			if (type == F_SOFTLOCK && a > addr) {
				segvn_softunlock(seg, addr, (a - addr),
				    S_OTHER);
			}
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			segvn_pagelist_rele(plp);
			if (pl_alloc_sz)
				kmem_free(plp, pl_alloc_sz);
			return (err);
		}
		if (vpage) {
			vpage++;
		} else if (svd->vpage) {
			page = seg_page(seg, addr);
			vpage = &svd->vpage[++page];
		}
	}

	/* Didn't get pages from the underlying fs so we're done */
	if (!dogetpage)
		goto done;

	/*
	 * Now handle any other pages in the list returned.
	 * If the page can be used, load up the translations now.
	 * Note that the for loop will only be entered if "plp"
	 * is pointing to a non-NULL page pointer which means that
	 * fop_getpage() was called and vpprot has been initialized.
	 */
	if (svd->pageprot == 0)
		prot = svd->prot & vpprot;

	/*
	 * Large Files: diff should be unsigned value because we started
	 * supporting > 2GB segment sizes from 2.5.1 and when a
	 * large file of size > 2GB gets mapped to address space
	 * the diff value can be > 2GB.
	 */
	for (ppp = plp; (pp = *ppp) != NULL; ppp++) {
		size_t diff;
		struct anon *ap;
		anon_sync_obj_t cookie;
		int hat_flag = HAT_LOAD_ADV;

		if (svd->flags & MAP_TEXT) {
			hat_flag |= HAT_LOAD_TEXT;
		}

		if (pp == PAGE_HANDLED)
			continue;

		if (svd->tr_state != SEGVN_TR_ON &&
		    pp->p_offset >= svd->offset &&
		    pp->p_offset < svd->offset + seg->s_size) {

			diff = pp->p_offset - svd->offset;

			/*
			 * Large Files: Following is the assertion
			 * validating the above cast.
			 */
			VERIFY(&svd->vp->v_object == pp->p_object);
			ASSERT(svd->vp == pp->p_vnode);

			page = btop(diff);
			if (svd->pageprot)
				prot = VPP_PROT(&svd->vpage[page]) & vpprot;

			/*
			 * Prevent other threads in the address space from
			 * creating private pages (i.e., allocating anon slots)
			 * while we are in the process of loading translations
			 * to additional pages returned by the underlying
			 * object.
			 */
			if (amp != NULL) {
				anon_index = svd->anon_index + page;
				anon_array_enter(amp, anon_index, &cookie);
				ap = anon_get_ptr(amp->ahp, anon_index);
			}
			if ((amp == NULL) || (ap == NULL)) {
				if (IS_VMODSORT(pp->p_vnode) ||
				    enable_mbit_wa) {
					if (rw == S_WRITE)
						hat_setmod(pp);
					else if (rw != S_OTHER &&
					    !hat_ismod(pp))
						prot &= ~PROT_WRITE;
				}
				/*
				 * Skip mapping read ahead pages marked
				 * for migration, so they will get migrated
				 * properly on fault
				 */
				ASSERT(amp == NULL ||
				    svd->rcookie == HAT_INVALID_REGION_COOKIE);
				if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) {
					hat_memload_region(hat,
					    seg->s_base + diff,
					    pp, prot, hat_flag,
					    svd->rcookie);
				}
			}
			if (amp != NULL)
				anon_array_exit(&cookie);
		}
		page_unlock(pp);
	}
done:
	if (amp != NULL)
		ANON_LOCK_EXIT(&amp->a_rwlock);
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	if (pl_alloc_sz)
		kmem_free(plp, pl_alloc_sz);
	return (0);
}
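/*
 * Annotation added for clarity (not from the original source): the
 * page-list sizing in segvn_fault() can be illustrated assuming a
 * hypothetical 4K PAGESIZE.  FAULT_TMP_PAGES_SZ is then ptob(8) == 32K,
 * so a 48K fault overflows the on-stack pl[] array and instead allocates
 * (btop(48K) + 1) * sizeof (page_t *) == 13 pointer slots from kmem,
 * the extra slot holding the NULL terminator.
 */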
/*
 * This routine is used to start I/O on pages asynchronously.  XXX it will
 * only create PAGESIZE pages. At fault time they will be relocated into
 * larger pages.
 */
static faultcode_t
segvn_faulta(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	int err;
	struct anon_map *amp;
	vnode_t *vp;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
	if ((amp = svd->amp) != NULL) {
		struct anon *ap;

		/*
		 * Reader lock to prevent amp->ahp from being changed.
		 * This is advisory, it's ok to miss a page, so
		 * we don't do anon_array_enter lock.
		 */
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		if ((ap = anon_get_ptr(amp->ahp,
		    svd->anon_index + seg_page(seg, addr))) != NULL) {

			err = anon_getpage(&ap, NULL, NULL,
			    0, seg, addr, S_READ, svd->cred);

			ANON_LOCK_EXIT(&amp->a_rwlock);
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			if (err)
				return (FC_MAKE_ERR(err));
			return (0);
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
	}

	if (svd->vp == NULL) {
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (0);			/* zfod page - do nothing now */
	}

	vp = svd->vp;
	err = fop_getpage(vp,
	    (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)),
	    PAGESIZE, NULL, NULL, 0, seg, addr,
	    S_OTHER, svd->cred, NULL);

	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	if (err)
		return (FC_MAKE_ERR(err));
	return (0);
}
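/*
 * Annotation added for clarity (not from the original source): a typical
 * caller sequence is as_faulta() invoking this entry point once per page,
 * e.g. on behalf of a memcntl()/madvise() "will need" advice request;
 * errors from anon_getpage()/fop_getpage() are wrapped with FC_MAKE_ERR()
 * so the as layer can tell them apart from protection faults.
 */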
static int
segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *cvp, *svp, *evp;
	struct vnode *vp;
	size_t pgsz;
	pgcnt_t pgcnt;
	anon_sync_obj_t cookie;
	int unload_done = 0;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	if ((svd->maxprot & prot) != prot)
		return (EACCES);			/* violated maxprot */

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);

	/* return if prot is the same */
	if (!svd->pageprot && svd->prot == prot) {
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (0);
	}

	/*
	 * Since we change protections we first have to flush the cache.
	 * This makes sure all the pagelock calls have to recheck
	 * protections.
	 */
	if (svd->softlockcnt > 0) {
		ASSERT(svd->tr_state == SEGVN_TR_OFF);

		/*
		 * If this is shared segment non 0 softlockcnt
		 * means locked pages are still in use.
		 */
		if (svd->type == MAP_SHARED) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (EAGAIN);
		}

		/*
		 * Since we do have the segvn writers lock nobody can fill
		 * the cache with entries belonging to this seg during
		 * the purge. The flush either succeeds or we still have
		 * pending I/Os.
		 */
		segvn_purge(seg);
		if (svd->softlockcnt > 0) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (EAGAIN);
		}
	}

	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
		ASSERT(svd->amp == NULL);
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
		    HAT_REGION_TEXT);
		svd->rcookie = HAT_INVALID_REGION_COOKIE;
		unload_done = 1;
	} else if (svd->tr_state == SEGVN_TR_INIT) {
		svd->tr_state = SEGVN_TR_OFF;
	} else if (svd->tr_state == SEGVN_TR_ON) {
		ASSERT(svd->amp != NULL);
		segvn_textunrepl(seg, 0);
		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
		unload_done = 1;
	}

	if ((prot & PROT_WRITE) && svd->type == MAP_SHARED &&
	    svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
		segvn_inval_trcache(svd->vp);
	}

	if (seg->s_szc != 0) {
		int err;
		pgsz = page_get_pagesize(seg->s_szc);
		pgcnt = pgsz >> PAGESHIFT;
		ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			ASSERT(seg->s_base != addr || seg->s_size != len);
			/*
			 * If we are holding the as lock as a reader then
			 * we need to return IE_RETRY and let the as
			 * layer drop and re-acquire the lock as a writer.
			 */
			if (AS_READ_HELD(seg->s_as))
				return (IE_RETRY);
			VM_STAT_ADD(segvnvmstats.demoterange[1]);
			if (svd->type == MAP_PRIVATE || svd->vp != NULL) {
				err = segvn_demote_range(seg, addr, len,
				    SDR_END, 0);
			} else {
				uint_t szcvec = map_pgszcvec(seg->s_base,
				    pgsz, (uintptr_t)seg->s_base,
				    (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0);
				err = segvn_demote_range(seg, addr, len,
				    SDR_END, szcvec);
			}
			if (err == 0)
				return (IE_RETRY);
			if (err == ENOMEM)
				return (IE_NOMEM);
			return (err);
		}
	}

	/*
	 * If it's a private mapping and we're making it writable then we
	 * may have to reserve the additional swap space now. If we are
	 * making writable only a part of the segment then we use its vpage
	 * array to keep a record of the pages for which we have reserved
	 * swap. In this case we set the pageswap field in the segment's
	 * segvn structure to record this.
	 *
	 * If it's a private mapping to a file (i.e., vp != NULL) and we're
	 * removing write permission on the entire segment and we haven't
	 * modified any pages, we can release the swap space.
	 */
	if (svd->type == MAP_PRIVATE) {
		if (prot & PROT_WRITE) {
			if (!(svd->flags & MAP_NORESERVE) &&
			    !(svd->swresv && svd->pageswap == 0)) {
				size_t sz = 0;

				/*
				 * Start by determining how much swap
				 * space is required.
				 */
				if (addr == seg->s_base &&
				    len == seg->s_size &&
				    svd->pageswap == 0) {
					/* The whole segment */
					sz = seg->s_size;
				} else {
					/*
					 * Make sure that the vpage array
					 * exists, and make a note of the
					 * range of elements corresponding
					 * to len.
					 */
					segvn_vpage(seg);
					if (svd->vpage == NULL) {
						SEGVN_LOCK_EXIT(seg->s_as,
						    &svd->lock);
						return (ENOMEM);
					}
					svp = &svd->vpage[seg_page(seg, addr)];
					evp = &svd->vpage[seg_page(seg,
					    addr + len - 1) + 1];

					if (svd->pageswap == 0) {
						/*
						 * This is the first time we've
						 * asked for a part of this
						 * segment, so we need to
						 * reserve everything we've
						 * been asked for.
						 */
						sz = len;
					} else {
						/*
						 * We have to count the number
						 * of pages required.
						 */
						for (cvp = svp; cvp < evp;
						    cvp++) {
							if (!VPP_ISSWAPRES(cvp))
								sz++;
						}
						sz <<= PAGESHIFT;
					}
				}

				/* Try to reserve the necessary swap. */
				if (anon_resv_zone(sz,
				    seg->s_as->a_proc->p_zone) == 0) {
					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
					return (IE_NOMEM);
				}

				/*
				 * Make a note of how much swap space
				 * we've reserved.
				 */
				if (svd->pageswap == 0 && sz == seg->s_size) {
					svd->swresv = sz;
				} else {
					ASSERT(svd->vpage != NULL);
					svd->swresv += sz;
					svd->pageswap = 1;
					for (cvp = svp; cvp < evp; cvp++) {
						if (!VPP_ISSWAPRES(cvp))
							VPP_SETSWAPRES(cvp);
					}
				}
			}
		} else {
			/*
			 * Swap space is released only if this segment
			 * does not map anonymous memory, since read faults
			 * on such segments still need an anon slot to read
			 * in the data.
			 */
			if (svd->swresv != 0 && svd->vp != NULL &&
			    svd->amp == NULL && addr == seg->s_base &&
			    len == seg->s_size && svd->pageprot == 0) {
				ASSERT(svd->pageswap == 0);
				anon_unresv_zone(svd->swresv,
				    seg->s_as->a_proc->p_zone);
				svd->swresv = 0;
			}
		}
	}

	if (addr == seg->s_base && len == seg->s_size && svd->vpage == NULL) {
		if (svd->prot == prot) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (0);			/* all done */
		}
		svd->prot = (uchar_t)prot;
	} else if (svd->type == MAP_PRIVATE) {
		struct anon *ap = NULL;
		page_t *pp;
		uoff_t offset, off;
		struct anon_map *amp;
		ulong_t anon_idx = 0;

		/*
		 * A vpage structure exists or else the change does not
		 * involve the entire segment. Establish a vpage structure
		 * if none is there. Then, for each page in the range,
		 * adjust its individual permissions. Note that write-
		 * enabling a MAP_PRIVATE page can affect the claims for
		 * locked down memory. Overcommitting memory terminates
		 * the operation.
		 */
		segvn_vpage(seg);
		if (svd->vpage == NULL) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (ENOMEM);
		}

		if ((amp = svd->amp) != NULL) {
			anon_idx = svd->anon_index + seg_page(seg, addr);
			ASSERT(seg->s_szc == 0 ||
			    IS_P2ALIGNED(anon_idx, pgcnt));
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		}

		offset = svd->offset + (uintptr_t)(addr - seg->s_base);
		evp = &svd->vpage[seg_page(seg, addr + len)];

		/*
		 * See Statement at the beginning of segvn_lockop regarding
		 * the way cowcnts and lckcnts are handled.
		 */
		for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {

			if (seg->s_szc != 0) {
				if (amp != NULL) {
					anon_array_enter(amp, anon_idx,
					    &cookie);
				}
				if (IS_P2ALIGNED(anon_idx, pgcnt) &&
				    !segvn_claim_pages(seg, svp, offset,
				    anon_idx, prot)) {
					if (amp != NULL) {
						anon_array_exit(&cookie);
					}
					break;
				}
				if (amp != NULL) {
					anon_array_exit(&cookie);
				}
				anon_idx++;
			} else {
				if (amp != NULL) {
					anon_array_enter(amp, anon_idx,
					    &cookie);
					ap = anon_get_ptr(amp->ahp, anon_idx++);
				}

				if (VPP_ISPPLOCK(svp) &&
				    VPP_PROT(svp) != prot) {

					if (amp == NULL || ap == NULL) {
						vp = svd->vp;
						off = offset;
					} else
						swap_xlate(ap, &vp, &off);
					if (amp != NULL)
						anon_array_exit(&cookie);

					if ((pp = page_lookup(&vp->v_object, off, SE_SHARED)) == NULL) {
						panic("segvn_setprot: no page");
					}
					ASSERT(seg->s_szc == 0);
					if ((VPP_PROT(svp) ^ prot) &
					    PROT_WRITE) {
						if (prot & PROT_WRITE) {
							if (!page_addclaim(
							    pp)) {
								page_unlock(pp);
								break;
							}
						} else {
							if (!page_subclaim(
							    pp)) {
								page_unlock(pp);
								break;
							}
						}
					}
					page_unlock(pp);
				} else if (amp != NULL)
					anon_array_exit(&cookie);
			}
			VPP_SETPROT(svp, prot);
			offset += PAGESIZE;
		}
		if (amp != NULL)
			ANON_LOCK_EXIT(&amp->a_rwlock);

		/*
		 * Did we terminate prematurely? If so, simply unload
		 * the translations to the things we've updated so far.
		 */
		if (svp < evp) {
			if (unload_done) {
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
				return (IE_NOMEM);
			}
			len = (svp - &svd->vpage[seg_page(seg, addr)]) *
			    PAGESIZE;
			ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz));
			if (len != 0)
				hat_unload(seg->s_as->a_hat, addr,
				    len, HAT_UNLOAD);
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (IE_NOMEM);
		}
	} else {
		segvn_vpage(seg);
		if (svd->vpage == NULL) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (ENOMEM);
		}
		evp = &svd->vpage[seg_page(seg, addr + len)];
		for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
			VPP_SETPROT(svp, prot);
		}
	}

	if (unload_done) {
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (0);
	}

	if (((prot & PROT_WRITE) != 0 &&
	    (svd->vp != NULL || svd->type == MAP_PRIVATE)) ||
	    (prot & ~PROT_USER) == PROT_NONE) {
		/*
		 * Either private or shared data with write access (in
		 * which case we need to throw out all former translations
		 * so that we get the right translations set up on fault
		 * and we don't allow write access to any copy-on-write pages
		 * that might be around or to prevent write access to pages
		 * representing holes in a file), or we don't have permission
		 * to access the memory at all (in which case we have to
		 * unload any current translations that might exist).
		 */
		hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
	} else {
		/*
		 * A shared mapping or a private mapping in which write
		 * protection is going to be denied - just change all the
		 * protections over the range of addresses in question.
		 * segvn does not support any other attributes other
		 * than prot so we can use hat_chgattr.
		 */
		hat_chgattr(seg->s_as->a_hat, addr, len, prot);
	}

	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

	return (0);
}
/*
 * segvn_setpagesize is called via segop_setpagesize from as_setpagesize,
 * to determine if the seg is capable of mapping the requested szc.
 */
static int
segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *nsvd;
	struct anon_map *amp = svd->amp;
	struct seg *nseg;
	caddr_t eaddr = addr + len, a;
	size_t pgsz = page_get_pagesize(szc);
	pgcnt_t pgcnt = page_get_pagecnt(szc);
	int err;
	uoff_t off = svd->offset + (uintptr_t)(addr - seg->s_base);

	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
	ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);

	if (seg->s_szc == szc || segvn_lpg_disable != 0) {
		return (0);
	}

	/*
	 * addr should always be pgsz aligned but eaddr may be misaligned if
	 * it's at the end of the segment.
	 *
	 * XXX we should assert this condition since as_setpagesize() logic
	 * guarantees it.
	 */
	if (!IS_P2ALIGNED(addr, pgsz) ||
	    (!IS_P2ALIGNED(eaddr, pgsz) &&
	    eaddr != seg->s_base + seg->s_size)) {
		segvn_setpgsz_align_err++;
		return (EINVAL);
	}

	if (amp != NULL && svd->type == MAP_SHARED) {
		ulong_t an_idx = svd->anon_index + seg_page(seg, addr);
		if (!IS_P2ALIGNED(an_idx, pgcnt)) {
			segvn_setpgsz_anon_align_err++;
			return (EINVAL);
		}
	}

	if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas ||
	    szc > segvn_maxpgszc) {
		return (EINVAL);
	}

	/* paranoid check */
	if (svd->vp != NULL &&
	    (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) {
		return (EINVAL);
	}

	if (seg->s_szc == 0 && svd->vp != NULL &&
	    map_addr_vacalign_check(addr, off)) {
		return (EINVAL);
	}

	/*
	 * Check that protections are the same within new page
	 * size boundaries.
	 */
	if (svd->pageprot) {
		for (a = addr; a < eaddr; a += pgsz) {
			if ((a + pgsz) > eaddr) {
				if (!sameprot(seg, a, eaddr - a)) {
					return (EINVAL);
				}
			} else {
				if (!sameprot(seg, a, pgsz)) {
					return (EINVAL);
				}
			}
		}
	}

	/*
	 * Since we are changing page size we first have to flush
	 * the cache. This makes sure all the pagelock calls have
	 * to recheck protections.
	 */
	if (svd->softlockcnt > 0) {
		ASSERT(svd->tr_state == SEGVN_TR_OFF);

		/*
		 * If this is shared segment non 0 softlockcnt
		 * means locked pages are still in use.
		 */
		if (svd->type == MAP_SHARED) {
			return (EAGAIN);
		}

		/*
		 * Since we do have the segvn writers lock nobody can fill
		 * the cache with entries belonging to this seg during
		 * the purge. The flush either succeeds or we still have
		 * pending I/Os.
		 */
		segvn_purge(seg);
		if (svd->softlockcnt > 0) {
			return (EAGAIN);
		}
	}

	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
		ASSERT(svd->amp == NULL);
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
		    HAT_REGION_TEXT);
		svd->rcookie = HAT_INVALID_REGION_COOKIE;
	} else if (svd->tr_state == SEGVN_TR_INIT) {
		svd->tr_state = SEGVN_TR_OFF;
	} else if (svd->tr_state == SEGVN_TR_ON) {
		ASSERT(svd->amp != NULL);
		segvn_textunrepl(seg, 1);
		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
		amp = NULL;
	}

	/*
	 * Operation for sub range of existing segment.
	 */
	if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) {
		if (szc < seg->s_szc) {
			VM_STAT_ADD(segvnvmstats.demoterange[2]);
			err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0);
			if (err == 0) {
				return (IE_RETRY);
			}
			if (err == ENOMEM) {
				return (IE_NOMEM);
			}
			return (err);
		}
		if (addr != seg->s_base) {
			nseg = segvn_split_seg(seg, addr);
			if (eaddr != (nseg->s_base + nseg->s_size)) {
				/* eaddr is szc aligned */
				(void) segvn_split_seg(nseg, eaddr);
			}
			return (IE_RETRY);
		}
		if (eaddr != (seg->s_base + seg->s_size)) {
			/* eaddr is szc aligned */
			(void) segvn_split_seg(seg, eaddr);
		}
		return (IE_RETRY);
	}

	/*
	 * Break any low level sharing and reset seg->s_szc to 0.
	 */
	if ((err = segvn_clrszc(seg)) != 0) {
		if (err == ENOMEM) {
			err = IE_NOMEM;
		}
		return (err);
	}
	ASSERT(seg->s_szc == 0);

	/*
	 * If the end of the current segment is not pgsz aligned
	 * then attempt to concatenate with the next segment.
	 */
	if (!IS_P2ALIGNED(eaddr, pgsz)) {
		nseg = AS_SEGNEXT(seg->s_as, seg);
		if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) {
			return (ENOMEM);
		}
		if (nseg->s_ops != &segvn_ops) {
			return (EINVAL);
		}
		nsvd = (struct segvn_data *)nseg->s_data;
		if (nsvd->softlockcnt > 0) {
			/*
			 * If this is shared segment non 0 softlockcnt
			 * means locked pages are still in use.
			 */
			if (nsvd->type == MAP_SHARED) {
				return (EAGAIN);
			}
			segvn_purge(nseg);
			if (nsvd->softlockcnt > 0) {
				return (EAGAIN);
			}
		}
		err = segvn_clrszc(nseg);
		if (err == ENOMEM) {
			err = IE_NOMEM;
		}
		if (err != 0) {
			return (err);
		}
		ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
		err = segvn_concat(seg, nseg, 1);
		if (err == -1) {
			return (EINVAL);
		}
		if (err == -2) {
			return (IE_NOMEM);
		}
		return (IE_RETRY);
	}

	/*
	 * May need to re-align anon array to
	 * new szc.
	 */
	if (amp != NULL) {
		if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) {
			struct anon_hdr *nahp;

			ASSERT(svd->type == MAP_PRIVATE);

			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			ASSERT(amp->refcnt == 1);
			nahp = anon_create(btop(amp->size), ANON_NOSLEEP);
			if (nahp == NULL) {
				ANON_LOCK_EXIT(&amp->a_rwlock);
				return (IE_NOMEM);
			}
			if (anon_copy_ptr(amp->ahp, svd->anon_index,
			    nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) {
				anon_release(nahp, btop(amp->size));
				ANON_LOCK_EXIT(&amp->a_rwlock);
				return (IE_NOMEM);
			}
			anon_release(amp->ahp, btop(amp->size));
			amp->ahp = nahp;
			svd->anon_index = 0;
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
	}
	if (svd->vp != NULL && szc != 0) {
		struct vattr va;
		uoff_t eoffpage = svd->offset;
		va.va_mask = VATTR_SIZE;
		eoffpage += seg->s_size;
		eoffpage = btopr(eoffpage);
		if (fop_getattr(svd->vp, &va, 0, svd->cred, NULL) != 0) {
			segvn_setpgsz_getattr_err++;
			return (EINVAL);
		}
		if (btopr(va.va_size) < eoffpage) {
			segvn_setpgsz_eof_err++;
			return (EINVAL);
		}
		if (amp != NULL) {
			/*
			 * anon_fill_cow_holes() may call fop_getpage().
			 * don't take anon map lock here to avoid holding it
			 * across fop_getpage() calls that may call back into
			 * segvn for klustering checks. We don't really need
			 * anon map lock here since it's a private segment and
			 * we hold as level lock as writers.
			 */
			if ((err = anon_fill_cow_holes(seg, seg->s_base,
			    amp->ahp, svd->anon_index, svd->vp, svd->offset,
			    seg->s_size, szc, svd->prot, svd->vpage,
			    svd->cred)) != 0) {
				return (EINVAL);
			}
		}
		segvn_setvnode_mpss(svd->vp);
	}

	if (amp != NULL) {
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (svd->type == MAP_PRIVATE) {
			amp->a_szc = szc;
		} else if (szc > amp->a_szc) {
			amp->a_szc = szc;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
	}

	seg->s_szc = szc;

	return (0);
}
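/*
 * Annotation added for clarity (not from the original source): the
 * IE_RETRY returns above drive a retry loop in the as layer.  A sub-range
 * request first splits off szc-aligned neighbors with segvn_split_seg()
 * and returns IE_RETRY, so as_setpagesize() calls back with a segment
 * whose bounds exactly match the request before s_szc is finally set.
 */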
static int
segvn_clrszc(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon_map *amp = svd->amp;
	size_t pgsz;
	pgcnt_t pages;
	int err = 0;
	caddr_t a = seg->s_base;
	caddr_t ea = a + seg->s_size;
	ulong_t an_idx = svd->anon_index;
	vnode_t *vp = svd->vp;
	struct vpage *vpage = svd->vpage;
	page_t *anon_pl[1 + 1], *pp;
	struct anon *ap, *oldap;
	uint_t prot = svd->prot, vpprot;
	int pageflag = 0;

	ASSERT(AS_WRITE_HELD(seg->s_as) ||
	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
	ASSERT(svd->softlockcnt == 0);

	if (vp == NULL && amp == NULL) {
		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
		seg->s_szc = 0;
		return (0);
	}

	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
		ASSERT(svd->amp == NULL);
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
		    HAT_REGION_TEXT);
		svd->rcookie = HAT_INVALID_REGION_COOKIE;
	} else if (svd->tr_state == SEGVN_TR_ON) {
		ASSERT(svd->amp != NULL);
		segvn_textunrepl(seg, 1);
		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
		amp = NULL;
	} else {
		if (svd->tr_state != SEGVN_TR_OFF) {
			ASSERT(svd->tr_state == SEGVN_TR_INIT);
			svd->tr_state = SEGVN_TR_OFF;
		}

		/*
		 * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
		 * unload argument is 0 when we are freeing the segment
		 * and unload was already done.
		 */
		hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
		    HAT_UNLOAD_UNMAP);
	}

	if (amp == NULL || svd->type == MAP_SHARED) {
		seg->s_szc = 0;
		return (0);
	}

	pgsz = page_get_pagesize(seg->s_szc);
	pages = btop(pgsz);

	/*
	 * XXX anon rwlock is not really needed because this is a
	 * private segment and we are writers.
	 */
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);

	for (; a < ea; a += pgsz, an_idx += pages) {
		if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) {
			ASSERT(vpage != NULL || svd->pageprot == 0);
			if (vpage != NULL) {
				ASSERT(sameprot(seg, a, pgsz));
				prot = VPP_PROT(vpage);
				pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0;
			}
			if (seg->s_szc != 0) {
				ASSERT(vp == NULL || anon_pages(amp->ahp,
				    an_idx, pages) == pages);
				if ((err = anon_map_demotepages(amp, an_idx,
				    seg, a, prot, vpage, svd->cred)) != 0) {
					goto out;
				}
			} else {
				if (oldap->an_refcnt == 1) {
					continue;
				}
				if ((err = anon_getpage(&oldap, &vpprot,
				    anon_pl, PAGESIZE, seg, a, S_READ,
				    svd->cred))) {
					goto out;
				}
				if ((pp = anon_private(&ap, seg, a, prot,
				    anon_pl[0], pageflag, svd->cred)) == NULL) {
					err = ENOMEM;
					goto out;
				}
				anon_decref(oldap);
				(void) anon_set_ptr(amp->ahp, an_idx, ap,
				    ANON_SLEEP);
				page_unlock(pp);
			}
		}
		vpage = (vpage == NULL) ? NULL : vpage + pages;
	}

	amp->a_szc = 0;
	seg->s_szc = 0;
out:
	ANON_LOCK_EXIT(&amp->a_rwlock);
	return (err);
}
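/*
 * Annotation added for clarity (not from the original source): the loop
 * above either demotes a whole large page in place via
 * anon_map_demotepages() (seg->s_szc != 0) or, for anon slots still shared
 * with another process (an_refcnt > 1), substitutes a private PAGESIZE
 * copy obtained with anon_getpage() + anon_private(), so that no low-level
 * page sharing survives the downgrade to s_szc == 0.
 */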
static int
segvn_claim_pages(struct seg *seg, struct vpage *svp, uoff_t off,
    ulong_t anon_idx, uint_t prot)
{
	pgcnt_t	pgcnt = page_get_pagecnt(seg->s_szc);
	size_t ppasize = (pgcnt + 1) * sizeof (page_t *);
	page_t **ppa;
	struct anon *ap;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon_map *amp = svd->amp;
	struct vpage *evp = svp + pgcnt;
	caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT)
	    + seg->s_base;
	page_t *pp;
	struct vnode *vp = svd->vp;
	anoff_t	aoff;
	int anon = (amp != NULL) ? 1 : 0;
	pgcnt_t pg_idx, i;
	int err = 0;

	ASSERT(svd->type == MAP_PRIVATE);
	ASSERT(svd->vpage != NULL);
	ASSERT(seg->s_szc != 0);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt));
	ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT));

	if (VPP_PROT(svp) == prot)
		return (1);
	if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE))
		return (1);

	ppa = kmem_alloc(ppasize, KM_SLEEP);
	if (anon && vp != NULL) {
		if (anon_get_ptr(amp->ahp, anon_idx) == NULL) {
			anon = 0;
			ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt));
		}
		ASSERT(!anon ||
		    anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt);
	}

	for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) {
		if (!VPP_ISPPLOCK(svp))
			continue;
		if (anon) {
			ap = anon_get_ptr(amp->ahp, anon_idx);
			if (ap == NULL) {
				panic("segvn_claim_pages: no anon slot");
			}
			swap_xlate(ap, &vp, &aoff);
			off = (uoff_t)aoff;
		}
		ASSERT(vp != NULL);
		if ((pp = page_lookup(&vp->v_object, (uoff_t)off, SE_SHARED)) == NULL) {
			panic("segvn_claim_pages: no page");
		}
		ppa[pg_idx++] = pp;
		off += PAGESIZE;
	}

	if (ppa[0] == NULL) {
		kmem_free(ppa, ppasize);
		return (1);
	}

	ASSERT(pg_idx <= pgcnt);
	ppa[pg_idx] = NULL;

	/* Find each large page within ppa, and adjust its claim */

	/* Does ppa cover a single large page? */
	if (ppa[0]->p_szc == seg->s_szc) {
		if (prot & PROT_WRITE)
			err = page_addclaim_pages(ppa);
		else
			err = page_subclaim_pages(ppa);
	} else {
		for (i = 0; ppa[i]; i += pgcnt) {
			ASSERT(IS_P2ALIGNED(page_pptonum(ppa[i]), pgcnt));
			if (prot & PROT_WRITE)
				err = page_addclaim_pages(&ppa[i]);
			else
				err = page_subclaim_pages(&ppa[i]);
			if (err == 0)
				break;
		}
	}

	for (i = 0; i < pg_idx; i++) {
		ASSERT(ppa[i] != NULL);
		page_unlock(ppa[i]);
	}

	kmem_free(ppa, ppasize);
	return (err);
}
/*
 * Returns right (upper address) segment if split occurred.
 * If the address is equal to the beginning or end of its segment it returns
 * the current segment.
 */
static struct seg *
segvn_split_seg(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct seg *nseg;
	size_t nsize;
	struct segvn_data *nsvd;

	ASSERT(AS_WRITE_HELD(seg->s_as));
	ASSERT(svd->tr_state == SEGVN_TR_OFF);

	ASSERT(addr >= seg->s_base);
	ASSERT(addr <= seg->s_base + seg->s_size);
	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);

	if (addr == seg->s_base || addr == seg->s_base + seg->s_size)
		return (seg);

	nsize = seg->s_base + seg->s_size - addr;
	seg->s_size = addr - seg->s_base;
	nseg = seg_alloc(seg->s_as, addr, nsize);
	ASSERT(nseg != NULL);
	nseg->s_ops = seg->s_ops;
	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
	nseg->s_data = (void *)nsvd;
	nseg->s_szc = seg->s_szc;
	*nsvd = *svd;
	ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
	nsvd->seg = nseg;
	rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL);

	if (nsvd->vp != NULL) {
		VN_HOLD(nsvd->vp);
		nsvd->offset = svd->offset +
		    (uintptr_t)(nseg->s_base - seg->s_base);
		if (nsvd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, nsvd->vp);
	} else {
		/*
		 * The offset for an anonymous segment has no significance in
		 * terms of an offset into a file. If we were to use the above
		 * calculation instead, the structures read out of
		 * /proc/<pid>/xmap would be more difficult to decipher since
		 * it would be unclear whether two seemingly contiguous
		 * prxmap_t structures represented different segments or a
		 * single segment that had been split up into multiple prxmap_t
		 * structures (e.g. if some part of the segment had not yet
		 * been faulted in).
		 */
		nsvd->offset = 0;
	}

	ASSERT(svd->softlockcnt == 0);
	ASSERT(svd->softlockcnt_sbase == 0);
	ASSERT(svd->softlockcnt_send == 0);
	crhold(svd->cred);

	if (svd->vpage != NULL) {
		size_t bytes = vpgtob(seg_pages(seg));
		size_t nbytes = vpgtob(seg_pages(nseg));
		struct vpage *ovpage = svd->vpage;

		svd->vpage = kmem_alloc(bytes, KM_SLEEP);
		bcopy(ovpage, svd->vpage, bytes);
		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
		bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes);
		kmem_free(ovpage, bytes + nbytes);
	}
	if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
		struct anon_map *oamp = svd->amp, *namp;
		struct anon_hdr *nahp;

		ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER);
		ASSERT(oamp->refcnt == 1);
		nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
		(void) anon_copy_ptr(oamp->ahp, svd->anon_index,
		    nahp, 0, btop(seg->s_size), ANON_SLEEP);

		namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
		namp->a_szc = nseg->s_szc;
		(void) anon_copy_ptr(oamp->ahp,
		    svd->anon_index + btop(seg->s_size),
		    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
		anon_release(oamp->ahp, btop(oamp->size));
		oamp->ahp = nahp;
		oamp->size = seg->s_size;
		svd->anon_index = 0;
		nsvd->amp = namp;
		nsvd->anon_index = 0;
		ANON_LOCK_EXIT(&oamp->a_rwlock);
	} else if (svd->amp != NULL) {
		pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
		ASSERT(svd->amp == nsvd->amp);
		ASSERT(seg->s_szc <= svd->amp->a_szc);
		nsvd->anon_index = svd->anon_index + seg_pages(seg);
		ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt));
		ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER);
		svd->amp->refcnt++;
		ANON_LOCK_EXIT(&svd->amp->a_rwlock);
	}

	/*
	 * Split the amount of swap reserved.
	 */
	if (svd->swresv) {
		/*
		 * For MAP_NORESERVE, only allocate swap reserve for pages
		 * being used. Other segments get enough to cover whole
		 * segment.
		 */
		if (svd->flags & MAP_NORESERVE) {
			size_t	oswresv;

			ASSERT(svd->amp);
			oswresv = svd->swresv;
			svd->swresv = ptob(anon_pages(svd->amp->ahp,
			    svd->anon_index, btop(seg->s_size)));
			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
			    nsvd->anon_index, btop(nseg->s_size)));
			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
		} else {
			if (svd->pageswap) {
				svd->swresv = segvn_count_swap_by_vpages(seg);
				ASSERT(nsvd->swresv >= svd->swresv);
				nsvd->swresv -= svd->swresv;
			} else {
				ASSERT(svd->swresv == seg->s_size +
				    nseg->s_size);
				svd->swresv = seg->s_size;
				nsvd->swresv = nseg->s_size;
			}
		}
	}

	return (nseg);
}
/*
 * called on memory operations (unmap, setprot, setpagesize) for a subset
 * of a large page segment to either demote the memory range (SDR_RANGE)
 * or the ends (SDR_END) by addr/len.
 *
 * returns 0 on success. returns errno, including ENOMEM, on failure.
 */
static int
segvn_demote_range(struct seg *seg, caddr_t addr, size_t len, int flag,
    uint_t szcvec)
{
	caddr_t eaddr = addr + len;
	caddr_t lpgaddr, lpgeaddr;
	struct seg *nseg;
	struct seg *badseg1 = NULL;
	struct seg *badseg2 = NULL;
	size_t pgsz;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	int err;
	uint_t szc = seg->s_szc;
	uint_t tszcvec;

	ASSERT(AS_WRITE_HELD(seg->s_as));
	ASSERT(svd->tr_state == SEGVN_TR_OFF);
	ASSERT(szc != 0);
	pgsz = page_get_pagesize(szc);
	ASSERT(seg->s_base != addr || seg->s_size != len);
	ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
	ASSERT(svd->softlockcnt == 0);
	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
	ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED));

	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
	ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr);
	if (flag == SDR_RANGE) {
		/* demote entire range */
		badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
		(void) segvn_split_seg(nseg, lpgeaddr);
		ASSERT(badseg1->s_base == lpgaddr);
		ASSERT(badseg1->s_size == lpgeaddr - lpgaddr);
	} else if (addr != lpgaddr) {
		ASSERT(flag == SDR_END);
		badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
		if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz &&
		    eaddr < lpgaddr + 2 * pgsz) {
			(void) segvn_split_seg(nseg, lpgeaddr);
			ASSERT(badseg1->s_base == lpgaddr);
			ASSERT(badseg1->s_size == 2 * pgsz);
		} else {
			nseg = segvn_split_seg(nseg, lpgaddr + pgsz);
			ASSERT(badseg1->s_base == lpgaddr);
			ASSERT(badseg1->s_size == pgsz);
			if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) {
				ASSERT(lpgeaddr - lpgaddr > 2 * pgsz);
				nseg = segvn_split_seg(nseg, lpgeaddr - pgsz);
				badseg2 = nseg;
				(void) segvn_split_seg(nseg, lpgeaddr);
				ASSERT(badseg2->s_base == lpgeaddr - pgsz);
				ASSERT(badseg2->s_size == pgsz);
			}
		}
	} else {
		ASSERT(flag == SDR_END);
		ASSERT(eaddr < lpgeaddr);
		badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz);
		(void) segvn_split_seg(nseg, lpgeaddr);
		ASSERT(badseg1->s_base == lpgeaddr - pgsz);
		ASSERT(badseg1->s_size == pgsz);
	}

	ASSERT(badseg1 != NULL);
	ASSERT(badseg1->s_szc == szc);
	ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz ||
	    badseg1->s_size == 2 * pgsz);
	ASSERT(sameprot(badseg1, badseg1->s_base, pgsz));
	ASSERT(badseg1->s_size == pgsz ||
	    sameprot(badseg1, badseg1->s_base + pgsz, pgsz));
	if (err = segvn_clrszc(badseg1)) {
		return (err);
	}
	ASSERT(badseg1->s_szc == 0);

	if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
		uint_t tszc = highbit(tszcvec) - 1;
		caddr_t ta = MAX(addr, badseg1->s_base);
		caddr_t te;
		size_t tpgsz = page_get_pagesize(tszc);

		ASSERT(svd->type == MAP_SHARED);
		ASSERT(flag == SDR_END);
		ASSERT(tszc < szc && tszc > 0);

		if (eaddr > badseg1->s_base + badseg1->s_size) {
			te = badseg1->s_base + badseg1->s_size;
		} else {
			te = eaddr;
		}

		badseg1->s_szc = tszc;
		if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) {
			if (badseg2 != NULL) {
				err = segvn_demote_range(badseg1, ta, te - ta,
				    SDR_END, tszcvec);
				if (err != 0) {
					return (err);
				}
			} else {
				return (segvn_demote_range(badseg1, ta,
				    te - ta, SDR_END, tszcvec));
			}
		}
	}

	if (badseg2 == NULL)
		return (0);
	ASSERT(badseg2->s_szc == szc);
	ASSERT(badseg2->s_size == pgsz);
	ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size));
	if (err = segvn_clrszc(badseg2)) {
		return (err);
	}
	ASSERT(badseg2->s_szc == 0);

	if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
		uint_t tszc = highbit(tszcvec) - 1;
		size_t tpgsz = page_get_pagesize(tszc);

		ASSERT(svd->type == MAP_SHARED);
		ASSERT(flag == SDR_END);
		ASSERT(tszc < szc && tszc > 0);
		ASSERT(badseg2->s_base > addr);
		ASSERT(eaddr > badseg2->s_base);
		ASSERT(eaddr < badseg2->s_base + badseg2->s_size);

		badseg2->s_szc = tszc;
		if (!IS_P2ALIGNED(eaddr, tpgsz)) {
			return (segvn_demote_range(badseg2, badseg2->s_base,
			    eaddr - badseg2->s_base, SDR_END, tszcvec));
		}
	}

	return (0);
}
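/*
 * Annotation added for clarity (not from the original source): for
 * SDR_END on a segment of hypothetical 4M pages, unmapping
 * [base + 1M, base + 9M) isolates badseg1 = [base, base + 4M) and
 * badseg2 = [base + 8M, base + 12M), demotes just those two with
 * segvn_clrszc(), and leaves the fully covered middle large page alone,
 * since an aligned, fully spanned page needs no demotion.
 */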
static int
segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vp, *evp;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
	/*
	 * If segment protection can be used, simply check against them.
	 */
	if (svd->pageprot == 0) {
		int err;

		err = ((svd->prot & prot) != prot) ? EACCES : 0;
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (err);
	}

	/*
	 * Have to check down to the vpage level.
	 */
	evp = &svd->vpage[seg_page(seg, addr + len)];
	for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) {
		if ((VPP_PROT(vp) & prot) != prot) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (EACCES);
		}
	}
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	return (0);
}
static int
segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	if (pgno != 0) {
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
		if (svd->pageprot == 0) {
			do {
				protv[--pgno] = svd->prot;
			} while (pgno != 0);
		} else {
			size_t pgoff = seg_page(seg, addr);

			do {
				pgno--;
				protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]);
			} while (pgno != 0);
		}
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	}
	return (0);
}
static uoff_t
segvn_getoffset(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	return (svd->offset + (uintptr_t)(addr - seg->s_base));
}
static int
segvn_gettype(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT |
	    MAP_INITDATA)));
}
static int
segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	*vpp = svd->vp;
	return (0);
}
/*
 * Check to see if it makes sense to do kluster/read ahead to
 * addr + delta relative to the mapping at addr.  We assume here
 * that delta is a signed PAGESIZE'd multiple (which can be negative).
 *
 * For segvn, we currently "approve" of the action if we are
 * still in the segment and it maps from the same vp/off,
 * or if the advice stored in segvn_data or vpages allows it.
 * Currently, klustering is disallowed only if MADV_RANDOM advice is set.
 */
static int
segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon *oap, *ap;
	ssize_t pd;
	size_t page;
	struct vnode *vp1, *vp2;
	uoff_t off1, off2;
	struct anon_map *amp;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
	ASSERT(AS_WRITE_HELD(seg->s_as) ||
	    SEGVN_LOCK_HELD(seg->s_as, &svd->lock));

	if (addr + delta < seg->s_base ||
	    addr + delta >= (seg->s_base + seg->s_size))
		return (-1);		/* exceeded segment bounds */

	pd = delta / (ssize_t)PAGESIZE;	/* divide to preserve sign bit */
	page = seg_page(seg, addr);

	/*
	 * Check to see if either of the pages addr or addr + delta
	 * have advice set that prevents klustering (if MADV_RANDOM advice
	 * is set for entire segment, or MADV_SEQUENTIAL is set and delta
	 * is negative).
	 */
	if (svd->advice == MADV_RANDOM ||
	    svd->advice == MADV_SEQUENTIAL && delta < 0)
		return (-1);
	else if (svd->pageadvice && svd->vpage) {
		struct vpage *bvpp, *evpp;

		bvpp = &svd->vpage[page];
		evpp = &svd->vpage[page + pd];
		if (VPP_ADVICE(bvpp) == MADV_RANDOM ||
		    VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0)
			return (-1);
		if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) &&
		    VPP_ADVICE(evpp) == MADV_RANDOM)
			return (-1);
	}

	if (svd->type == MAP_SHARED)
		return (0);		/* shared mapping - all ok */

	if ((amp = svd->amp) == NULL)
		return (0);		/* off original vnode */

	page += svd->anon_index;

	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);

	oap = anon_get_ptr(amp->ahp, page);
	ap = anon_get_ptr(amp->ahp, page + pd);

	ANON_LOCK_EXIT(&amp->a_rwlock);

	if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) {
		return (-1);		/* one with and one without an anon */
	}

	if (oap == NULL) {		/* implies that ap == NULL */
		return (0);		/* off original vnode */
	}

	/*
	 * Now we know we have two anon pointers - check to
	 * see if they happen to be properly allocated.
	 */

	/*
	 * XXX We cheat here and don't lock the anon slots. We can't because
	 * we may have been called from the anon layer which might already
	 * have locked them. We are holding a refcnt on the slots so they
	 * can't disappear. The worst that will happen is we'll get the wrong
	 * names (vp, off) for the slots and make a poor klustering decision.
	 */
	swap_xlate(ap, &vp1, &off1);
	swap_xlate(oap, &vp2, &off2);

	if (!fop_cmp(vp1, vp2, NULL) || off1 - off2 != delta)
		return (-1);

	return (0);
}
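
/*
 * Illustrative sketch (not part of this file's logic): a pager deciding
 * whether to extend an I/O to the next page might consult the kluster
 * entry point through the segment ops vector.  The wrapper name and the
 * caller below are hypothetical; only the PAGESIZE'd delta and the 0/-1
 * return convention come from segvn_kluster() above.
 *
 *	if (segop_kluster(seg, addr, PAGESIZE) == 0) {
 *		// next page maps the same vp/off and advice permits
 *		// klustering: fold it into the current read-ahead I/O
 *	} else {
 *		// fault in the single page only
 *	}
 */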
/*
 * Synchronize primary storage cache with real object in virtual memory.
 *
 * XXX - Anonymous pages should not be sync'ed out at all.
 */
static int
segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vpp;
	page_t *pp;
	uoff_t offset;
	struct vnode *vp;
	uoff_t off;
	caddr_t eaddr;
	int bflags;
	int err = 0;
	int segtype;
	int pageprot;
	int prot;
	ulong_t anon_index;
	struct anon_map *amp;
	struct anon *ap;
	anon_sync_obj_t cookie;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);

	if (svd->softlockcnt > 0) {
		/*
		 * If this is a shared segment, a non-zero softlockcnt
		 * means locked pages are still in use.
		 */
		if (svd->type == MAP_SHARED) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (EAGAIN);
		}

		/*
		 * flush all pages from seg cache
		 * otherwise we may deadlock in swap_putpage
		 * for B_INVAL page (4175402).
		 *
		 * Even if we grab segvn WRITER's lock
		 * here, there might be another thread which could've
		 * successfully performed lookup/insert just before
		 * we acquired the lock here.  So, grabbing either
		 * lock here is not of much use.  Until we devise
		 * a strategy at upper layers to solve the
		 * synchronization issues completely, we expect
		 * applications to handle this appropriately.
		 */
		segvn_purge(seg);
		if (svd->softlockcnt > 0) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (EAGAIN);
		}
	} else if (svd->type == MAP_SHARED && svd->amp != NULL &&
	    svd->amp->a_softlockcnt > 0) {
		/*
		 * Try to purge this amp's entries from pcache. It will
		 * succeed only if other segments that share the amp have no
		 * outstanding softlock's.
		 */
		segvn_purge(seg);
		if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (EAGAIN);
		}
	}

	vpp = svd->vpage;
	offset = svd->offset + (uintptr_t)(addr - seg->s_base);
	bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
	    ((flags & MS_INVALIDATE) ? B_INVAL : 0);

	if (attr) {
		pageprot = attr & ~(SHARED|PRIVATE);
		segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE;

		/*
		 * We are done if the segment types don't match
		 * or if we have segment level protections and
		 * they don't match.
		 */
		if (svd->type != segtype) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (0);
		}
		if (vpp == NULL) {
			if (svd->prot != pageprot) {
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
				return (0);
			}
			prot = svd->prot;
		} else
			vpp = &svd->vpage[seg_page(seg, addr)];

	} else if (svd->vp && svd->amp == NULL &&
	    (flags & MS_INVALIDATE) == 0) {

		/*
		 * No attributes, no anonymous pages and MS_INVALIDATE flag
		 * is not on, just use one big request.
		 */
		err = fop_putpage(svd->vp, (offset_t)offset, len,
		    bflags, svd->cred, NULL);
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (err);
	}

	if ((amp = svd->amp) != NULL)
		anon_index = svd->anon_index + seg_page(seg, addr);

	for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) {
		ap = NULL;
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index++);
			if (ap != NULL) {
				swap_xlate(ap, &vp, &off);
			} else {
				vp = svd->vp;
				off = offset;
			}
			anon_array_exit(&cookie);
			ANON_LOCK_EXIT(&amp->a_rwlock);
		} else {
			vp = svd->vp;
			off = offset;
		}
		offset += PAGESIZE;

		if (vp == NULL)		/* untouched zfod page */
			continue;

		if (attr) {
			if (vpp) {
				prot = VPP_PROT(vpp);
				vpp++;
			}
			if (prot != pageprot) {
				continue;
			}
		}

		/*
		 * See if any of these pages are locked --  if so, then we
		 * will have to truncate an invalidate request at the first
		 * locked one. We don't need the page_struct_lock to test
		 * as this is only advisory; even if we acquire it someone
		 * might race in and lock the page after we unlock and before
		 * we do the PUTPAGE, then PUTPAGE simply does nothing.
		 */
		if (flags & MS_INVALIDATE) {
			if ((pp = page_lookup(&vp->v_object, off,
			    SE_SHARED)) != NULL) {
				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
					page_unlock(pp);
					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
					return (EBUSY);
				}
				if (ap != NULL && pp->p_szc != 0 &&
				    page_tryupgrade(pp)) {
					if (pp->p_lckcnt == 0 &&
					    pp->p_cowcnt == 0) {
						/*
						 * swapfs VN_DISPOSE() won't
						 * invalidate large pages.
						 * Attempt to demote.
						 * XXX can't help it if it
						 * fails. But for swapfs
						 * pages it is no big deal.
						 */
						(void) page_try_demote_pages(
						    pp);
					}
				}
				page_unlock(pp);
			}
		} else if (svd->type == MAP_SHARED && amp != NULL) {
			/*
			 * Avoid writing out to disk ISM's large pages
			 * because segspt_free_pages() relies on NULL an_pvp
			 * of anon slots of such pages.
			 */

			ASSERT(svd->vp == NULL);
			/*
			 * swapfs uses page_lookup_nowait if not freeing or
			 * invalidating and skips a page if
			 * page_lookup_nowait returns NULL.
			 */
			pp = page_lookup_nowait(&vp->v_object, off, SE_SHARED);
			if (pp == NULL) {
				continue;
			}
			if (pp->p_szc != 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Note ISM pages are created large so (vp, off)'s
			 * page cannot suddenly become large after we unlock
			 * pp.
			 */
			page_unlock(pp);
		}
		/*
		 * XXX - Should ultimately try to kluster
		 * calls to fop_putpage() for performance.
		 */
		VN_HOLD(vp);
		err = fop_putpage(vp, (offset_t)off, PAGESIZE,
		    (bflags | (IS_SWAPFSVP(vp) ? B_PAGE_NOWAIT : 0)),
		    svd->cred, NULL);
		VN_RELE(vp);
		if (err)
			break;
	}
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	return (err);
}
/*
 * Determine if we have data corresponding to pages in the
 * primary storage virtual memory cache (i.e., "in core").
 */
static size_t
segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vnode *vp, *avp;
	uoff_t offset, aoffset;
	size_t p, ep;
	int ret;
	struct vpage *vpp;
	page_t *pp;
	uint_t start;
	struct anon_map *amp;		/* XXX - for locknest */
	struct anon *ap;
	uint_t attr;
	anon_sync_obj_t cookie;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
	if (svd->amp == NULL && svd->vp == NULL) {
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		bzero(vec, btopr(len));
		return (len);	/* no anonymous pages created yet */
	}

	p = seg_page(seg, addr);
	ep = seg_page(seg, addr + len);
	start = svd->vp ? SEG_PAGE_VNODEBACKED : 0;

	amp = svd->amp;
	for (; p < ep; p++, addr += PAGESIZE) {
		vpp = (svd->vpage) ? &svd->vpage[p]: NULL;
		ret = start;
		ap = NULL;
		avp = NULL;
		/* Grab the vnode/offset for the anon slot */
		if (amp != NULL) {
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			anon_array_enter(amp, svd->anon_index + p, &cookie);
			ap = anon_get_ptr(amp->ahp, svd->anon_index + p);
			if (ap != NULL) {
				swap_xlate(ap, &avp, &aoffset);
			}
			anon_array_exit(&cookie);
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}
		if ((avp != NULL) && page_exists(&avp->v_object, aoffset)) {
			/* A page exists for the anon slot */
			ret |= SEG_PAGE_INCORE;

			/*
			 * If page is mapped and writable
			 */
			attr = (uint_t)0;
			if ((hat_getattr(seg->s_as->a_hat, addr,
			    &attr) != -1) && (attr & PROT_WRITE)) {
				ret |= SEG_PAGE_ANON;
			}
			/*
			 * Don't get page_struct lock for lckcnt and cowcnt,
			 * since this is purely advisory.
			 */
			if ((pp = page_lookup_nowait(&avp->v_object, aoffset,
			    SE_SHARED)) != NULL) {
				if (pp->p_lckcnt)
					ret |= SEG_PAGE_SOFTLOCK;
				if (pp->p_cowcnt)
					ret |= SEG_PAGE_HASCOW;
				page_unlock(pp);
			}
		}

		/* Gather vnode statistics */
		vp = svd->vp;
		offset = svd->offset + (uintptr_t)(addr - seg->s_base);

		if (vp != NULL) {
			/*
			 * Try to obtain a "shared" lock on the page
			 * without blocking.  If this fails, determine
			 * if the page is in memory.
			 */
			pp = page_lookup_nowait(&vp->v_object, offset,
			    SE_SHARED);
			if ((pp == NULL) &&
			    (page_exists(&vp->v_object, offset))) {
				/* Page is incore, and is named */
				ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
			}
			/*
			 * Don't get page_struct lock for lckcnt and cowcnt,
			 * since this is purely advisory.
			 */
			if (pp != NULL) {
				ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
				if (pp->p_lckcnt)
					ret |= SEG_PAGE_SOFTLOCK;
				if (pp->p_cowcnt)
					ret |= SEG_PAGE_HASCOW;
				page_unlock(pp);
			}
		}

		/* Gather virtual page information */
		if (vpp) {
			if (VPP_ISPPLOCK(vpp))
				ret |= SEG_PAGE_LOCKED;
		}

		*vec++ = (char)ret;
	}
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	return (len);
}
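
/*
 * Hedged userland sketch of the consumer side: mincore(2) ultimately
 * reduces the SEG_PAGE_* flags assembled above to an "in core" bit per
 * page.  NPAGES and base below are hypothetical:
 *
 *	#include <sys/mman.h>
 *
 *	char vec[NPAGES];
 *	if (mincore(base, NPAGES * PAGESIZE, vec) == 0 && (vec[0] & 1))
 *		// first page is resident
 */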
/*
 * Statement for p_cowcnts/p_lckcnts.
 *
 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region
 * irrespective of the following factors or anything else:
 *
 *	(1) anon slots are populated or not
 *	(2) cow is broken or not
 *	(3) refcnt on ap is 1 or greater than 1
 *
 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock
 * and munlock.
 *
 *
 * Handling p_cowcnts/p_lckcnts during copy-on-write fault:
 *
 *	if vpage has PROT_WRITE
 *		transfer cowcnt on the oldpage -> cowcnt on the newpage
 *	else
 *		transfer lckcnt on the oldpage -> lckcnt on the newpage
 *
 *	During copy-on-write, decrement p_cowcnt on the oldpage and increment
 *	p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE.
 *
 *	We may also break COW if softlocking on read access in the physio case.
 *	In this case, vpage may not have PROT_WRITE. So, we need to decrement
 *	p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the
 *	vpage doesn't have PROT_WRITE.
 *
 *
 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region:
 *
 *	If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and
 *	increment p_lckcnt by calling page_subclaim() which takes care of
 *	availrmem accounting and p_lckcnt overflow.
 *
 *	If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and
 *	increment p_cowcnt by calling page_addclaim() which takes care of
 *	availrmem availability and p_cowcnt overflow.
 */
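
/*
 * Worked example of the rules above (a sketch, not code from this file):
 * one mlock'ed MAP_PRIVATE page moving through mprotect() transitions.
 *
 *	mlock() while vpage has PROT_WRITE:	p_cowcnt 0 -> 1, p_lckcnt 0
 *	mprotect() removes PROT_WRITE:		page_subclaim():
 *						p_cowcnt 1 -> 0, p_lckcnt 0 -> 1
 *	mprotect() restores PROT_WRITE:		page_addclaim():
 *						p_lckcnt 1 -> 0, p_cowcnt 0 -> 1
 */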
/*
 * Lock down (or unlock) pages mapped by this segment.
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
 */
static int
segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
    int attr, int op, ulong_t *lockmap, size_t pos)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vpp;
	struct vpage *evp;
	page_t *pp;
	uoff_t offset;
	uoff_t off;
	int segtype;
	int pageprot;
	int claim;
	struct vnode *vp;
	ulong_t anon_index;
	struct anon_map *amp;
	struct anon *ap;
	struct vattr va;
	anon_sync_obj_t cookie;
	struct kshmid *sp = NULL;
	struct proc *p = curproc;
	kproject_t *proj = NULL;
	int chargeproc = 1;
	size_t locked_bytes = 0;
	size_t unlocked_bytes = 0;
	int err = 0;

	/*
	 * Hold write lock on address space because we may split or
	 * concatenate segments
	 */
	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	/*
	 * If this is a shm, use shm's project and zone, else use
	 * project and zone of calling process
	 */

	/* Determine if this segment backs a sysV shm */
	if (svd->amp != NULL && svd->amp->a_sp != NULL) {
		ASSERT(svd->type == MAP_SHARED);
		ASSERT(svd->tr_state == SEGVN_TR_OFF);
		sp = svd->amp->a_sp;
		proj = sp->shm_perm.ipc_proj;
		chargeproc = 0;
	}

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
	if (attr) {
		pageprot = attr & ~(SHARED|PRIVATE);
		segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE;

		/*
		 * We are done if the segment types don't match
		 * or if we have segment level protections and
		 * they don't match.
		 */
		if (svd->type != segtype) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (0);
		}
		if (svd->pageprot == 0 && svd->prot != pageprot) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (0);
		}
	}

	if (op == MC_LOCK) {
		if (svd->tr_state == SEGVN_TR_INIT) {
			svd->tr_state = SEGVN_TR_OFF;
		} else if (svd->tr_state == SEGVN_TR_ON) {
			ASSERT(svd->amp != NULL);
			segvn_textunrepl(seg, 0);
			ASSERT(svd->amp == NULL &&
			    svd->tr_state == SEGVN_TR_OFF);
		}
	}

	/*
	 * If we're locking, then we must create a vpage structure if
	 * none exists.  If we're unlocking, then check to see if there
	 * is a vpage --  if not, then we could not have locked anything.
	 */

	if ((vpp = svd->vpage) == NULL) {
		if (op == MC_LOCK) {
			segvn_vpage(seg);
			if (svd->vpage == NULL) {
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
				return (ENOMEM);
			}
		} else {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (0);
		}
	}

	/*
	 * The anonymous data vector (i.e., previously
	 * unreferenced mapping to swap space) can be allocated
	 * by lazily testing for its existence.
	 */
	if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) {
		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
		svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
		svd->amp->a_szc = seg->s_szc;
	}

	if ((amp = svd->amp) != NULL) {
		anon_index = svd->anon_index + seg_page(seg, addr);
	}

	offset = svd->offset + (uintptr_t)(addr - seg->s_base);
	evp = &svd->vpage[seg_page(seg, addr + len)];

	if (sp != NULL)
		mutex_enter(&sp->shm_mlock);

	/* determine number of unlocked bytes in range for lock operation */
	if (op == MC_LOCK) {

		if (sp == NULL) {
			for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
			    vpp++) {
				if (!VPP_ISPPLOCK(vpp))
					unlocked_bytes += PAGESIZE;
			}
		} else {
			ulong_t		i_idx, i_edx;
			anon_sync_obj_t	i_cookie;
			struct anon	*i_ap;
			struct vnode	*i_vp;
			uoff_t		i_off;

			/* Only count sysV pages once for locked memory */
			i_edx = svd->anon_index + seg_page(seg, addr + len);
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			for (i_idx = anon_index; i_idx < i_edx; i_idx++) {
				anon_array_enter(amp, i_idx, &i_cookie);
				i_ap = anon_get_ptr(amp->ahp, i_idx);
				if (i_ap == NULL) {
					unlocked_bytes += PAGESIZE;
					anon_array_exit(&i_cookie);
					continue;
				}
				swap_xlate(i_ap, &i_vp, &i_off);
				anon_array_exit(&i_cookie);
				pp = page_lookup(&i_vp->v_object, i_off,
				    SE_SHARED);
				if (pp == NULL) {
					unlocked_bytes += PAGESIZE;
					continue;
				} else if (pp->p_lckcnt == 0)
					unlocked_bytes += PAGESIZE;
				page_unlock(pp);
			}
			ANON_LOCK_EXIT(&amp->a_rwlock);
		}

		mutex_enter(&p->p_lock);
		err = rctl_incr_locked_mem(p, proj, unlocked_bytes,
		    chargeproc);
		mutex_exit(&p->p_lock);

		if (err) {
			if (sp != NULL)
				mutex_exit(&sp->shm_mlock);
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (err);
		}
	}
	/*
	 * Loop over all pages in the range.  Process if we're locking and
	 * page has not already been locked in this mapping; or if we're
	 * unlocking and the page has been locked.
	 */
	for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
	    vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) {
		if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
		    ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) ||
		    (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) {

			if (amp != NULL)
				ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			/*
			 * If this isn't a MAP_NORESERVE segment and
			 * we're locking, allocate anon slots if they
			 * don't exist.  The page is brought in later on.
			 */
			if (op == MC_LOCK && svd->vp == NULL &&
			    ((svd->flags & MAP_NORESERVE) == 0) &&
			    amp != NULL &&
			    ((ap = anon_get_ptr(amp->ahp, anon_index))
			    == NULL)) {
				anon_array_enter(amp, anon_index, &cookie);

				if ((ap = anon_get_ptr(amp->ahp,
				    anon_index)) == NULL) {
					pp = anon_zero(seg, addr, &ap,
					    svd->cred);
					if (pp == NULL) {
						anon_array_exit(&cookie);
						ANON_LOCK_EXIT(&amp->a_rwlock);
						err = ENOMEM;
						goto out;
					}
					ASSERT(anon_get_ptr(amp->ahp,
					    anon_index) == NULL);
					(void) anon_set_ptr(amp->ahp,
					    anon_index, ap, ANON_SLEEP);
					page_unlock(pp);
				}
				anon_array_exit(&cookie);
			}

			/*
			 * Get name for page, accounting for
			 * existence of private copy.
			 */
			ap = NULL;
			if (amp != NULL) {
				anon_array_enter(amp, anon_index, &cookie);
				ap = anon_get_ptr(amp->ahp, anon_index);
				if (ap != NULL) {
					swap_xlate(ap, &vp, &off);
				} else {
					if (svd->vp == NULL &&
					    (svd->flags & MAP_NORESERVE)) {
						anon_array_exit(&cookie);
						ANON_LOCK_EXIT(&amp->a_rwlock);
						continue;
					}
					vp = svd->vp;
					off = offset;
				}
				if (op != MC_LOCK || ap == NULL) {
					anon_array_exit(&cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
				}
			} else {
				vp = svd->vp;
				off = offset;
			}

			/*
			 * Get page frame.  It's ok if the page is
			 * not available when we're unlocking, as this
			 * may simply mean that a page we locked got
			 * truncated out of existence after we locked it.
			 *
			 * Invoke fop_getpage() to obtain the page struct
			 * since we may need to read it from disk if its
			 * protection is not present.
			 */
			if (op != MC_LOCK)
				pp = page_lookup(&vp->v_object, off,
				    SE_SHARED);
			else {
				page_t *pl[1 + 1];
				int error;

				ASSERT(vp != NULL);

				error = fop_getpage(vp, (offset_t)off, PAGESIZE,
				    (uint_t *)NULL, pl, PAGESIZE, seg, addr,
				    S_OTHER, svd->cred, NULL);

				if (error && ap != NULL) {
					anon_array_exit(&cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
				}

				/*
				 * If the error is EDEADLK then we must bounce
				 * up and drop all vm subsystem locks and then
				 * retry the operation later
				 * This behavior is a temporary measure because
				 * ufs/sds logging is badly designed and will
				 * deadlock if we don't allow this bounce to
				 * happen.  The real solution is to re-design
				 * the logging code to work properly.  See bug
				 * 4125102 for details of the problem.
				 */
				if (error == EDEADLK) {
					err = error;
					goto out;
				}
				/*
				 * Quit if we fail to fault in the page.  Treat
				 * the failure as an error, unless the addr
				 * is mapped beyond the end of a file.
				 */
				if (error && svd->vp) {
					va.va_mask = VATTR_SIZE;
					if (fop_getattr(svd->vp, &va, 0,
					    svd->cred, NULL) != 0) {
						err = EIO;
						goto out;
					}
					if (btopr(va.va_size) >=
					    btopr(off + 1)) {
						err = EIO;
						goto out;
					}
					goto out;
				} else if (error) {
					err = EIO;
					goto out;
				}
				pp = pl[0];
				ASSERT(pp != NULL);
			}

			/*
			 * See Statement at the beginning of this routine.
			 *
			 * claim is always set if MAP_PRIVATE and PROT_WRITE
			 * irrespective of following factors:
			 *
			 *	(1) anon slots are populated or not
			 *	(2) cow is broken or not
			 *	(3) refcnt on ap is 1 or greater than 1
			 *
			 * See 4140683 for details
			 */
			claim = ((VPP_PROT(vpp) & PROT_WRITE) &&
			    (svd->type == MAP_PRIVATE));

			/*
			 * Perform page-level operation appropriate to
			 * operation.  If locking, undo the SOFTLOCK
			 * performed to bring the page into memory
			 * after setting the lock.  If unlocking,
			 * and no page was found, account for the claim
			 * separately.
			 */
			if (op == MC_LOCK) {
				int ret = 1;	/* Assume success */

				ASSERT(!VPP_ISPPLOCK(vpp));

				ret = page_pp_lock(pp, claim, 0);
				if (ret != 0 && ap != NULL) {
					if (ap->an_pvp != NULL) {
						anon_swap_free(ap, pp);
					}
					anon_array_exit(&cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
				}
				if (ret == 0) {
					/* locking page failed */
					page_unlock(pp);
					err = EAGAIN;
					goto out;
				}
				VPP_SETPPLOCK(vpp);
				if (sp != NULL) {
					if (pp->p_lckcnt == 1)
						locked_bytes += PAGESIZE;
				} else
					locked_bytes += PAGESIZE;

				if (lockmap != NULL)
					BT_SET(lockmap, pos);

				page_unlock(pp);
			} else {
				ASSERT(VPP_ISPPLOCK(vpp));
				if (pp != NULL) {
					/* sysV pages should be locked */
					ASSERT(sp == NULL || pp->p_lckcnt > 0);
					page_pp_unlock(pp, claim, 0);
					if (sp != NULL) {
						if (pp->p_lckcnt == 0)
							unlocked_bytes +=
							    PAGESIZE;
					} else
						unlocked_bytes += PAGESIZE;
					page_unlock(pp);
				} else {
					ASSERT(sp == NULL);
					unlocked_bytes += PAGESIZE;
				}
				VPP_CLRPPLOCK(vpp);
			}
		}
	}
out:
	if (op == MC_LOCK) {
		/* Credit back bytes that did not get locked */
		if ((unlocked_bytes - locked_bytes) > 0) {
			if (proj == NULL)
				mutex_enter(&p->p_lock);
			rctl_decr_locked_mem(p, proj,
			    (unlocked_bytes - locked_bytes), chargeproc);
			if (proj == NULL)
				mutex_exit(&p->p_lock);
		}

	} else {
		/* Account bytes that were unlocked */
		if (unlocked_bytes > 0) {
			if (proj == NULL)
				mutex_enter(&p->p_lock);
			rctl_decr_locked_mem(p, proj, unlocked_bytes,
			    chargeproc);
			if (proj == NULL)
				mutex_exit(&p->p_lock);
		}
	}
	if (sp != NULL)
		mutex_exit(&sp->shm_mlock);
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

	return (err);
}
/*
 * Set advice from user for specified pages
 * There are 10 types of advice:
 *	MADV_NORMAL	- Normal (default) behavior (whatever that is)
 *	MADV_RANDOM	- Random page references,
 *				do not allow readahead or 'klustering'
 *	MADV_SEQUENTIAL	- Sequential page references,
 *				Pages previous to the one currently being
 *				accessed (determined by fault) are 'not needed'
 *				and are freed immediately
 *	MADV_WILLNEED	- Pages are likely to be used (fault ahead in mctl)
 *	MADV_DONTNEED	- Pages are not needed (synced out in mctl)
 *	MADV_FREE	- Contents can be discarded
 *	MADV_ACCESS_DEFAULT- Default access
 *	MADV_ACCESS_LWP	- Next LWP will access heavily
 *	MADV_ACCESS_MANY- Many LWPs or processes will access heavily
 *	MADV_PURGE	- Contents will be immediately discarded
 */
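
/*
 * For orientation, a userland sketch of how this entry point is reached
 * (hypothetical sizes; madvise(3C) forwards the advice through as_ctl()):
 *
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, 8 * 1024 * 1024, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE | MAP_ANON, -1, 0);
 *	if (buf != MAP_FAILED)
 *		(void) madvise((caddr_t)buf, 8 * 1024 * 1024,
 *		    MADV_SEQUENTIAL);
 */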
static int
segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	size_t page;
	int err = 0;
	int already_set;
	struct anon_map *amp;
	ulong_t anon_index;
	struct seg *next;
	lgrp_mem_policy_t policy;
	struct seg *prev;
	struct vnode *vp;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	/*
	 * In case of MADV_FREE/MADV_PURGE, we won't be modifying any segment
	 * private data structures; so, we only need to grab READER's lock
	 */
	if (behav != MADV_FREE && behav != MADV_PURGE) {
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
		if (svd->tr_state != SEGVN_TR_OFF) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (0);
		}
	} else {
		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
	}

	/*
	 * Large pages are assumed to be only turned on when accesses to the
	 * segment's address range have spatial and temporal locality. That
	 * justifies ignoring MADV_SEQUENTIAL for large page segments.
	 * Also, ignore advice affecting lgroup memory allocation
	 * if we don't need to do lgroup optimizations on this system
	 */

	if ((behav == MADV_SEQUENTIAL &&
	    (seg->s_szc != 0 || HAT_IS_REGION_COOKIE_VALID(svd->rcookie))) ||
	    (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT ||
	    behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) {
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (0);
	}

	if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT ||
	    behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) {
		/*
		 * Since we are going to unload hat mappings
		 * we first have to flush the cache. Otherwise
		 * this might lead to system panic if another
		 * thread is doing physio on the range whose
		 * mappings are unloaded by madvise(3C).
		 */
		if (svd->softlockcnt > 0) {
			/*
			 * If this is a shared segment, a non-zero
			 * softlockcnt means locked pages are still in use.
			 */
			if (svd->type == MAP_SHARED) {
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
				return (EAGAIN);
			}
			/*
			 * Since we do have the segvn writers lock
			 * nobody can fill the cache with entries
			 * belonging to this seg during the purge.
			 * The flush either succeeds or we still
			 * have pending I/Os. In the latter case,
			 * madvise(3C) fails.
			 */
			segvn_purge(seg);
			if (svd->softlockcnt > 0) {
				/*
				 * Since madvise(3C) is advisory and
				 * it's not part of UNIX98, madvise(3C)
				 * failure here doesn't cause any hardship.
				 * Note that we don't block in "as" layer.
				 */
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
				return (EAGAIN);
			}
		} else if (svd->type == MAP_SHARED && svd->amp != NULL &&
		    svd->amp->a_softlockcnt > 0) {
			/*
			 * Try to purge this amp's entries from pcache. It
			 * will succeed only if other segments that share the
			 * amp have no outstanding softlock's.
			 */
			segvn_purge(seg);
		}
	}

	amp = svd->amp;
	vp = svd->vp;
	if (behav == MADV_FREE || behav == MADV_PURGE) {
		pgcnt_t purged;

		if (behav == MADV_FREE && (vp != NULL || amp == NULL)) {
			/*
			 * MADV_FREE is not supported for segments with an
			 * underlying object; if anonmap is NULL, anon slots
			 * are not yet populated and there is nothing for us
			 * to do. As MADV_FREE is advisory, we don't return an
			 * error in either case.
			 */
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (0);
		}

		if (amp == NULL) {
			/*
			 * If we're here with a NULL anonmap, it's because we
			 * are doing a MADV_PURGE.  We have nothing to do, but
			 * because MADV_PURGE isn't merely advisory, we return
			 * an error in this case.
			 */
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (EBUSY);
		}

		segvn_purge(seg);

		page = seg_page(seg, addr);
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		err = anon_disclaim(amp,
		    svd->anon_index + page, len, behav, &purged);

		if (purged != 0 && (svd->flags & MAP_NORESERVE)) {
			/*
			 * If we purged pages on a MAP_NORESERVE mapping, we
			 * need to be sure to now unreserve our reserved swap.
			 * (We use the atomic operations to manipulate our
			 * segment and address space counters because we only
			 * have the corresponding locks held as reader, not
			 * writer.)
			 */
			ssize_t bytes = ptob(purged);

			anon_unresv_zone(bytes, seg->s_as->a_proc->p_zone);
			atomic_add_long(&svd->swresv, -bytes);
			atomic_add_long(&seg->s_as->a_resvsize, -bytes);
		}

		ANON_LOCK_EXIT(&amp->a_rwlock);
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

		/*
		 * MADV_PURGE and MADV_FREE differ in their return semantics:
		 * because MADV_PURGE is designed to be bug-for-bug compatible
		 * with its clumsy Linux forebear, it will fail where MADV_FREE
		 * does not.
		 */
		return (behav == MADV_PURGE ? err : 0);
	}

	/*
	 * If advice is to be applied to entire segment,
	 * use advice field in seg_data structure
	 * otherwise use appropriate vpage entry.
	 */
	if ((addr == seg->s_base) && (len == seg->s_size)) {
		switch (behav) {
		case MADV_ACCESS_LWP:
		case MADV_ACCESS_MANY:
		case MADV_ACCESS_DEFAULT:
			/*
			 * Set memory allocation policy for this segment
			 */
			policy = lgrp_madv_to_policy(behav, len, svd->type);
			if (svd->type == MAP_SHARED)
				already_set = lgrp_shm_policy_set(policy, amp,
				    svd->anon_index, vp, svd->offset, len);
			else {
				/*
				 * For private memory, need writers lock on
				 * address space because the segment may be
				 * split or concatenated when changing policy
				 */
				if (AS_READ_HELD(seg->s_as)) {
					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
					return (IE_RETRY);
				}

				already_set = lgrp_privm_policy_set(policy,
				    &svd->policy_info, len);
			}

			/*
			 * If policy set already and it shouldn't be reapplied,
			 * don't do anything.
			 */
			if (already_set &&
			    !LGRP_MEM_POLICY_REAPPLICABLE(policy))
				break;

			/*
			 * Mark any existing pages in given range for
			 * migration
			 */
			page_mark_migrate(seg, addr, len, amp, svd->anon_index,
			    &vp->v_object, svd->offset, 1);

			/*
			 * If same policy set already or this is a shared
			 * memory segment, don't need to try to concatenate
			 * segment with adjacent ones.
			 */
			if (already_set || svd->type == MAP_SHARED)
				break;

			/*
			 * Try to concatenate this segment with previous
			 * one and next one, since we changed policy for
			 * this one and it may be compatible with adjacent
			 * ones now.
			 */
			prev = AS_SEGPREV(seg->s_as, seg);
			next = AS_SEGNEXT(seg->s_as, seg);

			if (next && next->s_ops == &segvn_ops &&
			    addr + len == next->s_base)
				(void) segvn_concat(seg, next, 1);

			if (prev && prev->s_ops == &segvn_ops &&
			    addr == prev->s_base + prev->s_size) {
				/*
				 * Drop lock for private data of current
				 * segment before concatenating (deleting) it
				 * and return IE_REATTACH to tell as_ctl() that
				 * current segment has changed
				 */
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
				if (!segvn_concat(prev, seg, 1))
					err = IE_REATTACH;

				return (err);
			}
			break;

		case MADV_SEQUENTIAL:
			/*
			 * unloading mapping guarantees
			 * detection in segvn_fault
			 */
			ASSERT(seg->s_szc == 0);
			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
			hat_unload(seg->s_as->a_hat, addr, len,
			    HAT_UNLOAD);
			/* FALLTHROUGH */
		case MADV_NORMAL:
		case MADV_RANDOM:
			svd->advice = (uchar_t)behav;
			svd->pageadvice = 0;
			break;
		case MADV_WILLNEED:	/* handled in memcntl */
		case MADV_DONTNEED:	/* handled in memcntl */
		case MADV_FREE:		/* handled above */
		case MADV_PURGE:	/* handled above */
			break;
		default:
			err = EINVAL;
		}
	} else {
		caddr_t			eaddr;
		struct seg		*new_seg;
		struct segvn_data	*new_svd;
		uoff_t			off;
		caddr_t			oldeaddr;

		page = seg_page(seg, addr);

		segvn_vpage(seg);
		if (svd->vpage == NULL) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			return (ENOMEM);
		}

		switch (behav) {
			struct vpage *bvpp, *evpp;

		case MADV_ACCESS_LWP:
		case MADV_ACCESS_MANY:
		case MADV_ACCESS_DEFAULT:
			/*
			 * Set memory allocation policy for portion of this
			 * segment
			 */

			/*
			 * Align address and length of advice to page
			 * boundaries for large pages
			 */
			if (seg->s_szc != 0) {
				size_t	pgsz;

				pgsz = page_get_pagesize(seg->s_szc);
				addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
				len = P2ROUNDUP(len, pgsz);
			}

			/*
			 * Check to see whether policy is set already
			 */
			policy = lgrp_madv_to_policy(behav, len, svd->type);

			anon_index = svd->anon_index + page;
			off = svd->offset + (uintptr_t)(addr - seg->s_base);

			if (svd->type == MAP_SHARED)
				already_set = lgrp_shm_policy_set(policy, amp,
				    anon_index, vp, off, len);
			else
				already_set =
				    (policy == svd->policy_info.mem_policy);

			/*
			 * If policy set already and it shouldn't be reapplied,
			 * don't do anything.
			 */
			if (already_set &&
			    !LGRP_MEM_POLICY_REAPPLICABLE(policy))
				break;

			/*
			 * For private memory, need writers lock on
			 * address space because the segment may be
			 * split or concatenated when changing policy
			 */
			if (svd->type == MAP_PRIVATE &&
			    AS_READ_HELD(seg->s_as)) {
				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
				return (IE_RETRY);
			}

			/*
			 * Mark any existing pages in given range for
			 * migration
			 */
			page_mark_migrate(seg, addr, len, amp, svd->anon_index,
			    &vp->v_object, svd->offset, 1);

			/*
			 * Don't need to try to split or concatenate
			 * segments, since policy is same or this is a shared
			 * memory segment
			 */
			if (already_set || svd->type == MAP_SHARED)
				break;

			if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
				ASSERT(svd->amp == NULL);
				ASSERT(svd->tr_state == SEGVN_TR_OFF);
				ASSERT(svd->softlockcnt == 0);
				hat_leave_region(seg->s_as->a_hat, svd->rcookie,
				    HAT_REGION_TEXT);
				svd->rcookie = HAT_INVALID_REGION_COOKIE;
			}

			/*
			 * Split off new segment if advice only applies to a
			 * portion of existing segment starting in middle
			 */
			new_seg = NULL;
			eaddr = addr + len;
			oldeaddr = seg->s_base + seg->s_size;
			if (addr > seg->s_base) {
				/*
				 * Must flush I/O page cache
				 * before splitting segment
				 */
				if (svd->softlockcnt > 0)
					segvn_purge(seg);

				/*
				 * Split segment and return IE_REATTACH to tell
				 * as_ctl() that current segment changed
				 */
				new_seg = segvn_split_seg(seg, addr);
				new_svd = (struct segvn_data *)new_seg->s_data;
				err = IE_REATTACH;

				/*
				 * If new segment ends where old one
				 * did, try to concatenate the new
				 * segment with next one.
				 */
				if (eaddr == oldeaddr) {
					/*
					 * Set policy for new segment
					 */
					(void) lgrp_privm_policy_set(policy,
					    &new_svd->policy_info,
					    new_seg->s_size);

					next = AS_SEGNEXT(new_seg->s_as,
					    new_seg);

					if (next &&
					    next->s_ops == &segvn_ops &&
					    eaddr == next->s_base)
						(void) segvn_concat(new_seg,
						    next, 1);
				}
			}

			/*
			 * Split off end of existing segment if advice only
			 * applies to a portion of segment ending before
			 * end of the existing segment
			 */
			if (eaddr < oldeaddr) {
				/*
				 * Must flush I/O page cache
				 * before splitting segment
				 */
				if (svd->softlockcnt > 0)
					segvn_purge(seg);

				/*
				 * If beginning of old segment was already
				 * split off, use new segment to split end off
				 * from.
				 */
				if (new_seg != NULL && new_seg != seg) {
					/*
					 * Split segment
					 */
					(void) segvn_split_seg(new_seg, eaddr);

					/*
					 * Set policy for new segment
					 */
					(void) lgrp_privm_policy_set(policy,
					    &new_svd->policy_info,
					    new_seg->s_size);
				} else {
					/*
					 * Split segment and return IE_REATTACH
					 * to tell as_ctl() that current
					 * segment changed
					 */
					(void) segvn_split_seg(seg, eaddr);
					err = IE_REATTACH;

					(void) lgrp_privm_policy_set(policy,
					    &svd->policy_info, seg->s_size);

					/*
					 * If new segment starts where old one
					 * did, try to concatenate it with
					 * previous segment.
					 */
					if (addr == seg->s_base) {
						prev = AS_SEGPREV(seg->s_as,
						    seg);

						/*
						 * Drop lock for private data
						 * of current segment before
						 * concatenating (deleting) it
						 */
						if (prev &&
						    prev->s_ops ==
						    &segvn_ops &&
						    addr == prev->s_base +
						    prev->s_size) {
							SEGVN_LOCK_EXIT(
							    seg->s_as,
							    &svd->lock);
							(void) segvn_concat(
							    prev, seg, 1);
							return (err);
						}
					}
				}
			}
			break;
		case MADV_SEQUENTIAL:
			ASSERT(seg->s_szc == 0);
			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
			hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
			/* FALLTHROUGH */
		case MADV_NORMAL:
		case MADV_RANDOM:
			bvpp = &svd->vpage[page];
			evpp = &svd->vpage[page + (len >> PAGESHIFT)];
			for (; bvpp < evpp; bvpp++)
				VPP_SETADVICE(bvpp, behav);
			svd->advice = MADV_NORMAL;
			break;
		case MADV_WILLNEED:	/* handled in memcntl */
		case MADV_DONTNEED:	/* handled in memcntl */
		case MADV_FREE:		/* handled above */
		case MADV_PURGE:	/* handled above */
			break;
		default:
			err = EINVAL;
		}
	}
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	return (err);
}
/*
 * There is one kind of inheritance that can be specified for pages:
 *
 *	SEGP_INH_ZERO - Pages should be zeroed in the child
 */
static int
segvn_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *bvpp, *evpp;
	size_t page;
	int ret = 0;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	/* Can't support something we don't know about */
	if (behav != SEGP_INH_ZERO)
		return (ENOTSUP);

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);

	/*
	 * This must be a straightforward anonymous segment that is mapped
	 * privately and is not backed by a vnode.
	 */
	if (svd->tr_state != SEGVN_TR_OFF ||
	    svd->type != MAP_PRIVATE ||
	    svd->vp != NULL) {
		ret = EINVAL;
		goto out;
	}

	/*
	 * If the entire segment has been marked as inherit zero, then there is
	 * no reason to do anything else.
	 */
	if (svd->svn_inz == SEGVN_INZ_ALL) {
		ret = 0;
		goto out;
	}

	/*
	 * If this applies to the entire segment, simply mark it and we're done.
	 */
	if ((addr == seg->s_base) && (len == seg->s_size)) {
		svd->svn_inz = SEGVN_INZ_ALL;
		ret = 0;
		goto out;
	}

	/*
	 * We've been asked to mark a subset of this segment as inherit zero,
	 * therefore we need to manipulate its vpages.
	 */
	if (svd->vpage == NULL) {
		segvn_vpage(seg);
		if (svd->vpage == NULL) {
			ret = ENOMEM;
			goto out;
		}
	}

	svd->svn_inz = SEGVN_INZ_VPP;
	page = seg_page(seg, addr);
	bvpp = &svd->vpage[page];
	evpp = &svd->vpage[page + (len >> PAGESHIFT)];
	for (; bvpp < evpp; bvpp++)
		VPP_SETINHZERO(bvpp);
	ret = 0;

out:
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	return (ret);
}
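
/*
 * A userland sketch of reaching segvn_inherit() (assumes memcntl(2)'s
 * MC_INHERIT_ZERO command, which the VM layer delivers as SEGP_INH_ZERO):
 *
 *	#include <sys/mman.h>
 *
 *	// ask that this private anon range read as zeroes in fork() children
 *	if (memcntl(base, len, MC_INHERIT_ZERO, 0, 0, 0) != 0)
 *		perror("memcntl");
 */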
/*
 * Create a vpage structure for this seg.
 */
static void
segvn_vpage(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vp, *evp;
	static pgcnt_t page_limit = 0;

	ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));

	/*
	 * If no vpage structure exists, allocate one.  Copy the protections
	 * and the advice from the segment itself to the individual pages.
	 */
	if (svd->vpage == NULL) {
		/*
		 * Start by calculating the number of pages we must allocate to
		 * track the per-page vpage structs needed for this entire
		 * segment. If we know now that it will require more than our
		 * heuristic for the maximum amount of kmem we can consume then
		 * fail. We do this here, instead of trying to detect this deep
		 * in page_resv and propagating the error up, since the entire
		 * memory allocation stack is not amenable to passing this
		 * back. Instead, it wants to keep trying.
		 *
		 * As a heuristic we set a page limit of 5/8 of total_pages
		 * for this allocation. We use shifts so that no floating
		 * point conversion takes place and only need to do the
		 * calculation once.
		 */
		ulong_t mem_needed = seg_pages(seg) * sizeof (struct vpage);
		pgcnt_t npages = mem_needed >> PAGESHIFT;

		if (page_limit == 0)
			page_limit = (total_pages >> 1) + (total_pages >> 3);

		if (npages > page_limit)
			return;

		svd->pageadvice = 1;
		svd->vpage = kmem_zalloc(mem_needed, KM_SLEEP);
		evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
		for (vp = svd->vpage; vp < evp; vp++) {
			VPP_SETPROT(vp, svd->prot);
			VPP_SETADVICE(vp, svd->advice);
		}
	}
}
/*
 * Dump the pages belonging to this segvn segment.
 */
static void
segvn_dump(struct seg *seg)
{
	struct segvn_data *svd;
	page_t *pp;
	struct anon_map *amp;
	ulong_t	anon_index;
	struct vnode *vp;
	uoff_t off, offset;
	pfn_t pfn;
	pgcnt_t page, npages;
	caddr_t addr;

	npages = seg_pages(seg);
	svd = (struct segvn_data *)seg->s_data;
	vp = svd->vp;
	off = offset = svd->offset;
	addr = seg->s_base;

	if ((amp = svd->amp) != NULL) {
		anon_index = svd->anon_index;
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	}

	for (page = 0; page < npages; page++, offset += PAGESIZE) {
		struct anon *ap;
		int we_own_it = 0;

		if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) {
			swap_xlate_nopanic(ap, &vp, &off);
		} else {
			vp = svd->vp;
			off = offset;
		}

		/*
		 * If pp == NULL, the page either does not exist
		 * or is exclusively locked.  So determine if it
		 * exists before searching for it.
		 */

		if ((pp = page_lookup_nowait(&vp->v_object, off, SE_SHARED)))
			we_own_it = 1;
		else
			pp = page_exists(&vp->v_object, off);

		if (pp) {
			pfn = page_pptonum(pp);
			dump_addpage(seg->s_as, addr, pfn);
			if (we_own_it)
				page_unlock(pp);
		}
		addr += PAGESIZE;
		dump_timeleft = dump_timeout;
	}

	if (amp != NULL)
		ANON_LOCK_EXIT(&amp->a_rwlock);
}
static uint32_t segvn_pglock_mtbf = 0;

#define	PCACHE_SHWLIST		((page_t *)-2)
#define	NOPCACHE_SHWLIST	((page_t *)-1)

/*
 * Lock/Unlock anon pages over a given range. Return shadow list. This routine
 * uses global segment pcache to cache shadow lists (i.e. pp arrays) of pages
 * to avoid the overhead of per page locking, unlocking for subsequent IOs to
 * the same parts of the segment. Currently shadow list creation is only
 * supported for pure anon segments. MAP_PRIVATE segment pcache entries are
 * tagged with segment pointer, starting virtual address and length. This
 * approach for MAP_SHARED segments may add many pcache entries for the same
 * set of pages and lead to long hash chains that decrease pcache lookup
 * performance. To avoid this issue for shared segments shared anon map and
 * starting anon index are used for pcache entry tagging. This allows all
 * segments to share pcache entries for the same anon range and reduces pcache
 * chain's length as well as memory overhead from duplicate shadow lists and
 * pcache hash entries.
 *
 * softlockcnt field in segvn_data structure counts the number of F_SOFTLOCK'd
 * pages via segvn_fault() and pagelock'd pages via this routine. But pagelock
 * part of softlockcnt accounting is done differently for private and shared
 * segments. In private segment case softlock is only incremented when a new
 * shadow list is created but not when an existing one is found via
 * seg_plookup(). pcache entries have reference count incremented/decremented
 * by each seg_plookup()/seg_pinactive() operation. Only entries that have 0
 * reference count can be purged (and purging is needed before segment can be
 * freed). When a private segment pcache entry is purged segvn_reclaim() will
 * decrement softlockcnt. Since in private segment case each of its pcache
 * entries only belongs to this segment we can expect that when
 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this
 * segment purge will succeed and softlockcnt will drop to 0. In shared
 * segment case reference count in pcache entry counts active locks from many
 * different segments so we can't expect segment purging to succeed even when
 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this
 * segment. To be able to determine when there're no pending pagelocks in
 * shared segment case we don't rely on purging to make softlockcnt drop to 0
 * but instead softlockcnt is incremented and decremented for every
 * segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call regardless if a new shadow
 * list was created or an existing one was found. When softlockcnt drops to 0
 * this segment no longer has any claims for pcached shadow lists and the
 * segment can be freed even if there're still active pcache entries
 * shared by this segment anon map. Shared segment pcache entries belong to
 * anon map and are typically removed when anon map is freed after all
 * processes destroy the segments that use this anon map.
 */
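
/*
 * A sketch of the expected pairing from a driver's perspective (the
 * as_pagelock()/as_pageunlock() wrappers are the way into this code; the
 * shadow list below is what the L_PAGELOCK case hands back):
 *
 *	struct page **pp;
 *
 *	if (as_pagelock(as, &pp, uaddr, len, S_WRITE) == 0) {
 *		// ... perform the I/O against the locked pages ...
 *		as_pageunlock(as, pp, uaddr, len, S_WRITE);
 *	}
 */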
8597 segvn_pagelock(struct seg
*seg
, caddr_t addr
, size_t len
, struct page
***ppp
,
8598 enum lock_type type
, enum seg_rw rw
)
8600 struct segvn_data
*svd
= (struct segvn_data
*)seg
->s_data
;
8602 pgcnt_t adjustpages
;
8605 uint_t protchk
= (rw
== S_READ
) ? PROT_READ
: PROT_WRITE
;
8607 struct anon_map
*amp
;
8609 struct page
**pplist
, **pl
, *pp
;
8612 caddr_t lpgaddr
, lpgeaddr
;
8613 anon_sync_obj_t cookie
;
8615 struct anon_map
*pamp
;
8617 seg_preclaim_cbfunc_t preclaim_callback
;
8622 int sftlck_sbase
= 0;
8623 int sftlck_send
= 0;
8626 if (type
== L_PAGELOCK
&& segvn_pglock_mtbf
) {
8627 hrtime_t ts
= gethrtime();
8628 if ((ts
% segvn_pglock_mtbf
) == 0) {
8631 if ((ts
% segvn_pglock_mtbf
) == 1) {
8637 ASSERT(seg
->s_as
&& AS_LOCK_HELD(seg
->s_as
));
8638 ASSERT(type
== L_PAGELOCK
|| type
== L_PAGEUNLOCK
);
8640 SEGVN_LOCK_ENTER(seg
->s_as
, &svd
->lock
, RW_READER
);
8643 * for now we only support pagelock to anon memory. We would have to
8644 * check protections for vnode objects and call into the vnode driver.
8645 * That's too much for a fast path. Let the fault entry point handle
8648 if (svd
->vp
!= NULL
) {
8649 if (type
== L_PAGELOCK
) {
8653 panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL");
8655 if ((amp
= svd
->amp
) == NULL
) {
8656 if (type
== L_PAGELOCK
) {
8660 panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL");
8662 if (rw
!= S_READ
&& rw
!= S_WRITE
) {
8663 if (type
== L_PAGELOCK
) {
8667 panic("segvn_pagelock(L_PAGEUNLOCK): bad rw");
8670 if (seg
->s_szc
!= 0) {
8672 * We are adjusting the pagelock region to the large page size
8673 * boundary because the unlocked part of a large page cannot
8674 * be freed anyway unless all constituent pages of a large
8675 * page are locked. Bigger regions reduce pcache chain length
8676 * and improve lookup performance. The tradeoff is that the
8677 * very first segvn_pagelock() call for a given page is more
8678 * expensive if only 1 page_t is needed for IO. This is only
8679 * an issue if pcache entry doesn't get reused by several
8680 * subsequent calls. We optimize here for the case when pcache
8681 * is heavily used by repeated IOs to the same address range.
8683 * Note segment's page size cannot change while we are holding
8684 * as lock. And then it cannot change while softlockcnt is
8685 * not 0. This will allow us to correctly recalculate large
8686 * page size region for the matching pageunlock/reclaim call
8687 * since as_pageunlock() caller must always match
8688 * as_pagelock() call's addr and len.
8690 * For pageunlock *ppp points to the pointer of page_t that
8691 * corresponds to the real unadjusted start address. Similar
8692 * for pagelock *ppp must point to the pointer of page_t that
8693 * corresponds to the real unadjusted start address.
8695 pgsz
= page_get_pagesize(seg
->s_szc
);
8696 CALC_LPG_REGION(pgsz
, seg
, addr
, len
, lpgaddr
, lpgeaddr
);
8697 adjustpages
= btop((uintptr_t)(addr
- lpgaddr
));
8698 } else if (len
< segvn_pglock_comb_thrshld
) {
8700 lpgeaddr
= addr
+ len
;
8705 * Align the address range of large enough requests to allow
8706 * combining of different shadow lists into 1 to reduce memory
8707 * overhead from potentially overlapping large shadow lists
8708 * (worst case is we have a 1MB IO into buffers with start
8709 * addresses separated by 4K). Alignment is only possible if
8710 * padded chunks have sufficient access permissions. Note
8711 * permissions won't change between L_PAGELOCK and
8712 * L_PAGEUNLOCK calls since non 0 softlockcnt will force
8713 * segvn_setprot() to wait until softlockcnt drops to 0. This
8714 * allows us to determine in L_PAGEUNLOCK the same range we
8715 * computed in L_PAGELOCK.
8717 * If alignment is limited by segment ends set
8718 * sftlck_sbase/sftlck_send flags. In L_PAGELOCK case when
8719 * these flags are set bump softlockcnt_sbase/softlockcnt_send
8720 * per segment counters. In L_PAGEUNLOCK case decrease
8721 * softlockcnt_sbase/softlockcnt_send counters if
8722 * sftlck_sbase/sftlck_send flags are set. When
8723 * softlockcnt_sbase/softlockcnt_send are non 0
8724 * segvn_concat()/segvn_extend_prev()/segvn_extend_next()
8725 * won't merge the segments. This restriction combined with
8726 * restriction on segment unmapping and splitting for segments
8727 * that have non 0 softlockcnt allows L_PAGEUNLOCK to
8728 * correctly determine the same range that was previously
8729 * locked by matching L_PAGELOCK.
8731 pflags
= SEGP_PSHIFT
| (segvn_pglock_comb_bshift
<< 16);
8733 if (svd
->type
== MAP_PRIVATE
) {
8734 lpgaddr
= (caddr_t
)P2ALIGN((uintptr_t)addr
,
8735 segvn_pglock_comb_balign
);
8736 if (lpgaddr
< seg
->s_base
) {
8737 lpgaddr
= seg
->s_base
;
8741 ulong_t aix
= svd
->anon_index
+ seg_page(seg
, addr
);
8742 ulong_t aaix
= P2ALIGN(aix
, segvn_pglock_comb_palign
);
8743 if (aaix
< svd
->anon_index
) {
8744 lpgaddr
= seg
->s_base
;
8747 lpgaddr
= addr
- ptob(aix
- aaix
);
8748 ASSERT(lpgaddr
>= seg
->s_base
);
8751 if (svd
->pageprot
&& lpgaddr
!= addr
) {
8752 struct vpage
*vp
= &svd
->vpage
[seg_page(seg
, lpgaddr
)];
8753 struct vpage
*evp
= &svd
->vpage
[seg_page(seg
, addr
)];
8755 if ((VPP_PROT(vp
) & protchk
) == 0) {
8765 lpgeaddr
= addr
+ len
;
8767 if (svd
->type
== MAP_PRIVATE
) {
8768 lpgeaddr
= (caddr_t
)P2ROUNDUP(
8769 (uintptr_t)lpgeaddr
,
8770 segvn_pglock_comb_balign
);
8772 ulong_t aix
= svd
->anon_index
+
8773 seg_page(seg
, lpgeaddr
);
8774 ulong_t aaix
= P2ROUNDUP(aix
,
8775 segvn_pglock_comb_palign
);
8779 lpgeaddr
+= ptob(aaix
- aix
);
8782 if (lpgeaddr
== 0 ||
8783 lpgeaddr
> seg
->s_base
+ seg
->s_size
) {
8784 lpgeaddr
= seg
->s_base
+ seg
->s_size
;
8788 if (svd
->pageprot
&& lpgeaddr
!= addr
+ len
) {
8792 vp
= &svd
->vpage
[seg_page(seg
, addr
+ len
)];
8793 evp
= &svd
->vpage
[seg_page(seg
, lpgeaddr
)];
8796 if ((VPP_PROT(vp
) & protchk
) == 0) {
8802 lpgeaddr
= addr
+ len
;
8805 adjustpages
= btop((uintptr_t)(addr
- lpgaddr
));
8809 * For MAP_SHARED segments we create pcache entries tagged by amp and
8810 * anon index so that we can share pcache entries with other segments
8811 * that map this amp. For private segments pcache entries are tagged
8812 * with segment and virtual address.
8814 if (svd
->type
== MAP_SHARED
) {
8816 paddr
= (caddr_t
)((lpgaddr
- seg
->s_base
) +
8817 ptob(svd
->anon_index
));
8818 preclaim_callback
= shamp_reclaim
;
8822 preclaim_callback
= segvn_reclaim
;
8825 if (type
== L_PAGEUNLOCK
) {
8826 VM_STAT_ADD(segvnvmstats
.pagelock
[0]);
8829 * update hat ref bits for /proc. We need to make sure
8830 * that threads tracing the ref and mod bits of the
8831 * address space get the right data.
8832 * Note: page ref and mod bits are updated at reclaim time
8834 if (seg
->s_as
->a_vbits
) {
8835 for (a
= addr
; a
< addr
+ len
; a
+= PAGESIZE
) {
8836 if (rw
== S_WRITE
) {
8837 hat_setstat(seg
->s_as
, a
,
8838 PAGESIZE
, P_REF
| P_MOD
);
8840 hat_setstat(seg
->s_as
, a
,
8847 * Check the shadow list entry after the last page used in
8848 * this IO request. If it's NOPCACHE_SHWLIST the shadow list
8849 * was not inserted into pcache and is not large page
8850 * adjusted. In this case call reclaim callback directly and
8851 * don't adjust the shadow list start and size for large
8855 if ((*ppp
)[npages
] == NOPCACHE_SHWLIST
) {
8858 ASSERT(svd
->type
== MAP_SHARED
);
8859 ptag
= (void *)pamp
;
8860 paddr
= (caddr_t
)((addr
- seg
->s_base
) +
8861 ptob(svd
->anon_index
));
8866 (*preclaim_callback
)(ptag
, paddr
, len
, *ppp
, rw
, 0);
8868 ASSERT((*ppp
)[npages
] == PCACHE_SHWLIST
||
8869 IS_SWAPFSVP((*ppp
)[npages
]->p_vnode
));
8870 len
= lpgeaddr
- lpgaddr
;
8872 seg_pinactive(seg
, pamp
, paddr
, len
,
8873 *ppp
- adjustpages
, rw
, pflags
, preclaim_callback
);
8877 ASSERT(svd
->type
== MAP_SHARED
);
8878 ASSERT(svd
->softlockcnt
>= npages
);
8879 atomic_add_long((ulong_t
*)&svd
->softlockcnt
, -npages
);
8883 ASSERT(svd
->softlockcnt_sbase
> 0);
8884 atomic_dec_ulong((ulong_t
*)&svd
->softlockcnt_sbase
);
8887 ASSERT(svd
->softlockcnt_send
> 0);
8888 atomic_dec_ulong((ulong_t
*)&svd
->softlockcnt_send
);
8892 * If someone is blocked while unmapping, we purge
8893 * segment page cache and thus reclaim pplist synchronously
8894 * without waiting for seg_pasync_thread. This speeds up
8895 * unmapping in cases where munmap(2) is called, while
8896 * raw async i/o is still in progress or where a thread
8897 * exits on data fault in a multithreaded application.
8899 if (AS_ISUNMAPWAIT(seg
->s_as
)) {
8900 if (svd
->softlockcnt
== 0) {
8901 mutex_enter(&seg
->s_as
->a_contents
);
8902 if (AS_ISUNMAPWAIT(seg
->s_as
)) {
8903 AS_CLRUNMAPWAIT(seg
->s_as
);
8904 cv_broadcast(&seg
->s_as
->a_cv
);
8906 mutex_exit(&seg
->s_as
->a_contents
);
8907 } else if (pamp
== NULL
) {
8909 * softlockcnt is not 0 and this is a
8910 * MAP_PRIVATE segment. Try to purge its
8911 * pcache entries to reduce softlockcnt.
8912 * If it drops to 0 segvn_reclaim()
8913 * will wake up a thread waiting on
8916 * We don't purge MAP_SHARED segments with non
8917 * 0 softlockcnt since IO is still in progress
8918 * for such segments.
8920 ASSERT(svd
->type
== MAP_PRIVATE
);
8924 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
8928 /* The L_PAGELOCK case ... */
8930 VM_STAT_ADD(segvnvmstats
.pagelock
[1]);
8933 * For MAP_SHARED segments we have to check protections before
8934 * seg_plookup() since pcache entries may be shared by many segments
8935 * with potentially different page protections.
8938 ASSERT(svd
->type
== MAP_SHARED
);
8939 if (svd
->pageprot
== 0) {
8940 if ((svd
->prot
& protchk
) == 0) {
8946 * check page protections
8957 for (; a
< ea
; a
+= pgsz
) {
8960 ASSERT(seg
->s_szc
== 0 ||
8961 sameprot(seg
, a
, pgsz
));
8962 vp
= &svd
->vpage
[seg_page(seg
, a
)];
8963 if ((VPP_PROT(vp
) & protchk
) == 0) {
8972 * try to find pages in segment page cache
8974 pplist
= seg_plookup(seg
, pamp
, paddr
, lpgeaddr
- lpgaddr
, rw
, pflags
);
8975 if (pplist
!= NULL
) {
8977 npages
= btop((uintptr_t)(lpgeaddr
- lpgaddr
));
8978 ASSERT(svd
->type
== MAP_SHARED
);
8979 atomic_add_long((ulong_t
*)&svd
->softlockcnt
,
8983 atomic_inc_ulong((ulong_t
*)&svd
->softlockcnt_sbase
);
8986 atomic_inc_ulong((ulong_t
*)&svd
->softlockcnt_send
);
8988 SEGVN_LOCK_EXIT(seg
->s_as
, &svd
->lock
);
8989 *ppp
= pplist
+ adjustpages
;
8994 * For MAP_SHARED segments we already verified above that segment
8995 * protections allow this pagelock operation.
8998 ASSERT(svd
->type
== MAP_PRIVATE
);
8999 if (svd
->pageprot
== 0) {
9000 if ((svd
->prot
& protchk
) == 0) {
9004 if (svd
->prot
& PROT_WRITE
) {
9005 wlen
= lpgeaddr
- lpgaddr
;
9008 ASSERT(rw
== S_READ
);
9013 * check page protections
9015 for (a
= lpgaddr
, wlen
= 0; a
< lpgeaddr
; a
+= pgsz
) {
9018 ASSERT(seg
->s_szc
== 0 ||
9019 sameprot(seg
, a
, pgsz
));
9020 vp
= &svd
->vpage
[seg_page(seg
, a
)];
9021 if ((VPP_PROT(vp
) & protchk
) == 0) {
9025 if (wcont
&& (VPP_PROT(vp
) & PROT_WRITE
)) {
9029 ASSERT(rw
== S_READ
);
9033 ASSERT(rw
== S_READ
|| wlen
== lpgeaddr
- lpgaddr
);
9034 ASSERT(rw
== S_WRITE
|| wlen
<= lpgeaddr
- lpgaddr
);
9038 * Only build large page adjusted shadow list if we expect to insert
9039 * it into pcache. For large enough pages it's a big overhead to
9040 * create a shadow list of the entire large page. But this overhead
9041 * should be amortized over repeated pcache hits on subsequent reuse
9042 * of this shadow list (IO into any range within this shadow list will
9043 * find it in pcache since we large page align the request for pcache
9044 * lookups). pcache performance is improved with bigger shadow lists
9045 * as it reduces the time to pcache the entire big segment and reduces
9046 * pcache chain length.
9048 if (seg_pinsert_check(seg
, pamp
, paddr
,
9049 lpgeaddr
- lpgaddr
, pflags
) == SEGP_SUCCESS
) {
9051 len
= lpgeaddr
- lpgaddr
;
9056 * Since this entry will not be inserted into the pcache, we
9057 * will not do any adjustments to the starting address or
9058 * size of the memory to be locked.
9064 pplist
= kmem_alloc(sizeof (page_t
*) * (npages
+ 1), KM_SLEEP
);
9066 *ppp
= pplist
+ adjustpages
;
9068 * If use_pcache is 0 this shadow list is not large page adjusted.
9069 * Record this info in the last entry of shadow array so that
9070 * L_PAGEUNLOCK can determine if it should large page adjust the
9071 * address range to find the real range that was locked.
9073 pl
[npages
] = use_pcache
? PCACHE_SHWLIST
: NOPCACHE_SHWLIST
;
9075 page
= seg_page(seg
, addr
);
9076 anon_index
= svd
->anon_index
+ page
;
9079 ANON_LOCK_ENTER(&
->a_rwlock
, RW_READER
);
9080 ASSERT(amp
->a_szc
>= seg
->s_szc
);
9081 anpgcnt
= page_get_pagecnt(amp
->a_szc
);
9082 for (a
= addr
; a
< addr
+ len
; a
+= PAGESIZE
, anon_index
++) {
9088 * Lock and unlock anon array only once per large page.
9089 * anon_array_enter() locks the root anon slot according to
9090 * a_szc which can't change while anon map is locked. We lock
9091 * anon the first time through this loop and each time we
9092 * reach anon index that corresponds to a root of a large
9095 if (a
== addr
|| P2PHASE(anon_index
, anpgcnt
) == 0) {
9096 ASSERT(anlock
== 0);
9097 anon_array_enter(amp
, anon_index
, &cookie
);
9100 ap
= anon_get_ptr(amp
->ahp
, anon_index
);
9103 * We must never use seg_pcache for COW pages
9104 * because we might end up with original page still
9105 * lying in seg_pcache even after private page is
9106 * created. This leads to data corruption as
9107 * aio_write refers to the page still in cache
9108 * while all other accesses refer to the private
9111 if (ap
== NULL
|| ap
->an_refcnt
!= 1) {
9112 struct vpage
*vpage
;
9118 if (svd
->vpage
!= NULL
) {
9119 vpage
= &svd
->vpage
[seg_page(seg
, a
)];
9124 anon_array_exit(&cookie
);
9127 error
= segvn_faultpage(seg
->s_as
->a_hat
, seg
, a
, 0,
9128 vpage
, &pp
, 0, F_INVAL
, rw
, 1);
9130 error
= fc_decode(error
);
9133 anon_array_enter(amp
, anon_index
, &cookie
);
9135 ap
= anon_get_ptr(amp
->ahp
, anon_index
);
9136 if (ap
== NULL
|| ap
->an_refcnt
!= 1) {
9141 swap_xlate(ap
, &vp
, &off
);
9142 pp
= page_lookup_nowait(&vp
->v_object
, off
, SE_SHARED
);
		if (ap->an_pvp != NULL) {
			anon_swap_free(ap, pp);
		}
		/*
		 * Unlock anon if this is the last slot in a large page.
		 */
		if (P2PHASE(anon_index, anpgcnt) == anpgcnt - 1) {
			ASSERT(anlock);
			anon_array_exit(&cookie);
			anlock = 0;
		}
		*pplist++ = pp;
	}
	if (anlock) {		/* Ensure the lock is dropped */
		anon_array_exit(&cookie);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
	if (a >= addr + len) {
		atomic_add_long((ulong_t *)&svd->softlockcnt, npages);
		if (pamp != NULL) {
			ASSERT(svd->type == MAP_SHARED);
			atomic_add_long((ulong_t *)&pamp->a_softlockcnt,
			    npages);
			wlen = len;
		}
		if (sftlck_sbase) {
			atomic_inc_ulong((ulong_t *)&svd->softlockcnt_sbase);
		}
		if (sftlck_send) {
			atomic_inc_ulong((ulong_t *)&svd->softlockcnt_send);
		}
		if (use_pcache) {
			(void) seg_pinsert(seg, pamp, paddr, len, wlen, pl,
			    rw, pflags, preclaim_callback);
		}
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (0);
	}

	pplist = pl;
	np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
	while (np > (uint_t)0) {
		ASSERT(PAGE_LOCKED(*pplist));
		page_unlock(*pplist);
		np--;
		pplist++;
	}
	kmem_free(pl, sizeof (page_t *) * (npages + 1));
out:
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	*ppp = NULL;
	return (error);
}
/*
 * purge any cached pages in the I/O page cache
 */
static void
segvn_purge(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;

	/*
	 * pcache is only used by pure anon segments.
	 */
	if (svd->amp == NULL || svd->vp != NULL) {
		return;
	}

	/*
	 * For MAP_SHARED segments a non 0 softlockcnt means
	 * active IO is still in progress via this segment. So we only
	 * purge MAP_SHARED segments when their softlockcnt is 0.
	 */
	if (svd->type == MAP_PRIVATE) {
		if (svd->softlockcnt) {
			seg_ppurge(seg, NULL, 0);
		}
	} else if (svd->softlockcnt == 0 && svd->amp->a_softlockcnt != 0) {
		seg_ppurge(seg, svd->amp, 0);
	}
}
/*
 * If async argument is not 0 we are called from pcache async thread and don't
 * hold AS lock.
 */
/*ARGSUSED*/
static int
segvn_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
    enum seg_rw rw, int async)
{
	struct seg *seg = (struct seg *)ptag;
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	pgcnt_t np;
	pgcnt_t npages;
	struct page **pl;

	npages = np = btop(len);
	ASSERT(npages);

	ASSERT(svd->vp == NULL && svd->amp != NULL);
	ASSERT(svd->softlockcnt >= npages);
	ASSERT(async || AS_LOCK_HELD(seg->s_as));

	pl = pplist;

	ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST);
	ASSERT(!async || pl[np] == PCACHE_SHWLIST);

	while (np > (uint_t)0) {
		if (rw == S_WRITE) {
			hat_setrefmod(*pplist);
		} else {
			hat_setref(*pplist);
		}
		page_unlock(*pplist);
		np--;
		pplist++;
	}

	kmem_free(pl, sizeof (page_t *) * (npages + 1));

	/*
	 * If we are pcache async thread we don't hold AS lock. This means if
	 * softlockcnt drops to 0 after the decrement below address space may
	 * get freed. We can't allow it since after softlock decrement to 0 we
	 * still need to access as structure for possible wakeup of unmap
	 * waiters. To prevent the disappearance of as we take this segment
	 * segfree_syncmtx. segvn_free() also takes this mutex as a barrier to
	 * make sure this routine completes before segment is freed.
	 *
	 * The second complication we have to deal with in async case is a
	 * possibility of missed wake up of unmap wait thread. When we don't
	 * hold as lock here we may take a_contents lock before unmap wait
	 * thread that was first to see softlockcnt was still not 0. As a
	 * result we'll fail to wake up an unmap wait thread. To avoid this
	 * race we set nounmapwait flag in as structure if we drop softlockcnt
	 * to 0 when we were called by pcache async thread. unmapwait thread
	 * will not block if this flag is set.
	 */
	if (async) {
		mutex_enter(&svd->segfree_syncmtx);
	}

	if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -npages)) {
		if (async || AS_ISUNMAPWAIT(seg->s_as)) {
			mutex_enter(&seg->s_as->a_contents);
			if (async) {
				AS_SETNOUNMAPWAIT(seg->s_as);
			}
			if (AS_ISUNMAPWAIT(seg->s_as)) {
				AS_CLRUNMAPWAIT(seg->s_as);
				cv_broadcast(&seg->s_as->a_cv);
			}
			mutex_exit(&seg->s_as->a_contents);
		}
	}

	if (async) {
		mutex_exit(&svd->segfree_syncmtx);
	}
	return (0);
}
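/*
 * shamp_reclaim() below is the pcache reclaim callback used for shadow lists
 * that were tagged with a shared anon map instead of a segment: its unlock
 * loop mirrors segvn_reclaim() but it decrements the amp's a_softlockcnt and
 * wakes anonmap_purge() waiters rather than address space unmap waiters.
 */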
/*ARGSUSED*/
static int
shamp_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
    enum seg_rw rw, int async)
{
	amp_t *amp = (amp_t *)ptag;
	pgcnt_t np;
	pgcnt_t npages;
	struct page **pl;

	npages = np = btop(len);
	ASSERT(npages);
	ASSERT(amp->a_softlockcnt >= npages);

	pl = pplist;

	ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST);
	ASSERT(!async || pl[np] == PCACHE_SHWLIST);

	while (np > (uint_t)0) {
		if (rw == S_WRITE) {
			hat_setrefmod(*pplist);
		} else {
			hat_setref(*pplist);
		}
		page_unlock(*pplist);
		np--;
		pplist++;
	}

	kmem_free(pl, sizeof (page_t *) * (npages + 1));

	/*
	 * If somebody sleeps in anonmap_purge() wake them up if a_softlockcnt
	 * drops to 0. anon map can't be freed until a_softlockcnt drops to 0
	 * and anonmap_purge() acquires a_purgemtx.
	 */
	mutex_enter(&amp->a_purgemtx);
	if (!atomic_add_long_nv((ulong_t *)&amp->a_softlockcnt, -npages) &&
	    amp->a_purgewait) {
		amp->a_purgewait = 0;
		cv_broadcast(&amp->a_purgecv);
	}
	mutex_exit(&amp->a_purgemtx);
	return (0);
}
/*
 * get a memory ID for an addr in a given segment
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
 */
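/*
 * The memid encodings produced below: MAP_PRIVATE mappings are identified by
 * the (address space, virtual address) pair; MAP_SHARED vnode mappings by the
 * (vnode, file offset) pair; and MAP_SHARED anonymous mappings by the (anon
 * slot, intra-page offset) pair, allocating a zeroed anon slot on the fly if
 * one doesn't exist yet.
 */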
static int
segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct anon *ap = NULL;
	ulong_t anon_index = 0;
	struct anon_map *amp;
	anon_sync_obj_t cookie;

	if (svd->type == MAP_PRIVATE) {
		memidp->val[0] = (uintptr_t)seg->s_as;
		memidp->val[1] = (uintptr_t)addr;
		return (0);
	}

	if (svd->type == MAP_SHARED) {
		if (svd->vp) {
			memidp->val[0] = (uintptr_t)svd->vp;
			memidp->val[1] = (u_longlong_t)svd->offset +
			    (uintptr_t)(addr - seg->s_base);
			return (0);
		} else {

			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
			if ((amp = svd->amp) != NULL) {
				anon_index = svd->anon_index +
				    seg_page(seg, addr);
			}
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);

			ASSERT(amp != NULL);

			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
			anon_array_enter(amp, anon_index, &cookie);
			ap = anon_get_ptr(amp->ahp, anon_index);
			if (ap == NULL) {
				page_t *pp;

				pp = anon_zero(seg, addr, &ap, svd->cred);
				if (pp == NULL) {
					anon_array_exit(&cookie);
					ANON_LOCK_EXIT(&amp->a_rwlock);
					return (ENOMEM);
				}
				ASSERT(anon_get_ptr(amp->ahp, anon_index)
				    == NULL);
				(void) anon_set_ptr(amp->ahp, anon_index,
				    ap, ANON_SLEEP);
				page_unlock(pp);
			}

			anon_array_exit(&cookie);
			ANON_LOCK_EXIT(&amp->a_rwlock);

			memidp->val[0] = (uintptr_t)ap;
			memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
			return (0);
		}
	}
	return (EINVAL);
}
static int
sameprot(struct seg *seg, caddr_t a, size_t len)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct vpage *vpage;
	spgcnt_t pages = btop(len);
	uint_t prot;

	if (svd->pageprot == 0)
		return (1);

	ASSERT(svd->vpage != NULL);

	vpage = &svd->vpage[seg_page(seg, a)];
	prot = VPP_PROT(vpage);
	vpage++;
	pages--;

	while (pages-- > 0) {
		if (prot != VPP_PROT(vpage))
			return (0);
		vpage++;
	}
	return (1);
}
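/*
 * In other words, sameprot() above reports whether [a, a + len) can be
 * treated as one uniformly protected range: it returns 1 when the segment has
 * no per-page protections at all or when every vpage in the range carries the
 * same protections, and 0 as soon as one vpage differs.
 */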
/*
 * Get memory allocation policy info for specified address in given segment
 */
static lgrp_mem_policy_info_t *
segvn_getpolicy(struct seg *seg, caddr_t addr)
{
	struct anon_map *amp;
	ulong_t anon_index;
	lgrp_mem_policy_info_t *policy_info;
	struct segvn_data *svn_data;
	uoff_t vn_off;
	vnode_t *vp;

	ASSERT(seg != NULL);

	svn_data = (struct segvn_data *)seg->s_data;
	if (svn_data == NULL)
		return (NULL);

	/*
	 * Get policy info for private or shared memory
	 */
	if (svn_data->type != MAP_SHARED) {
		if (svn_data->tr_state != SEGVN_TR_ON) {
			policy_info = &svn_data->policy_info;
		} else {
			policy_info = &svn_data->tr_policy_info;
			ASSERT(policy_info->mem_policy ==
			    LGRP_MEM_POLICY_NEXT_SEG);
		}
	} else {
		amp = svn_data->amp;
		anon_index = svn_data->anon_index + seg_page(seg, addr);
		vp = svn_data->vp;
		vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
		policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
	}

	return (policy_info);
}
/*
 * Bind text vnode segment to an amp. If we bind successfully mappings will be
 * established to per vnode mapping per lgroup amp pages instead of to vnode
 * pages. There's one amp per vnode text mapping per lgroup. Many processes
 * may share the same text replication amp. If a suitable amp doesn't already
 * exist in svntr hash table create a new one. We may fail to bind to amp if
 * segment is not eligible for text replication. Code below first checks for
 * these conditions. If binding is successful segment tr_state is set to on
 * and svd->amp points to the amp to use. Otherwise tr_state is set to off and
 * svd->amp remains as NULL.
 */
static void
segvn_textrepl(struct seg *seg)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	vnode_t *vp = svd->vp;
	uoff_t off = svd->offset;
	size_t size = seg->s_size;
	uoff_t eoff = off + size;
	uint_t szc = seg->s_szc;
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	svntr_t *svntrp;
	struct vattr va;
	proc_t *p = seg->s_as->a_proc;
	lgrp_id_t lgrp_id;
	lgrp_id_t olid;
	int first;
	struct anon_map *amp;

	ASSERT(AS_LOCK_HELD(seg->s_as));
	ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
	ASSERT(p != NULL);
	ASSERT(svd->tr_state == SEGVN_TR_INIT);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->flags & MAP_TEXT);
	ASSERT(svd->type == MAP_PRIVATE);
	ASSERT(vp != NULL && svd->amp == NULL);
	ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE));
	ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0);
	ASSERT(seg->s_as != &kas);
	ASSERT(off < eoff);
	ASSERT(svntr_hashtab != NULL);
	/*
	 * If numa optimizations are no longer desired bail out.
	 */
	if (!lgrp_optimizations()) {
		svd->tr_state = SEGVN_TR_OFF;
		return;
	}

	/*
	 * Avoid creating anon maps with size bigger than the file size.
	 * If fop_getattr() call fails bail out.
	 */
	va.va_mask = VATTR_SIZE | VATTR_MTIME | VATTR_CTIME;
	if (fop_getattr(vp, &va, 0, svd->cred, NULL) != 0) {
		svd->tr_state = SEGVN_TR_OFF;
		SEGVN_TR_ADDSTAT(gaerr);
		return;
	}
	if (btopr(va.va_size) < btopr(eoff)) {
		svd->tr_state = SEGVN_TR_OFF;
		SEGVN_TR_ADDSTAT(overmap);
		return;
	}
	/*
	 * VVMEXEC may not be set yet if exec() prefaults text segment. Set
	 * this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED
	 * mapping that checks if trcache for this vnode needs to be
	 * invalidated can't miss us.
	 */
	if (!(vp->v_flag & VVMEXEC)) {
		mutex_enter(&vp->v_lock);
		vp->v_flag |= VVMEXEC;
		mutex_exit(&vp->v_lock);
	}
	mutex_enter(&svntr_hashtab[hash].tr_lock);
	/*
	 * Bail out if potentially MAP_SHARED writable mappings exist to this
	 * vnode. We don't want to use old file contents from existing
	 * replicas if this mapping was established after the original file
	 * was changed.
	 */
	if (vn_is_mapped(vp, V_WRITE)) {
		mutex_exit(&svntr_hashtab[hash].tr_lock);
		svd->tr_state = SEGVN_TR_OFF;
		SEGVN_TR_ADDSTAT(wrcnt);
		return;
	}
	svntrp = svntr_hashtab[hash].tr_head;
	for (; svntrp != NULL; svntrp = svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp != vp) {
			continue;
		}

		/*
		 * Bail out if the file or its attributes were changed after
		 * this replication entry was created since we need to use the
		 * latest file contents. Note that mtime test alone is not
		 * sufficient because a user can explicitly change mtime via
		 * utimes(2) interfaces back to the old value after modifying
		 * the file contents. To detect this case we also have to test
		 * ctime which among other things records the time of the last
		 * mtime change by utimes(2). ctime is not changed when the file
		 * is only read or executed so we expect that typically existing
		 * replication amp's can be used most of the time.
		 */
		if (!svntrp->tr_valid ||
		    svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec ||
		    svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec ||
		    svntrp->tr_ctime.tv_sec != va.va_ctime.tv_sec ||
		    svntrp->tr_ctime.tv_nsec != va.va_ctime.tv_nsec) {
			mutex_exit(&svntr_hashtab[hash].tr_lock);
			svd->tr_state = SEGVN_TR_OFF;
			SEGVN_TR_ADDSTAT(stale);
			return;
		}
		/*
		 * if off, eoff and szc match current segment we found the
		 * existing entry we can use.
		 */
		if (svntrp->tr_off == off && svntrp->tr_eoff == eoff &&
		    svntrp->tr_szc == szc) {
			break;
		}
		/*
		 * Don't create different but overlapping in file offsets
		 * entries to avoid replication of the same file pages more
		 * than once per lgroup.
		 */
		if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) ||
		    (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) {
			mutex_exit(&svntr_hashtab[hash].tr_lock);
			svd->tr_state = SEGVN_TR_OFF;
			SEGVN_TR_ADDSTAT(overlap);
			return;
		}
	}
	/*
	 * If we didn't find existing entry create a new one.
	 */
	if (svntrp == NULL) {
		svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP);
		if (svntrp == NULL) {
			mutex_exit(&svntr_hashtab[hash].tr_lock);
			svd->tr_state = SEGVN_TR_OFF;
			SEGVN_TR_ADDSTAT(nokmem);
			return;
		}
#ifdef DEBUG
		{
			lgrp_id_t i;
			for (i = 0; i < NLGRPS_MAX; i++) {
				ASSERT(svntrp->tr_amp[i] == NULL);
			}
		}
#endif /* DEBUG */
		svntrp->tr_vp = vp;
		svntrp->tr_off = off;
		svntrp->tr_eoff = eoff;
		svntrp->tr_szc = szc;
		svntrp->tr_valid = 1;
		svntrp->tr_mtime = va.va_mtime;
		svntrp->tr_ctime = va.va_ctime;
		svntrp->tr_refcnt = 0;
		svntrp->tr_next = svntr_hashtab[hash].tr_head;
		svntr_hashtab[hash].tr_head = svntrp;
	}
	first = 1;
again:
	/*
	 * We want to pick a replica with pages on main thread's (t_tid = 1,
	 * aka T1) lgrp. Currently text replication is only optimized for
	 * workloads that either have all threads of a process on the same
	 * lgrp or execute their large text primarily on main thread.
	 */
	lgrp_id = p->p_t1_lgrpid;
	if (lgrp_id == LGRP_NONE) {
		/*
		 * In case exec() prefaults text on non main thread use
		 * current thread lgrpid. It will become main thread anyway
		 * soon.
		 */
		lgrp_id = lgrp_home_id(curthread);
	}
	/*
	 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet. Otherwise
	 * just set it to NLGRPS_MAX if it's different from current process T1
	 * home lgrp. p_tr_lgrpid is used to detect if process uses text
	 * replication and T1 new home is different from lgrp used for text
	 * replication. When this happens asynchronous segvn thread rechecks if
	 * segments should change lgrps used for text replication. If we fail
	 * to set p_tr_lgrpid with atomic_cas_32 then set it to NLGRPS_MAX
	 * without cas if it's not already NLGRPS_MAX and not equal lgrp_id
	 * we want to use. We don't need to use cas in this case because
	 * another thread that races in between our non atomic check and set
	 * may only change p_tr_lgrpid to NLGRPS_MAX at this point.
	 */
	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
	olid = p->p_tr_lgrpid;
	if (lgrp_id != olid && olid != NLGRPS_MAX) {
		lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX;
		if (atomic_cas_32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) !=
		    olid) {
			olid = p->p_tr_lgrpid;
			ASSERT(olid != LGRP_NONE);
			if (olid != lgrp_id && olid != NLGRPS_MAX) {
				p->p_tr_lgrpid = NLGRPS_MAX;
			}
		}
		ASSERT(p->p_tr_lgrpid != LGRP_NONE);
		membar_producer();
		/*
		 * lgrp_move_thread() won't schedule async recheck after
		 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not
		 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid
		 * is not LGRP_NONE.
		 */
		if (first && p->p_t1_lgrpid != LGRP_NONE &&
		    p->p_t1_lgrpid != lgrp_id) {
			first = 0;
			goto again;
		}
	}
	/*
	 * If no amp was created yet for lgrp_id create a new one as long as
	 * we have enough memory to afford it.
	 */
	if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) {
		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
		if (trmem > segvn_textrepl_max_bytes) {
			SEGVN_TR_ADDSTAT(normem);
			goto fail;
		}
		if (anon_try_resv_zone(size, NULL) == 0) {
			SEGVN_TR_ADDSTAT(noanon);
			goto fail;
		}
		amp = anonmap_alloc(size, size, ANON_NOSLEEP);
		if (amp == NULL) {
			anon_unresv_zone(size, NULL);
			SEGVN_TR_ADDSTAT(nokmem);
			goto fail;
		}
		ASSERT(amp->refcnt == 1);
		amp->a_szc = szc;
		svntrp->tr_amp[lgrp_id] = amp;
		SEGVN_TR_ADDSTAT(newamp);
	}
	svntrp->tr_refcnt++;
	ASSERT(svd->svn_trnext == NULL);
	ASSERT(svd->svn_trprev == NULL);
	svd->svn_trnext = svntrp->tr_svnhead;
	svd->svn_trprev = NULL;
	if (svntrp->tr_svnhead != NULL) {
		svntrp->tr_svnhead->svn_trprev = svd;
	}
	svntrp->tr_svnhead = svd;
	ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size);
	ASSERT(amp->refcnt >= 1);
	svd->amp = amp;
	svd->anon_index = 0;
	svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG;
	svd->tr_policy_info.mem_lgrpid = lgrp_id;
	svd->tr_state = SEGVN_TR_ON;
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	SEGVN_TR_ADDSTAT(repl);
	return;
fail:
	ASSERT(segvn_textrepl_bytes >= size);
	atomic_add_long(&segvn_textrepl_bytes, -size);
	ASSERT(svntrp != NULL);
	ASSERT(svntrp->tr_amp[lgrp_id] == NULL);
	if (svntrp->tr_refcnt == 0) {
		ASSERT(svntrp == svntr_hashtab[hash].tr_head);
		svntr_hashtab[hash].tr_head = svntrp->tr_next;
		mutex_exit(&svntr_hashtab[hash].tr_lock);
		kmem_cache_free(svntr_cache, svntrp);
	} else {
		mutex_exit(&svntr_hashtab[hash].tr_lock);
	}
	svd->tr_state = SEGVN_TR_OFF;
}
/*
 * Convert seg back to regular vnode mapping seg by unbinding it from its text
 * replication amp. This routine is most typically called when segment is
 * unmapped but can also be called when segment no longer qualifies for text
 * replication (e.g. due to protection changes). If unload_unmap is set use
 * HAT_UNLOAD_UNMAP flag in hat_unload_callback(). If we are the last user of
 * svntr free all its anon maps and remove it from the hash table.
 */
static void
segvn_textunrepl(struct seg *seg, int unload_unmap)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	vnode_t *vp = svd->vp;
	uoff_t off = svd->offset;
	size_t size = seg->s_size;
	uoff_t eoff = off + size;
	uint_t szc = seg->s_szc;
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	svntr_t *svntrp;
	svntr_t **prv_svntrp;
	lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid;
	lgrp_id_t i;

	ASSERT(AS_LOCK_HELD(seg->s_as));
	ASSERT(AS_WRITE_HELD(seg->s_as) ||
	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->amp != NULL);
	ASSERT(svd->amp->refcnt >= 1);
	ASSERT(svd->anon_index == 0);
	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
	ASSERT(svntr_hashtab != NULL);

	mutex_enter(&svntr_hashtab[hash].tr_lock);
	prv_svntrp = &svntr_hashtab[hash].tr_head;
	for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
		    svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) {
			break;
		}
	}
	if (svntrp == NULL) {
		panic("segvn_textunrepl: svntr record not found");
	}
	if (svntrp->tr_amp[lgrp_id] != svd->amp) {
		panic("segvn_textunrepl: amp mismatch");
	}
	svd->tr_state = SEGVN_TR_OFF;
	svd->amp = NULL;
	if (svd->svn_trprev == NULL) {
		ASSERT(svntrp->tr_svnhead == svd);
		svntrp->tr_svnhead = svd->svn_trnext;
		if (svntrp->tr_svnhead != NULL) {
			svntrp->tr_svnhead->svn_trprev = NULL;
		}
		svd->svn_trnext = NULL;
	} else {
		svd->svn_trprev->svn_trnext = svd->svn_trnext;
		if (svd->svn_trnext != NULL) {
			svd->svn_trnext->svn_trprev = svd->svn_trprev;
			svd->svn_trnext = NULL;
		}
		svd->svn_trprev = NULL;
	}
	if (--svntrp->tr_refcnt) {
		mutex_exit(&svntr_hashtab[hash].tr_lock);
		goto done;
	}
	*prv_svntrp = svntrp->tr_next;
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	for (i = 0; i < NLGRPS_MAX; i++) {
		struct anon_map *amp = svntrp->tr_amp[i];

		if (amp == NULL) {
			continue;
		}
		ASSERT(amp->refcnt == 1);
		ASSERT(amp->swresv == size);
		ASSERT(amp->size == size);
		ASSERT(amp->a_szc == szc);
		if (amp->a_szc != 0) {
			anon_free_pages(amp->ahp, 0, size, szc);
		} else {
			anon_free(amp->ahp, 0, size);
		}
		svntrp->tr_amp[i] = NULL;
		ASSERT(segvn_textrepl_bytes >= size);
		atomic_add_long(&segvn_textrepl_bytes, -size);
		anon_unresv_zone(amp->swresv, NULL);
		amp->refcnt = 0;
		anonmap_free(amp);
	}
	kmem_cache_free(svntr_cache, svntrp);
done:
	hat_unload_callback(seg->s_as->a_hat, seg->s_base, size,
	    unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL);
}
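/*
 * A note on the flag choice above: HAT_UNLOAD_UNMAP tells the hat layer the
 * range is going away for good, so it may tear down and free the mapping
 * resources, while a plain unload (flag 0) only invalidates the translations
 * so the segment can fault its pages back in later.
 */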
/*
 * This is called when a MAP_SHARED writable mapping is created to a vnode
 * that is currently used for execution (VVMEXEC flag is set). In this case we
 * need to prevent further use of existing replicas.
 */
static void
segvn_inval_trcache(vnode_t *vp)
{
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	svntr_t *svntrp;

	ASSERT(vp->v_flag & VVMEXEC);

	if (svntr_hashtab == NULL) {
		return;
	}

	mutex_enter(&svntr_hashtab[hash].tr_lock);
	svntrp = svntr_hashtab[hash].tr_head;
	for (; svntrp != NULL; svntrp = svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp == vp && svntrp->tr_valid) {
			svntrp->tr_valid = 0;
		}
	}
	mutex_exit(&svntr_hashtab[hash].tr_lock);
}
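/*
 * Note that the invalidation above only clears tr_valid: segments already
 * bound to a replica keep using it until they are unmapped, while the stale
 * check in segvn_textrepl() prevents any new bindings to the invalidated
 * entry.
 */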
static void
segvn_trasync_thread(void)
{
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;	/* just for CPR stuff */

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cpr_info, &cpr_lock,
	    callb_generic_cpr, "segvn_async");

	if (segvn_update_textrepl_interval == 0) {
		segvn_update_textrepl_interval = segvn_update_tr_time * hz;
	} else {
		segvn_update_textrepl_interval *= hz;
	}
	(void) timeout(segvn_trupdate_wakeup, NULL,
	    segvn_update_textrepl_interval);

	for (;;) {
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		sema_p(&segvn_trasync_sem);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);
		segvn_trupdate();
	}
}
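/*
 * The interval setup in segvn_trasync_thread() above converts a value
 * expressed in seconds into clock ticks. As a rough illustration, assuming a
 * segvn_update_tr_time of 10 seconds and hz of 100 (both are tunables, so
 * these numbers are only an assumption), the wakeup timeout fires about every
 * 1000 ticks, i.e. every 10 seconds.
 */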
static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0;

static void
segvn_trupdate_wakeup(void *dummy)
{
	uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations();

	if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) {
		segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs;
		sema_v(&segvn_trasync_sem);
	}

	if (!segvn_disable_textrepl_update &&
	    segvn_update_textrepl_interval != 0) {
		(void) timeout(segvn_trupdate_wakeup, dummy,
		    segvn_update_textrepl_interval);
	}
}
static void
segvn_trupdate(void)
{
	ulong_t hash;
	svntr_t *svntrp;
	segvn_data_t *svd;

	ASSERT(svntr_hashtab != NULL);

	for (hash = 0; hash < svntr_hashtab_sz; hash++) {
		mutex_enter(&svntr_hashtab[hash].tr_lock);
		svntrp = svntr_hashtab[hash].tr_head;
		for (; svntrp != NULL; svntrp = svntrp->tr_next) {
			ASSERT(svntrp->tr_refcnt != 0);
			svd = svntrp->tr_svnhead;
			for (; svd != NULL; svd = svd->svn_trnext) {
				segvn_trupdate_seg(svd->seg, svd, svntrp,
				    hash);
			}
		}
		mutex_exit(&svntr_hashtab[hash].tr_lock);
	}
}
static void
segvn_trupdate_seg(struct seg *seg, segvn_data_t *svd, svntr_t *svntrp,
    ulong_t hash)
{
	proc_t *p;
	lgrp_id_t lgrp_id;
	struct as *as;
	size_t size;
	struct anon_map *amp;

	ASSERT(svd->vp != NULL);
	ASSERT(svd->vp == svntrp->tr_vp);
	ASSERT(svd->offset == svntrp->tr_off);
	ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff);
	ASSERT(seg != NULL);
	ASSERT(svd->seg == seg);
	ASSERT(seg->s_data == (void *)svd);
	ASSERT(seg->s_szc == svntrp->tr_szc);
	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->amp != NULL);
	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
	ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE);
	ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX);
	ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp);
	ASSERT(svntrp->tr_refcnt != 0);
	ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock));
	as = seg->s_as;
	ASSERT(as != NULL && as != &kas);
	p = as->a_proc;
	ASSERT(p != NULL);
	ASSERT(p->p_tr_lgrpid != LGRP_NONE);
	lgrp_id = p->p_t1_lgrpid;
	if (lgrp_id == LGRP_NONE) {
		return;
	}
	ASSERT(lgrp_id < NLGRPS_MAX);
	if (svd->tr_policy_info.mem_lgrpid == lgrp_id) {
		return;
	}

	/*
	 * Use tryenter locking since we are locking as/seg and svntr hash
	 * lock in reverse from synchronous thread order.
	 */
	if (!AS_LOCK_TRYENTER(as, RW_READER)) {
		SEGVN_TR_ADDSTAT(nolock);
		if (segvn_lgrp_trthr_migrs_snpsht) {
			segvn_lgrp_trthr_migrs_snpsht = 0;
		}
		return;
	}
	if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) {
		AS_LOCK_EXIT(as);
		SEGVN_TR_ADDSTAT(nolock);
		if (segvn_lgrp_trthr_migrs_snpsht) {
			segvn_lgrp_trthr_migrs_snpsht = 0;
		}
		return;
	}
	size = seg->s_size;
	if (svntrp->tr_amp[lgrp_id] == NULL) {
		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
		if (trmem > segvn_textrepl_max_bytes) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			SEGVN_TR_ADDSTAT(normem);
			return;
		}
		if (anon_try_resv_zone(size, NULL) == 0) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			SEGVN_TR_ADDSTAT(noanon);
			return;
		}
		amp = anonmap_alloc(size, size, KM_NOSLEEP);
		if (amp == NULL) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			anon_unresv_zone(size, NULL);
			SEGVN_TR_ADDSTAT(nokmem);
			return;
		}
		ASSERT(amp->refcnt == 1);
		amp->a_szc = seg->s_szc;
		svntrp->tr_amp[lgrp_id] = amp;
	}
	/*
	 * We don't need to drop the bucket lock but here we give other
	 * threads a chance. svntr and svd can't be unlinked as long as
	 * segment lock is held as a writer and AS held as well. After we
	 * retake bucket lock we'll continue from where we left. We'll be able
	 * to reach the end of either list since new entries are always added
	 * to the beginning of the lists.
	 */
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL);
	mutex_enter(&svntr_hashtab[hash].tr_lock);

	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(svd->amp != NULL);
	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
	ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id);
	ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]);

	svd->tr_policy_info.mem_lgrpid = lgrp_id;
	svd->amp = svntrp->tr_amp[lgrp_id];
	p->p_tr_lgrpid = NLGRPS_MAX;
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	AS_LOCK_EXIT(as);

	ASSERT(svntrp->tr_refcnt != 0);
	ASSERT(svd->vp == svntrp->tr_vp);
	ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id);
	ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]);
	ASSERT(svd->seg == seg);
	ASSERT(svd->tr_state == SEGVN_TR_ON);

	SEGVN_TR_ADDSTAT(asyncrepl);
}