4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
40 * VM - anonymous pages.
42 * This layer sits immediately above the vm_swap layer. It manages
43 * physical pages that have no permanent identity in the file system
44 * name space, using the services of the vm_swap layer to allocate
45 * backing storage for these pages. Since these pages have no external
46 * identity, they are discarded when the last reference is removed.
48 * An important function of this layer is to manage low-level sharing
49 * of pages that are logically distinct but that happen to be
50 * physically identical (e.g., the corresponding pages of the processes
51 * resulting from a fork before one process or the other changes their
52 * contents). This pseudo-sharing is present only as an optimization
53 * and is not to be confused with true sharing in which multiple
54 * address spaces deliberately contain references to the same object;
55 * such sharing is managed at a higher level.
57 * The key data structure here is the anon struct, which contains a
58 * reference count for its associated physical page and a hint about
59 * the identity of that page. Anon structs typically live in arrays,
60 * with an instance's position in its array determining where the
61 * corresponding backing storage is allocated; however, the swap_xlate()
62 * routine abstracts away this representation information so that the
63 * rest of the anon layer need not know it. (See the swap layer for
64 * more details on anon struct layout.)
66 * In future versions of the system, the association between an
67 * anon struct and its position on backing store will change so that
68 * we don't require backing store for all anonymous pages in the system.
69 * This is an important consideration for large memory systems.
70 * We can also use this technique to delay binding physical locations
71 * to anonymous pages until pageout time where we can make smarter
72 * allocation decisions to improve anonymous klustering.
74 * Many of the routines defined here take a (struct anon **) argument,
75 * which allows the code at this level to manage anon pages directly,
76 * so that callers can regard anon structs as opaque objects and not be
77 * concerned with assigning or inspecting their contents.
79 * Clients of this layer refer to anon pages indirectly. That is, they
80 * maintain arrays of pointers to anon structs rather than maintaining
81 * anon structs themselves. The (struct anon **) arguments mentioned
82 * above are pointers to entries in these arrays. It is these arrays
83 * that capture the mapping between offsets within a given segment and
84 * the corresponding anonymous backing storage address.
91 #include <sys/types.h>
92 #include <sys/t_lock.h>
93 #include <sys/param.h>
94 #include <sys/systm.h>
97 #include <sys/thread.h>
98 #include <sys/vnode.h>
99 #include <sys/cpuvar.h>
100 #include <sys/swap.h>
101 #include <sys/cmn_err.h>
102 #include <sys/vtrace.h>
103 #include <sys/kmem.h>
104 #include <sys/sysmacros.h>
105 #include <sys/bitmap.h>
106 #include <sys/vmsystm.h>
107 #include <sys/tuneable.h>
108 #include <sys/debug.h>
109 #include <sys/fs/swapnode.h>
110 #include <sys/tnf_probe.h>
111 #include <sys/lgrp.h>
112 #include <sys/policy.h>
113 #include <sys/condvar_impl.h>
114 #include <sys/mutex_impl.h>
115 #include <sys/rctl.h>
121 #include <vm/vpage.h>
125 #include <sys/fs_subr.h>
/*
 * Vnode set up by the initialization code later in this file: it is
 * obtained from vn_alloc(), given swap_vnodeops, typed VREG and flagged
 * VISSWAP | VISSWAPFS.
 */
127 struct vnode
*anon_vp
;
/*
 * Held by anon_resvmem() and anon_unresvmem() while they read and update
 * the reservation counters in k_anoninfo.
 */
131 kmutex_t anoninfo_lock
;
/*
 * Swap accounting state. The fields touched in this file are ani_max,
 * ani_phys_resv, ani_mem_resv, ani_locked_swap and ani_free.
 */
132 struct k_anoninfo k_anoninfo
;
/*
 * Array of ANI_MAX_POOL counters, rounded up to a 64-byte boundary at
 * allocation time. Entries 0..max_cpu_seqid_ever have their ani_count
 * summed to recompute k_anoninfo.ani_free.
 */
133 ani_free_t
*ani_free_pool
;
/*
 * ANON_LOCKSIZE padded mutexes and the same number of condition
 * variables; entry i of each array is initialized together by the
 * initialization code.
 */
134 pad_mutex_t anon_array_lock
[ANON_LOCKSIZE
];
135 kcondvar_t anon_array_cv
[ANON_LOCKSIZE
];
138 * Global hash table for (vp, off) -> anon slot
/*
 * Defined elsewhere; the initialization code in this file sets it to
 * the number of pages in 1MB.
 */
140 extern int swap_maxcontig
;
/* Number of hash buckets: 1L << anon_hash_shift. */
141 size_t anon_hash_size
;
/* Computed at init as highbit((physmem / ANON_HASHAVELEN) - 1). */
142 unsigned int anon_hash_shift
;
/*
 * Bucket heads, anon_hash_size entries indexed by ANON_HASH(vp, off).
 * Slots in a bucket are chained through their an_hash field.
 */
143 struct anon
**anon_hash
;
/* kmem cache from which struct anon slots are allocated and freed. */
145 static struct kmem_cache
*anon_cache
;
/*
 * kmem cache for struct anon_map, created with
 * anonmap_cache_constructor/anonmap_cache_destructor.
 */
146 static struct kmem_cache
*anonmap_cache
;
/*
 * Array of AH_LOCK_SIZE padded mutexes, aligned to 64 bytes at init.
 * NOTE(review): presumably the array that AH_MUTEX() indexes; that macro
 * is defined outside this file view -- confirm.
 */
148 pad_mutex_t
*anonhash_lock
;
151 * Used to make the increment of all refcnts of all anon slots of a large
152 * page appear to be atomic. The lock is grabbed for the first anon slot of
/*
 * Array of AH_LOCK_SIZE padded mutexes. It is carved out of the same
 * allocation as anonhash_lock and starts AH_LOCK_SIZE entries after it.
 */
155 pad_mutex_t
*anonpages_hash_lock
;
/*
 * APH_MUTEX(vp, off) yields a pointer to the anonpages_hash_lock mutex
 * selected by ANON_HASH(vp, off) masked with (AH_LOCK_SIZE - 1).
 * NOTE(review): the mask only acts as a modulo if AH_LOCK_SIZE is a
 * power of two -- confirm against its definition.
 */
157 #define APH_MUTEX(vp, off) \
158 (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \
159 (AH_LOCK_SIZE - 1))].pad_mutex)
162 static struct anonvmstats_str
{
163 ulong_t getpages
[30];
164 ulong_t privatepages
[10];
165 ulong_t demotepages
[9];
166 ulong_t decrefpages
[9];
167 ulong_t dupfillholes
[4];
168 ulong_t freepages
[1];
170 #endif /* VM_STATS */
174 anonmap_cache_constructor(void *buf
, void *cdrarg
, int kmflags
)
176 struct anon_map
*amp
= buf
;
178 rw_init(&
->a_rwlock
, NULL
, RW_DEFAULT
, NULL
);
179 cv_init(&
->a_purgecv
, NULL
, CV_DEFAULT
, NULL
);
180 mutex_init(&
->a_pmtx
, NULL
, MUTEX_DEFAULT
, NULL
);
181 mutex_init(&
->a_purgemtx
, NULL
, MUTEX_DEFAULT
, NULL
);
187 anonmap_cache_destructor(void *buf
, void *cdrarg
)
189 struct anon_map
*amp
= buf
;
191 rw_destroy(&
->a_rwlock
);
192 cv_destroy(&
->a_purgecv
);
193 mutex_destroy(&
->a_pmtx
);
194 mutex_destroy(&
->a_purgemtx
);
203 /* These both need to be powers of 2 so round up to the next power */
204 anon_hash_shift
= highbit((physmem
/ ANON_HASHAVELEN
) - 1);
205 anon_hash_size
= 1L << anon_hash_shift
;
208 * We need to align the anonhash_lock and anonpages_hash_lock arrays
209 * to a 64B boundary to avoid false sharing. We add 63B to our
210 * allocation so that we can get a 64B aligned address to use.
211 * We allocate both of these together to avoid wasting an additional
214 tmp
= kmem_zalloc((2 * AH_LOCK_SIZE
* sizeof (pad_mutex_t
)) + 63,
216 anonhash_lock
= (pad_mutex_t
*)P2ROUNDUP((uintptr_t)tmp
, 64);
217 anonpages_hash_lock
= anonhash_lock
+ AH_LOCK_SIZE
;
219 for (i
= 0; i
< AH_LOCK_SIZE
; i
++) {
220 mutex_init(&anonhash_lock
[i
].pad_mutex
, NULL
, MUTEX_DEFAULT
,
222 mutex_init(&anonpages_hash_lock
[i
].pad_mutex
, NULL
,
223 MUTEX_DEFAULT
, NULL
);
226 for (i
= 0; i
< ANON_LOCKSIZE
; i
++) {
227 mutex_init(&anon_array_lock
[i
].pad_mutex
, NULL
,
228 MUTEX_DEFAULT
, NULL
);
229 cv_init(&anon_array_cv
[i
], NULL
, CV_DEFAULT
, NULL
);
232 anon_hash
= (struct anon
**)
233 kmem_zalloc(sizeof (struct anon
*) * anon_hash_size
, KM_SLEEP
);
234 anon_cache
= kmem_cache_create("anon_cache", sizeof (struct anon
),
235 AN_CACHE_ALIGN
, NULL
, NULL
, NULL
, NULL
, NULL
, KMC_PREFILL
);
236 anonmap_cache
= kmem_cache_create("anonmap_cache",
237 sizeof (struct anon_map
), 0,
238 anonmap_cache_constructor
, anonmap_cache_destructor
, NULL
,
240 swap_maxcontig
= (1024 * 1024) >> PAGESHIFT
; /* 1MB of pages */
242 tmp
= kmem_zalloc((ANI_MAX_POOL
* sizeof (ani_free_t
)) + 63, KM_SLEEP
);
243 /* Round ani_free_pool to cacheline boundary to avoid false sharing. */
244 ani_free_pool
= (ani_free_t
*)P2ROUNDUP((uintptr_t)tmp
, 64);
246 anon_vp
= vn_alloc(KM_SLEEP
);
247 vn_setops(anon_vp
, &swap_vnodeops
);
248 anon_vp
->v_type
= VREG
;
249 anon_vp
->v_flag
|= (VISSWAP
|VISSWAPFS
);
253 * Global anon slot hash table manipulation.
257 anon_addhash(struct anon
*ap
)
261 ASSERT(MUTEX_HELD(AH_MUTEX(ap
->an_vp
, ap
->an_off
)));
262 index
= ANON_HASH(ap
->an_vp
, ap
->an_off
);
263 ap
->an_hash
= anon_hash
[index
];
264 anon_hash
[index
] = ap
;
268 anon_rmhash(struct anon
*ap
)
272 ASSERT(MUTEX_HELD(AH_MUTEX(ap
->an_vp
, ap
->an_off
)));
274 for (app
= &anon_hash
[ANON_HASH(ap
->an_vp
, ap
->an_off
)];
275 *app
; app
= &((*app
)->an_hash
)) {
284 * The anon array interfaces. Functions allocating,
285 * freeing array of pointers, and returning/setting
286 * entries in the array of pointers for a given offset.
288 * Create the list of pointers
291 anon_create(pgcnt_t npages
, int flags
)
293 struct anon_hdr
*ahp
;
295 int kmemflags
= (flags
& ANON_NOSLEEP
) ? KM_NOSLEEP
: KM_SLEEP
;
297 if ((ahp
= kmem_zalloc(sizeof (struct anon_hdr
), kmemflags
)) == NULL
) {
301 mutex_init(&ahp
->serial_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
306 if (npages
<= ANON_CHUNK_SIZE
|| (flags
& ANON_ALLOC_FORCE
)) {
308 if (flags
& ANON_ALLOC_FORCE
)
309 ahp
->flags
|= ANON_ALLOC_FORCE
;
311 ahp
->array_chunk
= kmem_zalloc(
312 ahp
->size
* sizeof (struct anon
*), kmemflags
);
314 if (ahp
->array_chunk
== NULL
) {
315 kmem_free(ahp
, sizeof (struct anon_hdr
));
321 * anon hdr size needs to be rounded off to be a multiple
322 * of ANON_CHUNK_SIZE. This is important as various anon
323 * related functions depend on this.
325 * anon_grow() makes anon hdr size a multiple of
327 * amp size is <= anon hdr size.
328 * anon_index + seg_pgs <= anon hdr size.
330 ahp
->size
= P2ROUNDUP(npages
, ANON_CHUNK_SIZE
);
331 nchunks
= ahp
->size
>> ANON_CHUNK_SHIFT
;
333 ahp
->array_chunk
= kmem_zalloc(nchunks
* sizeof (ulong_t
*),
336 if (ahp
->array_chunk
== NULL
) {
337 kmem_free(ahp
, sizeof (struct anon_hdr
));
345 * Free the array of pointers
348 anon_release(struct anon_hdr
*ahp
, pgcnt_t npages
)
354 ASSERT(npages
<= ahp
->size
);
359 if (npages
<= ANON_CHUNK_SIZE
|| (ahp
->flags
& ANON_ALLOC_FORCE
)) {
360 kmem_free(ahp
->array_chunk
, ahp
->size
* sizeof (struct anon
*));
365 nchunks
= ahp
->size
>> ANON_CHUNK_SHIFT
;
366 for (i
= 0; i
< nchunks
; i
++) {
367 ppp
= &ahp
->array_chunk
[i
];
369 kmem_free(*ppp
, PAGESIZE
);
371 kmem_free(ahp
->array_chunk
, nchunks
* sizeof (ulong_t
*));
373 mutex_destroy(&ahp
->serial_lock
);
374 kmem_free(ahp
, sizeof (struct anon_hdr
));
378 * Return the pointer from the list for a
379 * specified anon index.
382 anon_get_ptr(struct anon_hdr
*ahp
, ulong_t an_idx
)
386 ASSERT(an_idx
< ahp
->size
);
391 if ((ahp
->size
<= ANON_CHUNK_SIZE
) || (ahp
->flags
& ANON_ALLOC_FORCE
)) {
392 return ((struct anon
*)
393 ((uintptr_t)ahp
->array_chunk
[an_idx
] & ANON_PTRMASK
));
399 app
= ahp
->array_chunk
[an_idx
>> ANON_CHUNK_SHIFT
];
401 return ((struct anon
*)
402 ((uintptr_t)app
[an_idx
& ANON_CHUNK_OFF
] &
411 * Return the anon pointer for the first valid entry in the anon list,
412 * starting from the given index.
415 anon_get_next_ptr(struct anon_hdr
*ahp
, ulong_t
*index
)
429 if ((size
<= ANON_CHUNK_SIZE
) || (ahp
->flags
& ANON_ALLOC_FORCE
)) {
435 ((uintptr_t)ahp
->array_chunk
[i
] & ANON_PTRMASK
);
446 chunkoff
= i
& ANON_CHUNK_OFF
;
448 app
= ahp
->array_chunk
[i
>> ANON_CHUNK_SHIFT
];
450 for (j
= chunkoff
; j
< ANON_CHUNK_SIZE
; j
++) {
452 ((uintptr_t)app
[j
] & ANON_PTRMASK
);
454 *index
= i
+ (j
- chunkoff
);
459 i
= (i
+ ANON_CHUNK_SIZE
) & ~ANON_CHUNK_OFF
;
467 * Set list entry with a given pointer for a specified offset
470 anon_set_ptr(struct anon_hdr
*ahp
, ulong_t an_idx
, struct anon
*ap
, int flags
)
474 int kmemflags
= (flags
& ANON_NOSLEEP
) ? KM_NOSLEEP
: KM_SLEEP
;
477 ASSERT(an_idx
< ahp
->size
);
482 if (ahp
->size
<= ANON_CHUNK_SIZE
|| (ahp
->flags
& ANON_ALLOC_FORCE
)) {
483 ap_addr
= (uintptr_t *)&ahp
->array_chunk
[an_idx
];
489 ppp
= &ahp
->array_chunk
[an_idx
>> ANON_CHUNK_SHIFT
];
493 mutex_enter(&ahp
->serial_lock
);
494 ppp
= &ahp
->array_chunk
[an_idx
>> ANON_CHUNK_SHIFT
];
496 *ppp
= kmem_zalloc(PAGESIZE
, kmemflags
);
498 mutex_exit(&ahp
->serial_lock
);
502 mutex_exit(&ahp
->serial_lock
);
505 ap_addr
= (uintptr_t *)&app
[an_idx
& ANON_CHUNK_OFF
];
507 *ap_addr
= (*ap_addr
& ~ANON_PTRMASK
) | (uintptr_t)ap
;
512 * Copy anon array into a given new anon array
515 anon_copy_ptr(struct anon_hdr
*sahp
, ulong_t s_idx
,
516 struct anon_hdr
*dahp
, ulong_t d_idx
,
517 pgcnt_t npages
, int flags
)
521 int kmemflags
= (flags
& ANON_NOSLEEP
) ? KM_NOSLEEP
: KM_SLEEP
;
523 ASSERT((s_idx
< sahp
->size
) && (d_idx
< dahp
->size
));
524 ASSERT((npages
<= sahp
->size
) && (npages
<= dahp
->size
));
527 * Both arrays are 1 level.
529 if (((sahp
->size
<= ANON_CHUNK_SIZE
) &&
530 (dahp
->size
<= ANON_CHUNK_SIZE
)) ||
531 ((sahp
->flags
& ANON_ALLOC_FORCE
) &&
532 (dahp
->flags
& ANON_ALLOC_FORCE
))) {
534 bcopy(&sahp
->array_chunk
[s_idx
], &dahp
->array_chunk
[d_idx
],
535 npages
* sizeof (struct anon
*));
540 * Both arrays are 2 levels.
542 if (sahp
->size
> ANON_CHUNK_SIZE
&&
543 dahp
->size
> ANON_CHUNK_SIZE
&&
544 ((sahp
->flags
& ANON_ALLOC_FORCE
) == 0) &&
545 ((dahp
->flags
& ANON_ALLOC_FORCE
) == 0)) {
547 ulong_t sapidx
, dapidx
;
551 while (npages
!= 0) {
553 sapidx
= s_idx
& ANON_CHUNK_OFF
;
554 dapidx
= d_idx
& ANON_CHUNK_OFF
;
555 chknp
= ANON_CHUNK_SIZE
- MAX(sapidx
, dapidx
);
559 sapp
= &sahp
->array_chunk
[s_idx
>> ANON_CHUNK_SHIFT
];
560 if ((sap
= *sapp
) != NULL
) {
561 dapp
= &dahp
->array_chunk
[d_idx
562 >> ANON_CHUNK_SHIFT
];
563 if ((dap
= *dapp
) == NULL
) {
564 *dapp
= kmem_zalloc(PAGESIZE
,
566 if ((dap
= *dapp
) == NULL
)
569 bcopy((sap
+ sapidx
), (dap
+ dapidx
),
570 chknp
<< ANON_PTRSHIFT
);
580 * At least one of the arrays is 2 level.
583 if ((ap
= anon_get_ptr(sahp
, s_idx
)) != NULL
) {
584 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp
, s_idx
)));
585 if (anon_set_ptr(dahp
, d_idx
, ap
, flags
) == ENOMEM
)
596 * ANON_INITBUF is a convenience macro for anon_grow() below. It
597 * takes a buffer dst, which is at least as large as buffer src. It
598 * does a bcopy from src into dst, and then bzeros the extra bytes
599 * of dst. If tail is set, the data in src is tail aligned within
600 * dst instead of head aligned.
603 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \
605 bzero((dst), (dstsize) - (srclen)); \
606 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \
608 bcopy((src), (dst), (srclen)); \
609 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \
/*
 * Rounding granularities used by anon_grow() when sizing the new array:
 * totpages is rounded up with P2ROUNDUP() to a multiple of
 * ANON_1_LEVEL_INC when the result is a one-level array, and to a
 * multiple of ANON_2_LEVEL_INC otherwise. ANON_1_LEVEL_INC is one eighth
 * of a chunk; ANON_2_LEVEL_INC is that many whole chunks, in pages.
 * NOTE(review): P2ROUNDUP presumably requires both to be powers of two.
 */
612 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8)
613 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
616 * anon_grow() is used to efficiently extend an existing anon array.
617 * startidx_p points to the index into the anon array of the first page
618 * that is in use. oldseg_pgs is the number of pages in use, starting at
619 * *startidx_p. newpages is the number of additional pages desired.
621 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed.
623 * The growth is done by creating a new top level of the anon array,
624 * and (if the array is 2-level) reusing the existing second level arrays.
626 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
628 * Returns the new number of pages in the anon array.
631 anon_grow(struct anon_hdr
*ahp
, ulong_t
*startidx_p
, pgcnt_t oldseg_pgs
,
632 pgcnt_t newseg_pgs
, int flags
)
634 ulong_t startidx
= startidx_p
? *startidx_p
: 0;
635 pgcnt_t oldamp_pgs
= ahp
->size
, newamp_pgs
;
636 pgcnt_t oelems
, nelems
, totpages
;
638 int kmemflags
= (flags
& ANON_NOSLEEP
) ? KM_NOSLEEP
: KM_SLEEP
;
639 int growdown
= (flags
& ANON_GROWDOWN
);
640 size_t newarrsz
, oldarrsz
;
643 ASSERT(!(startidx_p
== NULL
&& growdown
));
644 ASSERT(startidx
+ oldseg_pgs
<= ahp
->size
);
647 * Determine the total number of pages needed in the new
648 * anon array. If growing down, totpages is all pages from
649 * startidx through the end of the array, plus <newseg_pgs>
650 * pages. If growing up, keep all pages from page 0 through
651 * the last page currently in use, plus <newseg_pgs> pages.
654 totpages
= oldamp_pgs
- startidx
+ newseg_pgs
;
656 totpages
= startidx
+ oldseg_pgs
+ newseg_pgs
;
658 /* If the array is already large enough, just return. */
660 if (oldamp_pgs
>= totpages
) {
662 *startidx_p
= oldamp_pgs
- totpages
;
667 * oldamp_pgs/newamp_pgs are the total numbers of pages represented
668 * by the corresponding arrays.
669 * oelems/nelems are the number of pointers in the top level arrays
670 * which may be either level 1 or level 2.
671 * Will the new anon array be one level or two levels?
673 if (totpages
<= ANON_CHUNK_SIZE
|| (ahp
->flags
& ANON_ALLOC_FORCE
)) {
674 newamp_pgs
= P2ROUNDUP(totpages
, ANON_1_LEVEL_INC
);
678 newamp_pgs
= P2ROUNDUP(totpages
, ANON_2_LEVEL_INC
);
679 oelems
= (oldamp_pgs
+ ANON_CHUNK_OFF
) >> ANON_CHUNK_SHIFT
;
680 nelems
= newamp_pgs
>> ANON_CHUNK_SHIFT
;
683 newarrsz
= nelems
* sizeof (void *);
684 level1
= kmem_alloc(newarrsz
, kmemflags
);
688 /* Are we converting from a one level to a two level anon array? */
690 if (newamp_pgs
> ANON_CHUNK_SIZE
&& oldamp_pgs
<= ANON_CHUNK_SIZE
&&
691 !(ahp
->flags
& ANON_ALLOC_FORCE
)) {
694 * Yes, we're converting to a two level. Reuse old level 1
695 * as new level 2 if it is exactly PAGESIZE. Otherwise
696 * alloc a new level 2 and copy the old level 1 data into it.
698 if (oldamp_pgs
== ANON_CHUNK_SIZE
) {
699 level2
= (void *)ahp
->array_chunk
;
701 level2
= kmem_alloc(PAGESIZE
, kmemflags
);
702 if (level2
== NULL
) {
703 kmem_free(level1
, newarrsz
);
706 oldarrsz
= oldamp_pgs
* sizeof (void *);
708 ANON_INITBUF(ahp
->array_chunk
, oldarrsz
,
709 level2
, PAGESIZE
, growdown
);
710 kmem_free(ahp
->array_chunk
, oldarrsz
);
712 bzero(level1
, newarrsz
);
714 level1
[nelems
- 1] = level2
;
718 oldarrsz
= oelems
* sizeof (void *);
720 ANON_INITBUF(ahp
->array_chunk
, oldarrsz
,
721 level1
, newarrsz
, growdown
);
722 kmem_free(ahp
->array_chunk
, oldarrsz
);
725 ahp
->array_chunk
= level1
;
726 ahp
->size
= newamp_pgs
;
728 *startidx_p
= newamp_pgs
- totpages
;
735 * Called to sync ani_free value.
741 processorid_t ix
, max_seqid
;
743 static clock_t last_time
;
746 if (ani_free_pool
== NULL
)
750 * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to
751 * identify the maximum number of CPUs that were ever online.
753 new_time
= ddi_get_lbolt();
754 if (new_time
> last_time
) {
756 max_seqid
= max_cpu_seqid_ever
;
757 ASSERT(ANI_MAX_POOL
> max_seqid
);
758 for (ix
= 0; ix
<= max_seqid
; ix
++)
759 total
+= ani_free_pool
[ix
].ani_count
;
761 last_time
= new_time
;
762 k_anoninfo
.ani_free
= total
;
767 * Reserve anon space.
769 * It's no longer simply a matter of incrementing ani_resv to
770 * reserve swap space, we need to check memory-based as well
771 * as disk-backed (physical) swap. The following algorithm
773 * Check the space on physical swap
774 * i.e. amount needed < ani_max - ani_phys_resv
775 * If we are swapping on swapfs check
776 * amount needed < (availrmem - swapfs_minfree)
777 * Since the algorithm to check for the quantity of swap space is
778 * almost the same as that for reserving it, we'll just use anon_resvmem
779 * with a flag to decrement availrmem.
781 * Return non-zero on success.
784 anon_resvmem(size_t size
, boolean_t takemem
, zone_t
*zone
, int tryhard
)
786 pgcnt_t npages
= btopr(size
);
787 pgcnt_t mswap_pages
= 0;
788 pgcnt_t pswap_pages
= 0;
792 /* test zone.max-swap resource control */
793 mutex_enter(&p
->p_lock
);
794 if (rctl_incr_swap(p
, zone
, ptob(npages
)) != 0) {
795 mutex_exit(&p
->p_lock
);
798 atomic_add_64(&zone
->zone_anon_alloc_fail
, 1);
804 rctl_decr_swap(zone
, ptob(npages
));
806 mutex_exit(&p
->p_lock
);
808 mutex_enter(&anoninfo_lock
);
811 * pswap_pages is the number of pages we can take from
812 * physical (i.e. disk-backed) swap.
814 ASSERT(k_anoninfo
.ani_max
>= k_anoninfo
.ani_phys_resv
);
815 pswap_pages
= k_anoninfo
.ani_max
- k_anoninfo
.ani_phys_resv
;
818 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
819 npages
, takemem
, pswap_pages
, (void *)caller()));
821 if (npages
<= pswap_pages
) {
823 * we have enough space on a physical swap
826 k_anoninfo
.ani_phys_resv
+= npages
;
827 mutex_exit(&anoninfo_lock
);
829 } else if (pswap_pages
!= 0) {
831 * we have some space on a physical swap
835 * use up remainder of phys swap
837 k_anoninfo
.ani_phys_resv
+= pswap_pages
;
838 ASSERT(k_anoninfo
.ani_phys_resv
== k_anoninfo
.ani_max
);
842 * since (npages > pswap_pages) we need mem swap
843 * mswap_pages is the number of pages needed from availrmem
845 ASSERT(npages
> pswap_pages
);
846 mswap_pages
= npages
- pswap_pages
;
848 ANON_PRINT(A_RESV
, ("anon_resvmem: need %ld pages from memory\n",
852 * priv processes can reserve memory as swap as long as availrmem
853 * remains greater than swapfs_minfree; in the case of non-priv
854 * processes, memory can be reserved as swap only if availrmem
855 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
856 * swapfs_reserve amount of memswap is not available to non-priv
857 * processes. This protects daemons such as automounter from dying
858 * as a result of application processes eating away almost entire
859 * membased swap. This safeguard becomes useless if apps are run
862 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem.
868 if (secpolicy_resource_anon_mem(CRED())) {
869 floor_pages
= swapfs_minfree
;
871 floor_pages
= swapfs_minfree
+ swapfs_reserve
;
874 mutex_exit(&anoninfo_lock
);
875 (void) page_reclaim_mem(mswap_pages
, floor_pages
, 0);
876 mutex_enter(&anoninfo_lock
);
879 mutex_enter(&freemem_lock
);
880 if (availrmem
> (swapfs_minfree
+ swapfs_reserve
+ mswap_pages
) ||
881 (availrmem
> (swapfs_minfree
+ mswap_pages
) &&
882 secpolicy_resource(CRED()) == 0)) {
886 * Take the memory from the rest of the system.
888 availrmem
-= mswap_pages
;
889 mutex_exit(&freemem_lock
);
890 k_anoninfo
.ani_mem_resv
+= mswap_pages
;
891 ANI_ADD(mswap_pages
);
892 ANON_PRINT((A_RESV
| A_MRESV
),
893 ("anon_resvmem: took %ld pages of availrmem\n",
896 mutex_exit(&freemem_lock
);
899 ASSERT(k_anoninfo
.ani_max
>= k_anoninfo
.ani_phys_resv
);
900 mutex_exit(&anoninfo_lock
);
904 * Fail if not enough memory
907 k_anoninfo
.ani_phys_resv
-= pswap_pages
;
910 mutex_exit(&freemem_lock
);
911 mutex_exit(&anoninfo_lock
);
913 ("anon_resvmem: not enough space from swapfs\n"));
914 if (zone
!= NULL
&& takemem
)
915 rctl_decr_swap(zone
, ptob(npages
));
921 * Give back an anon reservation.
924 anon_unresvmem(size_t size
, zone_t
*zone
)
926 pgcnt_t npages
= btopr(size
);
927 spgcnt_t mem_free_pages
= 0;
928 pgcnt_t phys_free_slots
;
933 rctl_decr_swap(zone
, ptob(npages
));
935 mutex_enter(&anoninfo_lock
);
937 ASSERT(k_anoninfo
.ani_mem_resv
>= k_anoninfo
.ani_locked_swap
);
940 * If some of this reservation belonged to swapfs
941 * give it back to availrmem.
942 * ani_mem_resv is the amount of availrmem swapfs has reserved.
943 * but some of that memory could be locked by segspt so we can only
944 * return non locked ani_mem_resv back to availrmem
946 if (k_anoninfo
.ani_mem_resv
> k_anoninfo
.ani_locked_swap
) {
947 ANON_PRINT((A_RESV
| A_MRESV
),
948 ("anon_unresv: growing availrmem by %ld pages\n",
949 MIN(k_anoninfo
.ani_mem_resv
, npages
)));
951 mem_free_pages
= MIN((spgcnt_t
)(k_anoninfo
.ani_mem_resv
-
952 k_anoninfo
.ani_locked_swap
), npages
);
953 mutex_enter(&freemem_lock
);
954 availrmem
+= mem_free_pages
;
955 mutex_exit(&freemem_lock
);
956 k_anoninfo
.ani_mem_resv
-= mem_free_pages
;
958 ANI_ADD(-mem_free_pages
);
961 * The remainder of the pages is returned to phys swap
963 ASSERT(npages
>= mem_free_pages
);
964 phys_free_slots
= npages
- mem_free_pages
;
966 if (phys_free_slots
) {
967 k_anoninfo
.ani_phys_resv
-= phys_free_slots
;
971 mem_resv
= k_anoninfo
.ani_mem_resv
;
974 ASSERT(k_anoninfo
.ani_mem_resv
>= k_anoninfo
.ani_locked_swap
);
975 ASSERT(k_anoninfo
.ani_max
>= k_anoninfo
.ani_phys_resv
);
977 mutex_exit(&anoninfo_lock
);
979 ANON_PRINT(A_RESV
, ("anon_unresv: %lu, tot %lu, caller %p\n",
980 npages
, mem_resv
, (void *)caller()));
984 * Allocate an anon slot and return it with the lock held.
987 anon_alloc(struct vnode
*vp
, anoff_t off
)
992 ap
= kmem_cache_alloc(anon_cache
, KM_SLEEP
);
1002 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1007 ANON_PRINT(A_ANON
, ("anon_alloc: returning ap %p, vp %p\n",
1008 (void *)ap
, (ap
? (void *)ap
->an_vp
: NULL
)));
1013 * Called for pages locked in memory via softlock/pagelock/mlock to make sure
1014 * such pages don't consume any physical swap resources needed for swapping
1018 anon_swap_free(struct anon
*ap
, page_t
*pp
)
1024 ASSERT(PAGE_LOCKED(pp
));
1025 VERIFY(pp
->p_object
!= NULL
);
1026 ASSERT(pp
->p_vnode
!= NULL
);
1027 ASSERT(IS_SWAPFSVP(pp
->p_vnode
));
1028 ASSERT(ap
->an_refcnt
!= 0);
1029 VERIFY(pp
->p_object
== &ap
->an_vp
->v_object
);
1030 ASSERT(pp
->p_vnode
== ap
->an_vp
);
1031 ASSERT(pp
->p_offset
== ap
->an_off
);
1033 if (ap
->an_pvp
== NULL
)
1037 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1040 ASSERT(ap
->an_refcnt
!= 0);
1041 VERIFY(pp
->p_object
== &ap
->an_vp
->v_object
);
1042 ASSERT(pp
->p_vnode
== ap
->an_vp
);
1043 ASSERT(pp
->p_offset
== ap
->an_off
);
1045 if (ap
->an_pvp
!= NULL
) {
1046 swap_phys_free(ap
->an_pvp
, ap
->an_poff
, PAGESIZE
);
1058 * Decrement the reference count of an anon page.
1059 * If reference count goes to zero, free it and
1060 * its associated page (if any).
1063 anon_decref(struct anon
*ap
)
1070 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1072 ASSERT(ap
->an_refcnt
!= 0);
1073 if (ap
->an_refcnt
== 0)
1074 panic("anon_decref: slot count 0");
1075 if (--ap
->an_refcnt
== 0) {
1076 swap_xlate(ap
, &vp
, &off
);
1078 if (ap
->an_pvp
!= NULL
)
1079 swap_phys_free(ap
->an_pvp
, ap
->an_poff
, PAGESIZE
);
1083 * If there is a page for this anon slot we will need to
1084 * call VN_DISPOSE to get rid of the vp association and
1085 * put the page back on the free list as really free.
1086 * Acquire the "exclusive" lock to ensure that any
1087 * pending i/o always completes before the swap slot
1090 pp
= page_lookup(&vp
->v_object
, (uoff_t
)off
, SE_EXCL
);
1092 VN_DISPOSE(pp
, B_INVAL
, 0, kcred
);
1094 ANON_PRINT(A_ANON
, ("anon_decref: free ap %p, vp %p\n",
1095 (void *)ap
, (void *)ap
->an_vp
));
1097 kmem_cache_free(anon_cache
, ap
);
1107 * check an_refcnt of the root anon slot (anon_index argument is aligned at
1108 * seg->s_szc level) to determine whether COW processing is required.
1109 * anonpages_hash_lock[] held on the root ap ensures that if root's
1110 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase
1111 * later since this process can't fork while its AS lock is held).
1113 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0.
1116 anon_szcshare(struct anon_hdr
*ahp
, ulong_t anon_index
)
1119 kmutex_t
*ahmpages
= NULL
;
1121 ap
= anon_get_ptr(ahp
, anon_index
);
1125 ahmpages
= APH_MUTEX(ap
->an_vp
, ap
->an_off
);
1126 mutex_enter(ahmpages
);
1127 ASSERT(ap
->an_refcnt
>= 1);
1128 if (ap
->an_refcnt
== 1) {
1129 mutex_exit(ahmpages
);
1132 mutex_exit(ahmpages
);
1136 * Check 'nslots' anon slots for refcnt > 1.
1138 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
1142 anon_share(struct anon_hdr
*ahp
, ulong_t anon_index
, pgcnt_t nslots
)
1146 while (nslots
-- > 0) {
1147 if ((ap
= anon_get_ptr(ahp
, anon_index
)) != NULL
&&
1158 struct anon_hdr
*ahp
,
1162 struct anon
*ap
= anon_get_ptr(ahp
, an_idx
);
1163 kmutex_t
*ahmpages
= NULL
;
1165 pgcnt_t pgcnt
= page_get_pagecnt(szc
);
1175 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
1176 ASSERT(IS_P2ALIGNED(an_idx
, pgcnt
));
1177 ASSERT(an_idx
< ahp
->size
);
1179 if (ahp
->size
- an_idx
< pgcnt
) {
1181 * In case of shared mappings total anon map size may not be
1182 * aligned to the largest page size.
1184 pgcnt
= ahp
->size
- an_idx
;
1187 VM_STAT_ADD(anonvmstats
.decrefpages
[0]);
1190 ahmpages
= APH_MUTEX(ap
->an_vp
, ap
->an_off
);
1191 mutex_enter(ahmpages
);
1192 ASSERT((refcnt
= ap
->an_refcnt
) != 0);
1193 VM_STAT_ADD(anonvmstats
.decrefpages
[1]);
1194 if (ap
->an_refcnt
== 1) {
1195 VM_STAT_ADD(anonvmstats
.decrefpages
[2]);
1196 ASSERT(!anon_share(ahp
, an_idx
, pgcnt
));
1197 mutex_exit(ahmpages
);
1204 if ((ap
= anon_get_ptr(ahp
, an_idx
+ i
)) == NULL
) {
1205 ASSERT(refcnt
== 1 && ahmpages
== NULL
);
1209 ASSERT(ap
->an_refcnt
== refcnt
);
1210 ASSERT(ahmpages
!= NULL
|| ap
->an_refcnt
== 1);
1211 ASSERT(ahmpages
== NULL
|| ap
->an_refcnt
> 1);
1213 if (ahmpages
== NULL
) {
1214 swap_xlate(ap
, &vp
, &off
);
1215 pp
= page_lookup(&vp
->v_object
, (uoff_t
)off
, SE_EXCL
);
1216 if (pp
== NULL
|| pp
->p_szc
== 0) {
1217 VM_STAT_ADD(anonvmstats
.decrefpages
[3]);
1218 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1219 (void) anon_set_ptr(ahp
, an_idx
+ i
, NULL
,
1223 ASSERT(ap
->an_refcnt
== 0);
1226 swap_phys_free(ap
->an_pvp
, ap
->an_poff
,
1230 pp
= page_lookup(&vp
->v_object
,
1231 (uoff_t
)off
, SE_EXCL
);
1232 ASSERT(pp
== NULL
|| pp
->p_szc
== 0);
1235 VM_STAT_ADD(anonvmstats
.decrefpages
[4]);
1236 VN_DISPOSE(pp
, B_INVAL
, 0, kcred
);
1238 kmem_cache_free(anon_cache
, ap
);
1244 page_get_pagecnt(pp
->p_szc
);
1245 size_t ppasize
= curpgcnt
* sizeof (page_t
*);
1246 page_t
**ppa
= kmem_alloc(ppasize
, KM_SLEEP
);
1249 VM_STAT_ADD(anonvmstats
.decrefpages
[5]);
1251 ASSERT(pp
->p_szc
<= szc
);
1252 ASSERT(IS_P2ALIGNED(curpgcnt
, curpgcnt
));
1253 ASSERT(IS_P2ALIGNED(i
, curpgcnt
));
1254 ASSERT(i
+ curpgcnt
<= pgcnt
);
1255 ASSERT(!(page_pptonum(pp
) & (curpgcnt
- 1)));
1257 for (j
= i
+ 1; j
< i
+ curpgcnt
; j
++) {
1258 ap
= anon_get_ptr(ahp
, an_idx
+ j
);
1259 ASSERT(ap
!= NULL
&&
1260 ap
->an_refcnt
== 1);
1261 swap_xlate(ap
, &vp
, &off
);
1262 pp
= page_lookup(&vp
->v_object
,
1263 (uoff_t
)off
, SE_EXCL
);
1265 panic("anon_decref_pages: "
1268 (void) hat_pageunload(pp
,
1269 HAT_FORCE_PGUNLOAD
);
1270 ASSERT(pp
->p_szc
== ppa
[0]->p_szc
);
1271 ASSERT(page_pptonum(pp
) - 1 ==
1272 page_pptonum(ppa
[j
- i
- 1]));
1274 if (ap
->an_pvp
!= NULL
&&
1275 (ap
->an_pvp
->v_op
->vop_dispose
!= fs_dispose
&&
1276 ap
->an_pvp
->v_op
->vop_dispose
!= NULL
))
1279 for (j
= i
; j
< i
+ curpgcnt
; j
++) {
1280 ap
= anon_get_ptr(ahp
, an_idx
+ j
);
1281 ASSERT(ap
!= NULL
&&
1282 ap
->an_refcnt
== 1);
1283 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1284 (void) anon_set_ptr(ahp
, an_idx
+ j
,
1288 ASSERT(ap
->an_refcnt
== 0);
1291 swap_phys_free(ap
->an_pvp
,
1292 ap
->an_poff
, PAGESIZE
);
1294 kmem_cache_free(anon_cache
, ap
);
1298 VM_STAT_ADD(anonvmstats
.decrefpages
[6]);
1299 page_destroy_pages(ppa
[0]);
1301 VM_STAT_ADD(anonvmstats
.decrefpages
[7]);
1302 for (j
= 0; j
< curpgcnt
; j
++) {
1303 ASSERT(PAGE_EXCL(ppa
[j
]));
1306 for (j
= 0; j
< curpgcnt
; j
++) {
1307 ASSERT(!hat_page_is_mapped(
1309 VN_DISPOSE(ppa
[j
], B_INVAL
, 0,
1313 kmem_free(ppa
, ppasize
);
1317 VM_STAT_ADD(anonvmstats
.decrefpages
[8]);
1318 (void) anon_set_ptr(ahp
, an_idx
+ i
, NULL
, ANON_SLEEP
);
1319 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1327 if (ahmpages
!= NULL
) {
1328 mutex_exit(ahmpages
);
1333 * Duplicate references to size bytes worth of anon pages.
1334 * Used when duplicating a segment that contains private anon pages.
1335 * This code assumes that procedure calling this one has already used
1336 * hat_chgprot() to disable write access to the range of addresses that
1337 * *old actually refers to.
1340 anon_dup(struct anon_hdr
*old
, ulong_t old_idx
, struct anon_hdr
*new,
1341 ulong_t new_idx
, size_t size
)
1349 npages
= btopr(size
);
1350 while (npages
> 0) {
1352 if ((ap
= anon_get_next_ptr(old
, &index
)) == NULL
)
1355 ASSERT(!ANON_ISBUSY(anon_get_slot(old
, index
)));
1356 off
= index
- old_idx
;
1361 (void) anon_set_ptr(new, new_idx
+ off
, ap
, ANON_SLEEP
);
1362 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1376 * Just like anon_dup but also guarantees there are no holes (unallocated anon
1377 * slots) within any large page region. That means if a large page region is
1378 * empty in the old array it will skip it. If there are 1 or more valid slots
1379 * in the large page region of the old array it will make sure to fill in any
1380 * unallocated ones and also copy them to the new array. If noalloc is 1 large
1381 * page region should either have no valid anon slots or all slots should be
1385 anon_dup_fill_holes(
1386 struct anon_hdr
*old
,
1388 struct anon_hdr
*new,
1396 kmutex_t
*ahm
, *ahmpages
= NULL
;
1404 pgcnt
= page_get_pagecnt(szc
);
1405 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
1406 npages
= btopr(size
);
1407 ASSERT(IS_P2ALIGNED(npages
, pgcnt
));
1408 ASSERT(IS_P2ALIGNED(old_idx
, pgcnt
));
1410 VM_STAT_ADD(anonvmstats
.dupfillholes
[0]);
1412 while (npages
> 0) {
1416 * Find the next valid slot.
1418 if (anon_get_next_ptr(old
, &index
) == NULL
)
1421 ASSERT(!ANON_ISBUSY(anon_get_slot(old
, index
)));
1423 * Now backup index to the beginning of the
1424 * current large page region of the old array.
1426 index
= P2ALIGN(index
, pgcnt
);
1427 off
= index
- old_idx
;
1428 ASSERT(IS_P2ALIGNED(off
, pgcnt
));
1434 * Fill and copy a large page region's worth
1437 for (i
= 0; i
< pgcnt
; i
++) {
1438 if ((ap
= anon_get_ptr(old
, index
+ i
)) == NULL
) {
1440 panic("anon_dup_fill_holes: "
1441 "empty anon slot\n");
1443 VM_STAT_ADD(anonvmstats
.dupfillholes
[1]);
1444 ap
= anon_alloc(NULL
, 0);
1445 (void) anon_set_ptr(old
, index
+ i
, ap
,
1447 } else if (i
== 0) {
1449 * make the increment of all refcnts of all
1450 * anon slots of a large page appear atomic by
1451 * getting an anonpages_hash_lock for the
1452 * first anon slot of a large page.
1454 VM_STAT_ADD(anonvmstats
.dupfillholes
[2]);
1456 ahmpages
= APH_MUTEX(ap
->an_vp
, ap
->an_off
);
1457 mutex_enter(ahmpages
);
1459 ASSERT(refcnt
= ap
->an_refcnt
);
1461 VM_STAT_COND_ADD(ap
->an_refcnt
> 1,
1462 anonvmstats
.dupfillholes
[3]);
1464 (void) anon_set_ptr(new, new_idx
+ off
+ i
, ap
,
1466 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1468 ASSERT(ahmpages
!= NULL
|| ap
->an_refcnt
== 1);
1469 ASSERT(i
== 0 || ahmpages
== NULL
||
1470 refcnt
== ap
->an_refcnt
);
1474 if (ahmpages
!= NULL
) {
1475 mutex_exit(ahmpages
);
1486 * Used when a segment with a vnode changes szc. Similarly to
1487 * anon_dup_fill_holes(), makes sure each large page region either has no anon
1488 * slots or all of them, but new slots are created by COWing the file
1489 * pages. On entrance no anon slots should be shared.
1492 anon_fill_cow_holes(
1495 struct anon_hdr
*ahp
,
1502 struct vpage vpage
[],
1513 pgcnt
= page_get_pagecnt(szc
);
1514 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
1515 npages
= btopr(size
);
1516 ASSERT(IS_P2ALIGNED(npages
, pgcnt
));
1517 ASSERT(IS_P2ALIGNED(an_idx
, pgcnt
));
1519 while (npages
> 0) {
1523 * Find the next valid slot.
1525 if (anon_get_next_ptr(ahp
, &index
) == NULL
) {
1529 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp
, index
)));
1531 * Now backup index to the beginning of the
1532 * current large page region of the anon array.
1534 index
= P2ALIGN(index
, pgcnt
);
1535 off
= index
- an_idx
;
1536 ASSERT(IS_P2ALIGNED(off
, pgcnt
));
1541 vp_off
+= ptob(off
);
1543 if (vpage
!= NULL
) {
1547 for (i
= 0; i
< pgcnt
; i
++, an_idx
++, vp_off
+= PAGESIZE
) {
1548 if ((ap
= anon_get_ptr(ahp
, an_idx
)) == NULL
) {
1552 err
= fop_getpage(vp
, vp_off
, PAGESIZE
, NULL
,
1553 pl
, PAGESIZE
, seg
, addr
, S_READ
, cred
,
1558 if (vpage
!= NULL
) {
1559 prot
= VPP_PROT(vpage
);
1560 pageflags
= VPP_ISPPLOCK(vpage
) ?
1563 pp
= anon_private(&ap
, seg
, addr
, prot
, pl
[0],
1569 (void) anon_set_ptr(ahp
, an_idx
, ap
,
1573 ASSERT(ap
->an_refcnt
== 1);
1575 if (vpage
!= NULL
) {
1586 * Free a group of "size" anon pages, size in bytes,
1587 * and clear out the pointers to the anon entries.
1590 anon_free(struct anon_hdr
*ahp
, ulong_t index
, size_t size
)
1596 npages
= btopr(size
);
1598 while (npages
> 0) {
1600 if ((ap
= anon_get_next_ptr(ahp
, &index
)) == NULL
)
1603 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp
, index
)));
1604 npages
-= index
- old
;
1608 (void) anon_set_ptr(ahp
, index
, NULL
, ANON_SLEEP
);
1611 * Bump index and decrement page count
1620 struct anon_hdr
*ahp
,
1630 pgcnt
= page_get_pagecnt(szc
);
1631 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
1632 npages
= btopr(size
);
1633 ASSERT(IS_P2ALIGNED(npages
, pgcnt
));
1634 ASSERT(IS_P2ALIGNED(an_idx
, pgcnt
));
1635 ASSERT(an_idx
< ahp
->size
);
1637 VM_STAT_ADD(anonvmstats
.freepages
[0]);
1639 while (npages
> 0) {
1643 * Find the next valid slot.
1645 if (anon_get_next_ptr(ahp
, &index
) == NULL
)
1648 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp
, index
)));
1650 * Now backup index to the beginning of the
1651 * current large page region of the old array.
1653 index
= P2ALIGN(index
, pgcnt
);
1654 off
= index
- an_idx
;
1655 ASSERT(IS_P2ALIGNED(off
, pgcnt
));
1660 anon_decref_pages(ahp
, index
, szc
);
1669 * Make anonymous pages discardable
1672 anon_disclaim(struct anon_map
*amp
, ulong_t index
, size_t size
,
1673 uint_t behav
, pgcnt_t
*purged
)
1675 spgcnt_t npages
= btopr(size
);
1679 page_t
*pp
, *root_pp
;
1681 pgcnt_t pgcnt
, npurged
= 0;
1682 ulong_t old_idx
, idx
, i
;
1683 struct anon_hdr
*ahp
= amp
->ahp
;
1684 anon_sync_obj_t cookie
;
1687 VERIFY(behav
== MADV_FREE
|| behav
== MADV_PURGE
);
1688 ASSERT(RW_READ_HELD(&
->a_rwlock
));
1690 for (; npages
> 0; index
= (pgcnt
== 1) ? index
+ 1 :
1691 P2ROUNDUP(index
+ 1, pgcnt
), npages
-= pgcnt
) {
1694 * get anon pointer and index for the first valid entry
1695 * in the anon list, starting from "index"
1698 if ((ap
= anon_get_next_ptr(ahp
, &index
)) == NULL
)
1702 * decrement npages by number of NULL anon slots we skipped
1704 npages
-= index
- old_idx
;
1708 anon_array_enter(amp
, index
, &cookie
);
1709 ap
= anon_get_ptr(ahp
, index
);
1713 * Get anonymous page and try to lock it SE_EXCL;
1714 * if we couldn't grab the lock we skip to next page.
1716 swap_xlate(ap
, &vp
, &off
);
1717 pp
= page_lookup_nowait(&vp
->v_object
, (uoff_t
)off
, SE_EXCL
);
1719 segadvstat
.MADV_FREE_miss
.value
.ul
++;
1721 anon_array_exit(&cookie
);
1724 pgcnt
= page_get_pagecnt(pp
->p_szc
);
1727 * we cannot free a page which is permanently locked.
1728 * The page_struct_lock need not be acquired to examine
1729 * these fields since the page has an "exclusive" lock.
1731 if (pp
->p_lckcnt
!= 0 || pp
->p_cowcnt
!= 0) {
1733 segadvstat
.MADV_FREE_miss
.value
.ul
++;
1734 anon_array_exit(&cookie
);
1739 ahm
= AH_MUTEX(vp
, off
);
1741 ASSERT(ap
->an_refcnt
!= 0);
1743 * skip this one if copy-on-write is not yet broken.
1745 if (ap
->an_refcnt
> 1) {
1748 segadvstat
.MADV_FREE_miss
.value
.ul
++;
1749 anon_array_exit(&cookie
);
1753 if (behav
== MADV_PURGE
&& pp
->p_szc
!= 0) {
1755 * If we're purging and we have a large page, simplify
1756 * things a bit by demoting ourselves into the base
1759 (void) page_try_demote_pages(pp
);
1762 if (pp
->p_szc
== 0) {
1769 swap_phys_free(ap
->an_pvp
, ap
->an_poff
,
1775 if (behav
== MADV_PURGE
) {
1777 * If we're purging (instead of merely freeing),
1778 * rip out this anon structure entirely to
1779 * assure that any subsequent fault pulls from
1780 * the backing vnode (if any).
1782 if (--ap
->an_refcnt
== 0)
1786 (void) anon_set_ptr(ahp
, index
,
1790 kmem_cache_free(anon_cache
, ap
);
1795 segadvstat
.MADV_FREE_hit
.value
.ul
++;
1798 * while we are at it, unload all the translations
1799 * and attempt to free the page.
1801 (void) hat_pageunload(pp
, HAT_FORCE_PGUNLOAD
);
1804 behav
== MADV_FREE
? B_FREE
: B_INVAL
, 0, kcred
);
1806 anon_array_exit(&cookie
);
1810 pgcnt
= page_get_pagecnt(pp
->p_szc
);
1811 if (!IS_P2ALIGNED(index
, pgcnt
) || npages
< pgcnt
) {
1812 if (!page_try_demote_pages(pp
)) {
1815 segadvstat
.MADV_FREE_miss
.value
.ul
++;
1816 anon_array_exit(&cookie
);
1822 swap_phys_free(ap
->an_pvp
,
1823 ap
->an_poff
, PAGESIZE
);
1828 (void) hat_pageunload(pp
, HAT_FORCE_PGUNLOAD
);
1830 VN_DISPOSE(pp
, B_FREE
, 0, kcred
);
1831 segadvstat
.MADV_FREE_hit
.value
.ul
++;
1832 anon_array_exit(&cookie
);
1840 * try to lock remaining pages
1842 for (idx
= 1; idx
< pgcnt
; idx
++) {
1844 if (!page_trylock(pp
, SE_EXCL
))
1846 if (pp
->p_lckcnt
!= 0 || pp
->p_cowcnt
!= 0) {
1853 for (i
= 0; i
< pgcnt
; i
++) {
1854 ap
= anon_get_ptr(ahp
, index
+ i
);
1857 swap_xlate(ap
, &vp
, &off
);
1858 ahm
= AH_MUTEX(vp
, off
);
1860 ASSERT(ap
->an_refcnt
!= 0);
1863 * skip this one if copy-on-write
1864 * is not yet broken.
1866 if (ap
->an_refcnt
> 1) {
1871 swap_phys_free(ap
->an_pvp
,
1872 ap
->an_poff
, PAGESIZE
);
1878 page_destroy_pages(root_pp
);
1879 segadvstat
.MADV_FREE_hit
.value
.ul
+= pgcnt
;
1880 anon_array_exit(&cookie
);
1884 segadvstat
.MADV_FREE_miss
.value
.ul
+= pgcnt
;
1885 for (i
= 0, pp
= root_pp
; i
< idx
; pp
++, i
++)
1887 anon_array_exit(&cookie
);
1897 * Return the kept page(s) and protections back to the segment driver.
1911 struct anon
*ap
= *app
;
1917 swap_xlate(ap
, &vp
, &off
);
1920 * Lookup the page. If page is being paged in,
1921 * wait for it to finish as we must return a list of
1922 * pages since this routine acts like the fop_getpage
1925 if (pl
!= NULL
&& (pp
= page_lookup(&vp
->v_object
, (uoff_t
)off
, SE_SHARED
))) {
1926 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1928 if (ap
->an_refcnt
== 1)
1931 *protp
= PROT_ALL
& ~PROT_WRITE
;
1939 * Simply treat it as a vnode fault on the anon vp.
1942 err
= fop_getpage(vp
, (uoff_t
)off
, PAGESIZE
, protp
, pl
, plsz
,
1943 seg
, addr
, rw
, cred
, NULL
);
1945 if (err
== 0 && pl
!= NULL
) {
1946 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
1948 if (ap
->an_refcnt
!= 1)
1949 *protp
&= ~PROT_WRITE
; /* make read-only */
1956 * Creates or returns kept pages to the segment driver. Returns -1 if a large
1957 * page cannot be allocated. Returns -2 if some other process has allocated a
1960 * For cowfault it will allocate any size pages to fill the requested area to
1961 * avoid partially overwriting anon slots (i.e. sharing only some of the anon
1962 * slots within a large page with other processes). This policy greatly
1963 * simplifies large page freeing (which is only freed when all anon slot
1968 struct anon_map
*amp
,
1977 struct vpage vpage
[],
1988 page_t
*pp
, *pl
[2], *conpp
= NULL
;
1990 ulong_t pg_idx
, an_idx
, i
;
1991 spgcnt_t nreloc
= 0;
1993 int err
, slotcreate
;
1995 int upsize
= (szc
< seg
->s_szc
);
1997 #if !defined(__i386) && !defined(__amd64)
1998 ASSERT(seg
->s_szc
!= 0);
2000 ASSERT(szc
<= seg
->s_szc
);
2001 ASSERT(ppa_szc
!= NULL
);
2002 ASSERT(rw
!= S_CREATE
);
2006 VM_STAT_ADD(anonvmstats
.getpages
[0]);
2009 VM_STAT_ADD(anonvmstats
.getpages
[1]);
2010 if ((ap
= anon_get_ptr(amp
->ahp
, start_idx
)) != NULL
) {
2011 err
= anon_getpage(&ap
, protp
, pl
, PAGESIZE
, seg
,
2016 if (brkcow
== 0 || (*protp
& PROT_WRITE
)) {
2017 VM_STAT_ADD(anonvmstats
.getpages
[2]);
2018 if (ppa
[0]->p_szc
!= 0 && upsize
) {
2019 VM_STAT_ADD(anonvmstats
.getpages
[3]);
2020 *ppa_szc
= MIN(ppa
[0]->p_szc
,
2022 page_unlock(ppa
[0]);
2027 panic("anon_map_getpages: cowfault for szc 0");
2029 VM_STAT_ADD(anonvmstats
.getpages
[4]);
2030 ppa
[0] = anon_zero(seg
, addr
, &ap
, cred
);
2033 (void) anon_set_ptr(amp
->ahp
, start_idx
, ap
,
2039 pgcnt
= page_get_pagecnt(szc
);
2040 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
2041 ASSERT(IS_P2ALIGNED(start_idx
, pgcnt
));
2044 * First we check for the case that the requested large
2045 * page or larger page already exists in the system.
2046 * Actually we only check if the first constituent page
2047 * exists and only preallocate if it's not found.
2049 ap
= anon_get_ptr(amp
->ahp
, start_idx
);
2052 swap_xlate(ap
, &vp
, &off
);
2053 if (page_exists_forreal(&vp
->v_object
, (uoff_t
)off
, &pszc
)) {
2054 if (pszc
> szc
&& upsize
) {
2055 *ppa_szc
= MIN(pszc
, seg
->s_szc
);
2064 VM_STAT_COND_ADD(prealloc
== 0, anonvmstats
.getpages
[5]);
2065 VM_STAT_COND_ADD(prealloc
!= 0, anonvmstats
.getpages
[6]);
2069 * If a smaller page or no page at all was found,
2070 * grab a large page off the freelist.
2073 ASSERT(conpp
== NULL
);
2074 if (page_alloc_pages(&anon_vp
->v_object
, seg
, addr
, NULL
, ppa
,
2075 szc
, 0, pgflags
) != 0) {
2076 VM_STAT_ADD(anonvmstats
.getpages
[7]);
2077 if (brkcow
== 0 || szc
< seg
->s_szc
||
2078 !anon_szcshare(amp
->ahp
, start_idx
)) {
2080 * If the refcnt's of all anon slots are <= 1
2081 * they can't increase since we are holding
2082 * the address space's lock. So segvn can
2083 * safely decrease szc without risking to
2084 * generate a cow fault for the region smaller
2085 * than the segment's largest page size.
2087 VM_STAT_ADD(anonvmstats
.getpages
[8]);
2092 * This is a cow fault. Copy away the entire 1 large
2093 * page region of this segment.
2095 if (szc
!= seg
->s_szc
)
2096 panic("anon_map_getpages: cowfault for szc %d",
2099 for (pg_idx
= 0, an_idx
= start_idx
; pg_idx
< pgcnt
;
2100 pg_idx
++, an_idx
++, vaddr
+= PAGESIZE
) {
2101 if ((ap
= anon_get_ptr(amp
->ahp
, an_idx
)) !=
2103 err
= anon_getpage(&ap
, &vpprot
, pl
,
2104 PAGESIZE
, seg
, vaddr
, rw
, cred
);
2106 for (i
= 0; i
< pg_idx
; i
++) {
2107 if ((pp
= ppa
[i
]) !=
2113 ppa
[pg_idx
] = pl
[0];
2116 * Since this is a cowfault we know
2117 * that this address space has a
2118 * parent or children which means
2119 * anon_dup_fill_holes() has initialized
2120 * all anon slots within a large page
2121 * region that had at least one anon
2122 * slot at the time of fork().
2124 panic("anon_map_getpages: "
2125 "cowfault but anon slot is empty");
2128 VM_STAT_ADD(anonvmstats
.getpages
[9]);
2130 return (anon_map_privatepages(amp
, start_idx
, szc
, seg
,
2131 addr
, prot
, ppa
, vpage
, anypgsz
, pgflags
, cred
));
2135 VM_STAT_ADD(anonvmstats
.getpages
[10]);
2140 while (pg_idx
< pgcnt
) {
2142 if ((ap
= anon_get_ptr(amp
->ahp
, an_idx
)) == NULL
) {
2143 VM_STAT_ADD(anonvmstats
.getpages
[11]);
2145 * For us to have decided not to preallocate
2146 * would have meant that a large page
2147 * was found. Which also means that all of the
2148 * anon slots for that page would have been
2149 * already created for us.
2152 panic("anon_map_getpages: prealloc = 0");
2155 ap
= anon_alloc(NULL
, 0);
2157 swap_xlate(ap
, &vp
, &off
);
2160 * Now setup our preallocated page to pass down
2161 * to swap_getpage().
2164 ASSERT(ppa
[pg_idx
]->p_szc
== szc
);
2165 conpp
= ppa
[pg_idx
];
2167 ASSERT(prealloc
|| conpp
== NULL
);
2170 * If we just created this anon slot then call
2171 * with S_CREATE to prevent doing IO on the page.
2172 * Similar to the anon_zero case.
2174 err
= swap_getconpage(vp
, (uoff_t
)off
, PAGESIZE
,
2175 NULL
, pl
, PAGESIZE
, conpp
, ppa_szc
, &nreloc
, seg
, vaddr
,
2176 slotcreate
== 1 ? S_CREATE
: rw
, cred
);
2179 ASSERT(err
!= -2 || upsize
);
2180 VM_STAT_ADD(anonvmstats
.getpages
[12]);
2181 ASSERT(slotcreate
== 0);
2187 if (pp
->p_szc
< szc
|| (pp
->p_szc
> szc
&& upsize
)) {
2188 VM_STAT_ADD(anonvmstats
.getpages
[13]);
2189 ASSERT(slotcreate
== 0);
2190 ASSERT(prealloc
== 0);
2191 ASSERT(pg_idx
== 0);
2192 if (pp
->p_szc
> szc
) {
2194 *ppa_szc
= MIN(pp
->p_szc
, seg
->s_szc
);
2196 VM_STAT_ADD(anonvmstats
.getpages
[14]);
2205 * If we decided to preallocate but fop_getpage
2206 * found a page in the system that satisfies our
2207 * request then free up our preallocated large page
2208 * and continue looping across the existing large
2209 * page via fop_getpage.
2211 if (prealloc
&& pp
!= ppa
[pg_idx
]) {
2212 VM_STAT_ADD(anonvmstats
.getpages
[15]);
2213 ASSERT(slotcreate
== 0);
2214 ASSERT(pg_idx
== 0);
2217 page_free_pages(ppa
[0]);
2220 if (prealloc
&& nreloc
> 1) {
2222 * we have relocated out of a smaller large page.
2223 * skip npgs - 1 iterations and continue which will
2224 * increment by one the loop indices.
2226 spgcnt_t npgs
= nreloc
;
2228 VM_STAT_ADD(anonvmstats
.getpages
[16]);
2230 ASSERT(pp
== ppa
[pg_idx
]);
2231 ASSERT(slotcreate
== 0);
2232 ASSERT(pg_idx
+ npgs
<= pgcnt
);
2233 if ((*protp
& PROT_WRITE
) &&
2234 anon_share(amp
->ahp
, an_idx
, npgs
)) {
2235 *protp
&= ~PROT_WRITE
;
2239 vaddr
+= PAGESIZE
* npgs
;
2243 VM_STAT_ADD(anonvmstats
.getpages
[17]);
2250 pagezero(pp
, 0, PAGESIZE
);
2251 CPU_STATS_ADD_K(vm
, zfod
, 1);
2255 ASSERT(prealloc
== 0 || ppa
[pg_idx
] == pp
);
2256 ASSERT(prealloc
!= 0 || PAGE_SHARED(pp
));
2257 ASSERT(prealloc
== 0 || PAGE_EXCL(pp
));
2260 ((page_pptonum(pp
) != page_pptonum(ppa
[pg_idx
- 1]) + 1) ||
2261 (pp
->p_szc
!= ppa
[pg_idx
- 1]->p_szc
))) {
2262 panic("anon_map_getpages: unexpected page");
2263 } else if (pg_idx
== 0 && (page_pptonum(pp
) & (pgcnt
- 1))) {
2264 panic("anon_map_getpages: unaligned page");
2267 if (prealloc
== 0) {
2271 if (ap
->an_refcnt
> 1) {
2272 VM_STAT_ADD(anonvmstats
.getpages
[18]);
2273 *protp
&= ~PROT_WRITE
;
2277 * If this is a new anon slot then initialize
2278 * the anon array entry.
2281 (void) anon_set_ptr(amp
->ahp
, an_idx
, ap
, ANON_SLEEP
);
2289 * Since preallocated pages come off the freelist
2290 * they are locked SE_EXCL. Simply downgrade and return.
2293 VM_STAT_ADD(anonvmstats
.getpages
[19]);
2295 for (pg_idx
= 0; pg_idx
< pgcnt
; pg_idx
++) {
2296 page_downgrade(ppa
[pg_idx
]);
2299 ASSERT(conpp
== NULL
);
2301 if (brkcow
== 0 || (*protp
& PROT_WRITE
)) {
2302 VM_STAT_ADD(anonvmstats
.getpages
[20]);
2306 if (szc
< seg
->s_szc
)
2307 panic("anon_map_getpages: cowfault for szc %d", szc
);
2309 VM_STAT_ADD(anonvmstats
.getpages
[21]);
2312 return (anon_map_privatepages(amp
, start_idx
, szc
, seg
, addr
, prot
,
2313 ppa
, vpage
, anypgsz
, pgflags
, cred
));
2316 * We got an IO error somewhere in our large page.
2317 * If we were using a preallocated page then just demote
2318 * all the constituent pages that we've succeeded with so far
2319 * to PAGESIZE pages and leave them in the system
2323 ASSERT(err
!= -2 || ((pg_idx
== 0) && upsize
));
2325 VM_STAT_COND_ADD(err
> 0, anonvmstats
.getpages
[22]);
2326 VM_STAT_COND_ADD(err
== -1, anonvmstats
.getpages
[23]);
2327 VM_STAT_COND_ADD(err
== -2, anonvmstats
.getpages
[24]);
2332 VM_STAT_ADD(anonvmstats
.getpages
[25]);
2333 for (i
= 0; i
< pgcnt
; i
++) {
2335 ASSERT(PAGE_EXCL(pp
));
2336 ASSERT(pp
->p_szc
== szc
);
2339 for (i
= 0; i
< pg_idx
; i
++) {
2340 ASSERT(!hat_page_is_mapped(ppa
[i
]));
2341 page_unlock(ppa
[i
]);
2344 * Now free up the remaining unused constituent
2347 while (pg_idx
< pgcnt
) {
2348 ASSERT(!hat_page_is_mapped(ppa
[pg_idx
]));
2349 page_free(ppa
[pg_idx
], 0);
2353 VM_STAT_ADD(anonvmstats
.getpages
[26]);
2354 page_free_pages(ppa
[0]);
2357 VM_STAT_ADD(anonvmstats
.getpages
[27]);
2359 for (i
= 0; i
< pg_idx
; i
++)
2360 page_unlock(ppa
[i
]);
2362 ASSERT(conpp
== NULL
);
2366 * we are here because we failed to relocate.
2369 if (brkcow
== 0 || szc
< seg
->s_szc
||
2370 !anon_szcshare(amp
->ahp
, start_idx
)) {
2371 VM_STAT_ADD(anonvmstats
.getpages
[28]);
2374 VM_STAT_ADD(anonvmstats
.getpages
[29]);
2380 * Turn a reference to an object or shared anon page
2381 * into a private page with a copy of the data from the
2382 * original page which is always locked by the caller.
2383 * This routine unloads the translation and unlocks the
2384 * original page, if it isn't being stolen, before returning
2387 * NOTE: The original anon slot is not freed by this routine
2388 * It must be freed by the caller while holding the
2389 * "anon_map" lock to prevent races which can occur if
2390 * a process has multiple lwps in its address space.
2402 struct anon
*old
= *app
;
2407 page_t
*anon_pl
[1 + 1];
2410 if (oppflags
& STEAL_PAGE
)
2411 ASSERT(PAGE_EXCL(opp
));
2413 ASSERT(PAGE_LOCKED(opp
));
2415 CPU_STATS_ADD_K(vm
, cow_fault
, 1);
2417 *app
= new = anon_alloc(NULL
, 0);
2418 swap_xlate(new, &vp
, &off
);
2420 if (oppflags
& STEAL_PAGE
) {
2421 page_rename(opp
, &vp
->v_object
, (uoff_t
)off
);
2431 * Call the fop_getpage routine to create the page, thereby
2432 * enabling the vnode driver to allocate any filesystem
2433 * space (e.g., disk block allocation for UFS). This also
2434 * prevents more than one page from being added to the
2435 * vnode at the same time.
2437 err
= fop_getpage(vp
, (uoff_t
)off
, PAGESIZE
, NULL
,
2438 anon_pl
, PAGESIZE
, seg
, addr
, S_CREATE
, cred
, NULL
);
2445 * If the original page was locked, we need to move the lock
2446 * to the new page by transferring 'cowcnt/lckcnt' of the original
2447 * page to 'cowcnt/lckcnt' of the new page.
2449 * See Statement at the beginning of segvn_lockop() and
2450 * comments in page_pp_useclaim() regarding the way
2451 * cowcnts/lckcnts are handled.
2453 * Also availrmem must be decremented up front for read only mapping
2454 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
2455 * if availrmem did not need to be decremented after all.
2457 if (oppflags
& LOCK_PAGE
) {
2458 if ((prot
& PROT_WRITE
) == 0) {
2459 mutex_enter(&freemem_lock
);
2460 if (availrmem
> pages_pp_maximum
) {
2464 mutex_exit(&freemem_lock
);
2467 mutex_exit(&freemem_lock
);
2469 page_pp_useclaim(opp
, pp
, prot
& PROT_WRITE
);
2473 * Now copy the contents from the original page,
2474 * which is locked and loaded in the MMU by
2475 * the caller to prevent yet another page fault.
2477 /* XXX - should set mod bit in here */
2478 if (ppcopy(opp
, pp
) == 0) {
2480 * Before ppcopy could handle UE or other faults, we
2481 * would have panicked here, and still have no option
2484 panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
2485 (void *)opp
, (void *)pp
);
2488 hat_setrefmod(pp
); /* mark as modified */
2491 * Unload the old translation.
2493 hat_unload(seg
->s_as
->a_hat
, addr
, PAGESIZE
, HAT_UNLOAD
);
2496 * Free unmapped, unmodified original page.
2497 * or release the lock on the original page,
2498 * otherwise the process will sleep forever in
2499 * anon_decref() waiting for the "exclusive" lock
2502 (void) page_release(opp
, 1);
2505 * we are done with page creation so downgrade the new
2506 * page's selock to shared, this helps when multiple
2507 * as_fault(...SOFTLOCK...) are done to the same
2513 * NOTE: The original anon slot must be freed by the
2514 * caller while holding the "anon_map" lock, if we
2515 * copied away from an anonymous page.
2529 anon_map_privatepages(
2530 struct anon_map
*amp
,
2537 struct vpage vpage
[],
2545 page_t
*pl
[2], *conpp
= NULL
;
2548 struct anon
*ap
, *oldap
;
2550 page_t
*pplist
, *pp
;
2551 ulong_t pg_idx
, an_idx
;
2552 spgcnt_t nreloc
= 0;
2554 kmutex_t
*ahmpages
= NULL
;
2560 ASSERT(szc
== seg
->s_szc
);
2562 VM_STAT_ADD(anonvmstats
.privatepages
[0]);
2564 pgcnt
= page_get_pagecnt(szc
);
2565 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
2566 ASSERT(IS_P2ALIGNED(start_idx
, pgcnt
));
2568 ASSERT(amp
!= NULL
);
2569 ap
= anon_get_ptr(amp
->ahp
, start_idx
);
2570 ASSERT(ap
== NULL
|| ap
->an_refcnt
>= 1);
2572 VM_STAT_COND_ADD(ap
== NULL
, anonvmstats
.privatepages
[1]);
2575 * Now try and allocate the large page. If we fail then just
2576 * let fop_getpage give us PAGESIZE pages. Normally we let
2577 * the caller make this decision but to avoid added complexity
2578 * it's simpler to handle that case here.
2580 if (anypgsz
== -1) {
2581 VM_STAT_ADD(anonvmstats
.privatepages
[2]);
2583 } else if (page_alloc_pages(&anon_vp
->v_object
, seg
, addr
, &pplist
,
2584 NULL
, szc
, anypgsz
, pgflags
) != 0) {
2585 VM_STAT_ADD(anonvmstats
.privatepages
[3]);
2590 * make the decrement of all refcnts of all
2591 * anon slots of a large page appear atomic by
2592 * getting an anonpages_hash_lock for the
2593 * first anon slot of a large page.
2596 ahmpages
= APH_MUTEX(ap
->an_vp
, ap
->an_off
);
2597 mutex_enter(ahmpages
);
2598 if (ap
->an_refcnt
== 1) {
2599 VM_STAT_ADD(anonvmstats
.privatepages
[4]);
2600 ASSERT(!anon_share(amp
->ahp
, start_idx
, pgcnt
));
2601 mutex_exit(ahmpages
);
2604 page_free_replacement_page(pplist
);
2605 page_create_putback(pgcnt
);
2607 ASSERT(ppa
[0]->p_szc
<= szc
);
2608 if (ppa
[0]->p_szc
== szc
) {
2609 VM_STAT_ADD(anonvmstats
.privatepages
[5]);
2612 for (pg_idx
= 0; pg_idx
< pgcnt
; pg_idx
++) {
2613 ASSERT(ppa
[pg_idx
] != NULL
);
2614 page_unlock(ppa
[pg_idx
]);
2621 * If we are passed in the vpage array and this is
2622 * not PROT_WRITE then we need to decrement availrmem
2623 * up front before we try anything. If we need to and
2624 * can't decrement availrmem then its better to fail now
2625 * than in the middle of processing the new large page.
2626 * page_pp_useclaim() on behalf of each constituent page
2627 * below will adjust availrmem back for the cases not needed.
2629 if (vpage
!= NULL
&& (prot
& PROT_WRITE
) == 0) {
2630 for (pg_idx
= 0; pg_idx
< pgcnt
; pg_idx
++) {
2631 if (VPP_ISPPLOCK(&vpage
[pg_idx
])) {
2637 VM_STAT_ADD(anonvmstats
.privatepages
[6]);
2638 mutex_enter(&freemem_lock
);
2639 if (availrmem
>= pages_pp_maximum
+ pgcnt
) {
2641 pages_useclaim
+= pgcnt
;
2643 VM_STAT_ADD(anonvmstats
.privatepages
[7]);
2644 mutex_exit(&freemem_lock
);
2645 if (ahmpages
!= NULL
) {
2646 mutex_exit(ahmpages
);
2649 page_free_replacement_page(pplist
);
2650 page_create_putback(pgcnt
);
2652 for (pg_idx
= 0; pg_idx
< pgcnt
; pg_idx
++)
2653 if (ppa
[pg_idx
] != NULL
)
2654 page_unlock(ppa
[pg_idx
]);
2657 mutex_exit(&freemem_lock
);
2661 CPU_STATS_ADD_K(vm
, cow_fault
, pgcnt
);
2663 VM_STAT_ADD(anonvmstats
.privatepages
[8]);
2668 for (; pg_idx
< pgcnt
; pg_idx
++, an_idx
++, vaddr
+= PAGESIZE
) {
2669 ASSERT(ppa
[pg_idx
] != NULL
);
2670 oldap
= anon_get_ptr(amp
->ahp
, an_idx
);
2671 ASSERT(ahmpages
!= NULL
|| oldap
== NULL
);
2672 ASSERT(ahmpages
== NULL
|| oldap
!= NULL
);
2673 ASSERT(ahmpages
== NULL
|| oldap
->an_refcnt
> 1);
2674 ASSERT(ahmpages
== NULL
|| pg_idx
!= 0 ||
2675 (refcnt
= oldap
->an_refcnt
));
2676 ASSERT(ahmpages
== NULL
|| pg_idx
== 0 ||
2677 refcnt
== oldap
->an_refcnt
);
2679 ap
= anon_alloc(NULL
, 0);
2681 swap_xlate(ap
, &vp
, &off
);
2684 * Now setup our preallocated page to pass down to
2689 page_sub(&pplist
, pp
);
2693 err
= swap_getconpage(vp
, (uoff_t
)off
, PAGESIZE
, NULL
, pl
,
2694 PAGESIZE
, conpp
, NULL
, &nreloc
, seg
, vaddr
,
2698 * Impossible to fail; this is S_CREATE.
2701 panic("anon_map_privatepages: fop_getpage failed");
2703 ASSERT(prealloc
? pp
== pl
[0] : pl
[0]->p_szc
== 0);
2704 ASSERT(prealloc
== 0 || nreloc
== 1);
2709 * If the original page was locked, we need to move
2710 * the lock to the new page by transferring
2711 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
2712 * of the new page. pg_idx can be used to index
2713 * into the vpage array since the caller will guarantee
2714 * that vpage struct passed in corresponds to addr
2717 if (vpage
!= NULL
&& VPP_ISPPLOCK(&vpage
[pg_idx
])) {
2718 page_pp_useclaim(ppa
[pg_idx
], pp
, prot
& PROT_WRITE
);
2719 } else if (pagelock
) {
2720 mutex_enter(&freemem_lock
);
2723 mutex_exit(&freemem_lock
);
2727 * Now copy the contents from the original page.
2729 if (ppcopy(ppa
[pg_idx
], pp
) == 0) {
2731 * Before ppcopy could handle UE or other faults, we
2732 * would have panicked here, and still have no option
2735 panic("anon_map_privatepages, ppcopy failed");
2738 hat_setrefmod(pp
); /* mark as modified */
2741 * Release the lock on the original page,
2742 * decrement the old slot, and downgrade the lock
2745 page_unlock(ppa
[pg_idx
]);
2753 * Now reflect the copy in the new anon array.
2755 ASSERT(ahmpages
== NULL
|| oldap
->an_refcnt
> 1);
2758 (void) anon_set_ptr(amp
->ahp
, an_idx
, ap
, ANON_SLEEP
);
2762 * Unload the old large page translation.
2764 hat_unload(seg
->s_as
->a_hat
, addr
, pgcnt
<< PAGESHIFT
, HAT_UNLOAD
);
2766 if (ahmpages
!= NULL
) {
2767 mutex_exit(ahmpages
);
2769 ASSERT(prealloc
== 0 || pplist
== NULL
);
2771 VM_STAT_ADD(anonvmstats
.privatepages
[9]);
2772 for (pg_idx
= 0; pg_idx
< pgcnt
; pg_idx
++) {
2773 page_downgrade(ppa
[pg_idx
]);
2781 * Allocate a private zero-filled anon page.
2784 anon_zero(struct seg
*seg
, caddr_t addr
, struct anon
**app
, struct cred
*cred
)
2790 page_t
*anon_pl
[1 + 1];
2793 *app
= ap
= anon_alloc(NULL
, 0);
2794 swap_xlate(ap
, &vp
, &off
);
2797 * Call the fop_getpage routine to create the page, thereby
2798 * enabling the vnode driver to allocate any filesystem
2799 * dependent structures (e.g., disk block allocation for UFS).
2800 * This also prevents more than one page from being added to
2801 * the vnode at the same time since it is locked.
2803 err
= fop_getpage(vp
, off
, PAGESIZE
, NULL
,
2804 anon_pl
, PAGESIZE
, seg
, addr
, S_CREATE
, cred
, NULL
);
2812 pagezero(pp
, 0, PAGESIZE
); /* XXX - should set mod bit */
2814 CPU_STATS_ADD_K(vm
, zfod
, 1);
2815 hat_setrefmod(pp
); /* mark as modified so pageout writes back */
2821 * Allocate array of private zero-filled anon pages for empty slots
2822 * and kept pages for non empty slots within given range.
2824 * NOTE: This routine will try and use large pages
2825 * if available and supported by underlying platform.
2828 anon_map_createpages(
2829 struct anon_map
*amp
,
2830 ulong_t start_index
,
2840 struct vnode
*ap_vp
;
2841 page_t
*pp
, *pplist
, *anon_pl
[1 + 1], *conpp
= NULL
;
2843 ulong_t p_index
, index
;
2844 pgcnt_t npgs
, pg_cnt
;
2845 spgcnt_t nreloc
= 0;
2846 uint_t l_szc
, szc
, prot
;
2853 * XXX For now only handle S_CREATE.
2855 ASSERT(rw
== S_CREATE
);
2857 index
= start_index
;
2862 * If this platform supports multiple page sizes
2863 * then try and allocate directly from the free
2864 * list for pages larger than PAGESIZE.
2866 * NOTE:When we have page_create_ru we can stop
2867 * directly allocating from the freelist.
2870 ANON_LOCK_ENTER(&
->a_rwlock
, RW_WRITER
);
2874 * if anon slot already exists
2875 * (means page has been created)
2876 * so 1) look up the page
2877 * 2) if the page is still in memory, get it.
2878 * 3) if not, create a page and
2879 * page in from physical swap device.
2880 * These are done in anon_getpage().
2882 ap
= anon_get_ptr(amp
->ahp
, index
);
2884 err
= anon_getpage(&ap
, &prot
, anon_pl
, PAGESIZE
,
2885 seg
, addr
, S_READ
, cred
);
2887 ANON_LOCK_EXIT(&
->a_rwlock
);
2888 panic("anon_map_createpages: anon_getpage");
2891 ppa
[p_index
++] = pp
;
2894 * an_pvp can become non-NULL after SysV's page was
2895 * paged out before ISM was attached to this SysV
2896 * shared memory segment. So free swap slot if needed.
2898 if (ap
->an_pvp
!= NULL
) {
2900 ahm
= AH_MUTEX(ap
->an_vp
, ap
->an_off
);
2902 if (ap
->an_pvp
!= NULL
) {
2903 swap_phys_free(ap
->an_pvp
,
2904 ap
->an_poff
, PAGESIZE
);
2921 * Now try and allocate the largest page possible
2922 * for the current address and range.
2923 * Keep dropping down in page size until:
2925 * 1) Properly aligned
2926 * 2) Does not overlap existing anon pages
2927 * 3) Fits in remaining range.
2928 * 4) able to allocate one.
2930 * NOTE: XXX When page_create_ru is completed this code
2937 pgsz
= page_get_pagesize(szc
);
2938 pg_cnt
= pgsz
>> PAGESHIFT
;
2939 if (IS_P2ALIGNED(addr
, pgsz
) && pg_cnt
<= npgs
&&
2940 anon_pages(amp
->ahp
, index
, pg_cnt
) == 0) {
2943 * Since we are faking page_create()
2944 * we also need to do the freemem and
2947 (void) page_create_wait(pg_cnt
, PG_WAIT
);
2950 * Get lgroup to allocate next page of shared
2951 * memory from and use it to specify where to
2952 * allocate the physical memory
2954 lgrp
= lgrp_mem_choose(seg
, addr
, pgsz
);
2956 pplist
= page_get_freelist(
2957 &anon_vp
->v_object
, 0, seg
,
2958 addr
, pgsz
, 0, lgrp
);
2960 if (pplist
== NULL
) {
2961 page_create_putback(pg_cnt
);
2965 * If a request for a page of size
2966 * larger than PAGESIZE failed
2967 * then don't try that size anymore.
2969 if (pplist
== NULL
) {
2979 * If just using PAGESIZE pages then don't
2980 * directly allocate from the free list.
2982 if (pplist
== NULL
) {
2984 pp
= anon_zero(seg
, addr
, &ap
, cred
);
2986 ANON_LOCK_EXIT(&
->a_rwlock
);
2987 panic("anon_map_createpages: anon_zero");
2989 ppa
[p_index
++] = pp
;
2991 ASSERT(anon_get_ptr(amp
->ahp
, index
) == NULL
);
2992 (void) anon_set_ptr(amp
->ahp
, index
, ap
, ANON_SLEEP
);
3001 * pplist is a list of pg_cnt PAGESIZE pages.
3002 * These pages are locked SE_EXCL since they
3003 * came directly off the free list.
3005 ASSERT(IS_P2ALIGNED(pg_cnt
, pg_cnt
));
3006 ASSERT(IS_P2ALIGNED(index
, pg_cnt
));
3007 ASSERT(conpp
== NULL
);
3010 ap
= anon_alloc(NULL
, 0);
3011 swap_xlate(ap
, &ap_vp
, &ap_off
);
3013 ASSERT(pplist
!= NULL
);
3015 page_sub(&pplist
, pp
);
3020 err
= swap_getconpage(ap_vp
, ap_off
, PAGESIZE
,
3021 (uint_t
*)NULL
, anon_pl
, PAGESIZE
, conpp
, NULL
,
3022 &nreloc
, seg
, addr
, S_CREATE
, cred
);
3025 ANON_LOCK_EXIT(&
->a_rwlock
);
3026 panic("anon_map_createpages: S_CREATE");
3029 ASSERT(anon_pl
[0] == pp
);
3030 ASSERT(nreloc
== 1);
3031 pagezero(pp
, 0, PAGESIZE
);
3032 CPU_STATS_ADD_K(vm
, zfod
, 1);
3035 ASSERT(anon_get_ptr(amp
->ahp
, index
) == NULL
);
3036 (void) anon_set_ptr(amp
->ahp
, index
, ap
, ANON_SLEEP
);
3038 ppa
[p_index
++] = pp
;
3045 pg_cnt
= pgsz
>> PAGESHIFT
;
3046 p_index
= p_index
- pg_cnt
;
3048 page_downgrade(ppa
[p_index
++]);
3051 ANON_LOCK_EXIT(&
->a_rwlock
);
/*
 * anon_try_demote_pages: working from anon slot sidx of ahp, collect the
 * backing pages of one large-page region (page_get_pagecnt(szc) slots) into
 * ppa with SE_EXCL page locks, force-unload their mappings and validate the
 * large-page layout before the scratch array is released.
 *
 * NOTE(review): this listing is a damaged extraction - listing numbers are
 * fused to the statements and lines holding only punctuation or keywords
 * (remaining parameters, some declarations, closing braces, else arms,
 * return statements) are not visible.  The notes below describe only the
 * statements that can be seen; confirm against the pristine file.
 */
3056 anon_try_demote_pages(
3057 struct anon_hdr
*ahp
,
3064 pgcnt_t pgcnt
= page_get_pagecnt(szc
);
3067 kmutex_t
*ahmpages
= NULL
;
3070 pgcnt_t curnpgs
= 0;
/* Both the page count and the starting slot must be naturally aligned. */
3074 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
3075 ASSERT(IS_P2ALIGNED(sidx
, pgcnt
));
3076 ASSERT(sidx
< ahp
->size
);
/*
 * Scratch array of pgcnt page pointers.
 * NOTE(review): the condition guarding this allocation is not visible;
 * callers in this chunk pass both NULL and a preallocated ppa - confirm.
 */
3079 ppasize
= pgcnt
* sizeof (page_t
*);
3080 ppa
= kmem_alloc(ppasize
, KM_SLEEP
);
/* For private callers the first slot is examined under its APH_MUTEX. */
3083 ap
= anon_get_ptr(ahp
, sidx
);
3084 if (ap
!= NULL
&& private) {
3085 VM_STAT_ADD(anonvmstats
.demotepages
[1]);
3086 ahmpages
= APH_MUTEX(ap
->an_vp
, ap
->an_off
);
3087 mutex_enter(ahmpages
);
/*
 * First slot has more than one reference: drop the mutex and free ppa.
 * NOTE(review): the return that presumably follows is not visible.
 */
3090 if (ap
!= NULL
&& ap
->an_refcnt
> 1) {
3091 if (ahmpages
!= NULL
) {
3092 VM_STAT_ADD(anonvmstats
.demotepages
[2]);
3093 mutex_exit(ahmpages
);
3096 kmem_free(ppa
, ppasize
);
3100 if (ahmpages
!= NULL
) {
3101 mutex_exit(ahmpages
);
/* Clamp the walk when fewer than pgcnt slots remain (non-private only). */
3103 if (ahp
->size
- sidx
< pgcnt
) {
3104 ASSERT(private == 0);
3105 pgcnt
= ahp
->size
- sidx
;
/*
 * Pass 1: look up each slot's page with SE_EXCL and force-unload its
 * translations; a slot whose an_refcnt is not 1 is fatal.
 */
3107 for (i
= 0; i
< pgcnt
; i
++, sidx
++) {
3108 ap
= anon_get_ptr(ahp
, sidx
);
3110 if (ap
->an_refcnt
!= 1) {
3111 panic("anon_try_demote_pages: an_refcnt != 1");
3113 pp
= ppa
[i
] = page_lookup(&ap
->an_vp
->v_object
,
3114 ap
->an_off
, SE_EXCL
);
3116 (void) hat_pageunload(pp
,
3117 HAT_FORCE_PGUNLOAD
);
/*
 * Pass 2: for every page still carrying a non-zero p_szc, check that the
 * constituent pages are physically contiguous and properly aligned.
 * NOTE(review): the statements that update root/curnpgs/p_szc are not
 * visible in this listing.
 */
3123 for (i
= 0; i
< pgcnt
; i
++) {
3124 if ((pp
= ppa
[i
]) != NULL
&& pp
->p_szc
!= 0) {
3125 ASSERT(pp
->p_szc
<= szc
);
3127 VM_STAT_ADD(anonvmstats
.demotepages
[3]);
3129 panic("anon_try_demote_pages: "
3134 page_get_pagecnt(pp
->p_szc
);
3136 ASSERT(npgs
<= pgcnt
);
3137 ASSERT(IS_P2ALIGNED(npgs
, npgs
));
3138 ASSERT(!(page_pptonum(pp
) & (npgs
- 1)));
3141 ASSERT(page_pptonum(pp
) - 1 ==
3142 page_pptonum(ppa
[i
- 1]));
3143 if ((page_pptonum(pp
) & (npgs
- 1)) ==
3147 ASSERT(PAGE_EXCL(pp
));
3149 ASSERT(curnpgs
> 0);
/* A large page left half-processed at the end of the walk is fatal. */
3153 if (root
!= 0 || curnpgs
!= 0)
3154 panic("anon_try_demote_pages: bad large page");
/* Pass 3: every collected page must now be unmapped with p_szc == 0. */
3156 for (i
= 0; i
< pgcnt
; i
++) {
3157 if ((pp
= ppa
[i
]) != NULL
) {
3158 ASSERT(!hat_page_is_mapped(pp
));
3159 ASSERT(pp
->p_szc
== 0);
3164 kmem_free(ppa
, ppasize
);
3170 * anon_map_demotepages() can only be called by MAP_PRIVATE segments.
/*
 * anon_map_demotepages: demote the large page of size seg->s_szc that covers
 * start_idx in amp.  It first tries anon_try_demote_pages(); when that
 * reports failure it fetches every constituent page with anon_getpage() and
 * hands the set to anon_map_privatepages().
 *
 * NOTE(review): this listing is a damaged extraction - most parameters,
 * several declarations, branch conditions, closing braces and return
 * statements are not visible.  The `&` / `->a_rwlock` pair below looks like
 * `&amp->a_rwlock` with the `&amp` collapsed by HTML-entity decoding -
 * confirm against the pristine file.
 */
3173 anon_map_demotepages(
3174 struct anon_map
*amp
,
3179 struct vpage vpage
[],
3183 uint_t szc
= seg
->s_szc
;
3184 pgcnt_t pgcnt
= page_get_pagecnt(szc
);
3185 size_t ppasize
= pgcnt
* sizeof (page_t
*);
3186 page_t
**ppa
= kmem_alloc(ppasize
, KM_SLEEP
);
/* Caller must hold the anon map lock as writer; indices must be aligned. */
3196 ASSERT(RW_WRITE_HELD(&
->a_rwlock
));
3197 ASSERT(IS_P2ALIGNED(pgcnt
, pgcnt
));
3198 ASSERT(IS_P2ALIGNED(start_idx
, pgcnt
));
3199 ASSERT(ppa
!= NULL
);
3201 ASSERT(szc
== amp
->a_szc
);
3203 VM_STAT_ADD(anonvmstats
.demotepages
[0]);
/*
 * Fast path: a non-zero result from anon_try_demote_pages() frees ppa.
 * NOTE(review): the return that presumably follows is not visible.
 */
3206 if (anon_try_demote_pages(amp
->ahp
, start_idx
, szc
, ppa
, 1)) {
3207 kmem_free(ppa
, ppasize
);
3211 VM_STAT_ADD(anonvmstats
.demotepages
[4]);
3213 ASSERT(retry
== 0); /* we can be here only once */
/* Slow path: bring in each constituent page and record it in ppa. */
3216 for (pg_idx
= 0, an_idx
= start_idx
; pg_idx
< pgcnt
;
3217 pg_idx
++, an_idx
++, vaddr
+= PAGESIZE
) {
3218 ap
= anon_get_ptr(amp
->ahp
, an_idx
);
3220 panic("anon_map_demotepages: no anon slot");
3221 err
= anon_getpage(&ap
, &vpprot
, pl
, PAGESIZE
, seg
, vaddr
,
/*
 * NOTE(review): presumably the anon_getpage() failure path - it walks the
 * pages gathered so far and frees ppa; the guarding condition and the
 * per-page action are not visible.
 */
3224 for (i
= 0; i
< pg_idx
; i
++) {
3225 if ((pp
= ppa
[i
]) != NULL
)
3228 kmem_free(ppa
, ppasize
);
3231 ppa
[pg_idx
] = pl
[0];
/* Hand the gathered pages to anon_map_privatepages(). */
3234 err
= anon_map_privatepages(amp
, start_idx
, szc
, seg
, addr
, prot
, ppa
,
3235 vpage
, -1, 0, cred
);
3237 VM_STAT_ADD(anonvmstats
.demotepages
[5]);
3238 kmem_free(ppa
, ppasize
);
3241 ASSERT(err
== 0 || err
== -1);
3243 VM_STAT_ADD(anonvmstats
.demotepages
[6]);
/* Final sweep over the pages: check p_szc and unlock each one. */
3247 for (i
= 0; i
< pgcnt
; i
++) {
3248 ASSERT(ppa
[i
] != NULL
);
3249 if (ppa
[i
]->p_szc
!= 0)
3251 page_unlock(ppa
[i
]);
3254 VM_STAT_ADD(anonvmstats
.demotepages
[7]);
3258 VM_STAT_ADD(anonvmstats
.demotepages
[8]);
3260 kmem_free(ppa
, ppasize
);
3266 * Free pages of shared anon map. It's assumed that anon maps don't share anon
3267 * structures with private anon maps. Therefore all anon structures should
3268 * have at most one reference at this point. This means underlying pages can
3269 * be exclusively locked and demoted or freed. If not freeing the entire
3270 * large pages demote the ends of the region we free to be able to free
3271 * subpages. Page roots correspond to aligned index positions in anon map.
/*
 * anon_shmap_free_pages: free the anon slots [sidx, sidx + btopr(len)) of a
 * shared anon map that uses large pages (a_szc > 0).  Fully covered large
 * pages are released with anon_free_pages(); a partially covered large page
 * at either end is first demoted with anon_try_demote_pages() and then
 * released with anon_free().
 *
 * NOTE(review): this listing is a damaged extraction - some declarations
 * (size, tidx), early returns, else arms and closing braces are not visible.
 * The `&` / `->a_rwlock` pair below looks like `&amp->a_rwlock` with the
 * `&amp` collapsed by HTML-entity decoding - confirm against the pristine
 * file.
 */
3274 anon_shmap_free_pages(struct anon_map
*amp
, ulong_t sidx
, size_t len
)
3276 ulong_t eidx
= sidx
+ btopr(len
);
3277 pgcnt_t pages
= page_get_pagecnt(amp
->a_szc
);
3278 struct anon_hdr
*ahp
= amp
->ahp
;
3281 ulong_t sidx_aligned
;
3282 ulong_t eidx_aligned
;
3284 ASSERT(ANON_WRITE_HELD(&
->a_rwlock
));
3285 ASSERT(amp
->refcnt
<= 1);
3286 ASSERT(amp
->a_szc
> 0);
3287 ASSERT(eidx
<= ahp
->size
);
3288 ASSERT(!anon_share(ahp
, sidx
, btopr(len
)));
3290 if (len
== 0) { /* XXX */
/*
 * Head: the range starts inside a large page, or ends inside the first one
 * before the end of the array - demote that page and free the covered part.
 */
3294 sidx_aligned
= P2ALIGN(sidx
, pages
);
3295 if (sidx_aligned
!= sidx
||
3296 (eidx
< sidx_aligned
+ pages
&& eidx
< ahp
->size
)) {
3297 if (!anon_try_demote_pages(ahp
, sidx_aligned
,
3298 amp
->a_szc
, NULL
, 0)) {
3299 panic("anon_shmap_free_pages: demote failed");
3301 size
= (eidx
<= sidx_aligned
+ pages
) ? (eidx
- sidx
) :
3302 P2NPHASE(sidx
, pages
);
3304 anon_free(ahp
, sidx
, size
);
3305 sidx
= sidx_aligned
+ pages
;
/* Middle: whole large pages between the aligned boundaries. */
3310 eidx_aligned
= P2ALIGN(eidx
, pages
);
3311 if (sidx
< eidx_aligned
) {
3312 anon_free_pages(ahp
, sidx
,
3313 (eidx_aligned
- sidx
) << PAGESHIFT
,
3315 sidx
= eidx_aligned
;
3317 ASSERT(sidx
== eidx_aligned
);
3318 if (eidx
== eidx_aligned
) {
/*
 * Tail: when another anon pointer exists within the same large page past
 * the range, demote and free only the covered slots; the last call frees
 * the large page as a whole.
 * NOTE(review): the else separating the two outcomes is not visible.
 */
3322 if (eidx
!= ahp
->size
&& anon_get_next_ptr(ahp
, &tidx
) != NULL
&&
3323 tidx
- sidx
< pages
) {
3324 if (!anon_try_demote_pages(ahp
, sidx
, amp
->a_szc
, NULL
, 0)) {
3325 panic("anon_shmap_free_pages: demote failed");
3327 size
= (eidx
- sidx
) << PAGESHIFT
;
3328 anon_free(ahp
, sidx
, size
);
3330 anon_free_pages(ahp
, sidx
, pages
<< PAGESHIFT
, amp
->a_szc
);
3335 * This routine should be called with amp's writer lock when there're no other
3336 * users of amp. All pcache entries of this amp must have been already
3337 * inactivated. We must not drop a_rwlock here to prevent new users from
3338 * attaching to this amp.
3341 anonmap_purge(struct anon_map
*amp
)
3343 ASSERT(ANON_WRITE_HELD(&
->a_rwlock
));
3344 ASSERT(amp
->refcnt
<= 1);
3346 if (amp
->a_softlockcnt
!= 0) {
3347 seg_ppurge(NULL
, amp
, 0);
3351 * Since all pcache entries were already inactive before this routine
3352 * was called seg_ppurge() couldn't return while there're still
3353 * entries that can be found via the list anchored at a_phead. So we
3354 * can assert this list is empty now. a_softlockcnt may be still non 0
3355 * if asynchronous thread that manages pcache already removed pcache
3356 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non
3357 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if
3358 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map
3359 * before shamp_reclaim() is done with it. a_purgemtx also taken by
3360 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a
3361 * barrier that prevents anonmap_purge() to complete while
3362 * shamp_reclaim() may still be referencing this amp.
3364 ASSERT(amp
->a_phead
.p_lnext
== &
->a_phead
);
3365 ASSERT(amp
->a_phead
.p_lprev
== &
->a_phead
);
3367 mutex_enter(&
->a_purgemtx
);
3368 while (amp
->a_softlockcnt
!= 0) {
3369 ASSERT(amp
->a_phead
.p_lnext
== &
->a_phead
);
3370 ASSERT(amp
->a_phead
.p_lprev
== &
->a_phead
);
3371 amp
->a_purgewait
= 1;
3372 cv_wait(&
->a_purgecv
, &
->a_purgemtx
);
3374 mutex_exit(&
->a_purgemtx
);
3376 ASSERT(amp
->a_phead
.p_lnext
== &
->a_phead
);
3377 ASSERT(amp
->a_phead
.p_lprev
== &
->a_phead
);
3378 ASSERT(amp
->a_softlockcnt
== 0);
3382 * Allocate and initialize an anon_map structure for seg
3383 * associating the given swap reservation with the new anon_map.
/*
 * anonmap_alloc: take an anon_map from anonmap_cache, attach an anon array
 * sized for btopr(size) slots, record the swap reservation and initialise
 * the softlock bookkeeping (a_softlockcnt, a_purgewait, empty a_phead list).
 * flags selects sleeping behaviour: ANON_NOSLEEP maps to KM_NOSLEEP,
 * anything else to KM_SLEEP.
 *
 * NOTE(review): this listing is a damaged extraction - the return type, the
 * NULL checks around the failure paths, return statements and several field
 * initialisations are not visible.  The `&` / `->a_phead` pairs below look
 * like `&amp->a_phead` with the `&amp` collapsed by HTML-entity decoding -
 * confirm against the pristine file.
 */
3386 anonmap_alloc(size_t size
, size_t swresv
, int flags
)
3388 struct anon_map
*amp
;
3389 int kmflags
= (flags
& ANON_NOSLEEP
) ? KM_NOSLEEP
: KM_SLEEP
;
3391 amp
= kmem_cache_alloc(anonmap_cache
, kmflags
);
/* NOTE(review): presumably inside an amp == NULL check that is not visible. */
3393 ASSERT(kmflags
== KM_NOSLEEP
);
/* If the anon array cannot be created the anon_map goes back to the cache. */
3397 amp
->ahp
= anon_create(btopr(size
), flags
);
3398 if (amp
->ahp
== NULL
) {
3399 ASSERT(flags
== ANON_NOSLEEP
);
3400 kmem_cache_free(anonmap_cache
, amp
);
3405 amp
->swresv
= swresv
;
3409 amp
->a_softlockcnt
= 0;
3410 amp
->a_purgewait
= 0;
/* a_phead is a list head that points at itself when empty. */
3411 amp
->a_phead
.p_lnext
= &
->a_phead
;
3412 amp
->a_phead
.p_lprev
= &
->a_phead
;
3418 anonmap_free(struct anon_map
*amp
)
3420 ASSERT(amp
->ahp
!= NULL
);
3421 ASSERT(amp
->refcnt
== 0);
3422 ASSERT(amp
->a_softlockcnt
== 0);
3423 ASSERT(amp
->a_phead
.p_lnext
== &
->a_phead
);
3424 ASSERT(amp
->a_phead
.p_lprev
== &
->a_phead
);
3426 lgrp_shm_policy_fini(amp
, NULL
);
3427 anon_release(amp
->ahp
, btopr(amp
->size
));
3428 kmem_cache_free(anonmap_cache
, amp
);
3432 * Returns true if the app array has some empty slots.
3433 * The offp and lenp parameters are in/out parameters. On entry
3434 * these values represent the starting offset and length of the
3435 * mapping. When true is returned, these values may be modified
3436 * to be the largest range which includes empty slots.
/*
 * non_anon: walk the anon array from anon_idx in PAGESIZE steps across the
 * *lenp bytes of the mapping and, when a slot without an anon pointer is
 * found, rewrite *lenp to the span between the lowest and highest such
 * offsets.
 *
 * NOTE(review): this listing is a damaged extraction - the return type, the
 * end of the parameter list, the declarations, the body of the scan loop
 * (how low/high are maintained) and the return statements are not visible;
 * confirm against the pristine file.
 */
3439 non_anon(struct anon_hdr
*ahp
, ulong_t anon_idx
, uoff_t
*offp
,
/* i counts bytes, anon_idx counts slots; both advance one page per turn. */
3447 for (i
= 0, el
= *lenp
; i
< el
; i
+= PAGESIZE
, anon_idx
++) {
3448 ap
= anon_get_ptr(ahp
, anon_idx
);
3457 * Found at least one non-anon page.
3458 * Set up the off and len return values.
/* The span is inclusive of the page at offset high, hence + PAGESIZE. */
3462 *lenp
= high
- low
+ PAGESIZE
;
3469 * Return a count of the number of existing anon pages in the anon array
3470 * app in the range (off, off+len). The array and slots must be guaranteed
3471 * stable by the caller.
/*
 * anon_pages: examine nslots consecutive slots of ahp starting at anon_index
 * and test each one for a non-NULL anon pointer.
 *
 * NOTE(review): this listing is a damaged extraction - the return type, the
 * counter declaration, the statements executed for a populated slot, the
 * index advance and the return are not visible; confirm against the
 * pristine file.
 */
3474 anon_pages(struct anon_hdr
*ahp
, ulong_t anon_index
, pgcnt_t nslots
)
3478 while (nslots
-- > 0) {
3479 if ((anon_get_ptr(ahp
, anon_index
)) != NULL
)
3487 * Move reserved phys swap into memory swap (unreserve phys swap
3488 * and reserve mem swap by the same amount).
3489 * Used by segspt when it needs to lock reserved swap npages in memory
/*
 * anon_swap_adjust: account npages as locked swap under anoninfo_lock.  When
 * the unlocked part of the memory reservation (ani_mem_resv minus
 * ani_locked_swap) is smaller than npages, the shortfall is moved from the
 * physical reservation to the memory reservation after
 * page_reclaim_mem() succeeds.
 *
 * NOTE(review): this listing is a damaged extraction - the return type,
 * closing braces and the return statements (including whatever the
 * page_reclaim_mem() failure path returns) are not visible; confirm against
 * the pristine file.
 */
3492 anon_swap_adjust(pgcnt_t npages
)
3494 pgcnt_t unlocked_mem_swap
;
3496 mutex_enter(&anoninfo_lock
);
3498 ASSERT(k_anoninfo
.ani_mem_resv
>= k_anoninfo
.ani_locked_swap
);
3499 ASSERT(k_anoninfo
.ani_max
>= k_anoninfo
.ani_phys_resv
);
3501 unlocked_mem_swap
= k_anoninfo
.ani_mem_resv
3502 - k_anoninfo
.ani_locked_swap
;
3503 if (npages
> unlocked_mem_swap
) {
3504 spgcnt_t adjusted_swap
= npages
- unlocked_mem_swap
;
3507 * if there is not enough unlocked mem swap we take missing
3508 * amount from phys swap and give it to mem swap
/* On failure the lock is dropped before leaving this branch. */
3510 if (!page_reclaim_mem(adjusted_swap
, segspt_minfree
, 1)) {
3511 mutex_exit(&anoninfo_lock
);
/* Shift the shortfall from the physical to the memory reservation. */
3515 k_anoninfo
.ani_mem_resv
+= adjusted_swap
;
3516 ASSERT(k_anoninfo
.ani_phys_resv
>= adjusted_swap
);
3517 k_anoninfo
.ani_phys_resv
-= adjusted_swap
;
3519 ANI_ADD(adjusted_swap
);
3521 k_anoninfo
.ani_locked_swap
+= npages
;
3523 ASSERT(k_anoninfo
.ani_mem_resv
>= k_anoninfo
.ani_locked_swap
);
3524 ASSERT(k_anoninfo
.ani_max
>= k_anoninfo
.ani_phys_resv
);
3526 mutex_exit(&anoninfo_lock
);
3532 * 'unlocked' reserved mem swap so when it is unreserved it
3533 * can be moved back phys (disk) swap
3536 anon_swap_restore(pgcnt_t npages
)
3538 mutex_enter(&anoninfo_lock
);
3540 ASSERT(k_anoninfo
.ani_locked_swap
<= k_anoninfo
.ani_mem_resv
);
3542 ASSERT(k_anoninfo
.ani_locked_swap
>= npages
);
3543 k_anoninfo
.ani_locked_swap
-= npages
;
3545 ASSERT(k_anoninfo
.ani_locked_swap
<= k_anoninfo
.ani_mem_resv
);
3547 mutex_exit(&anoninfo_lock
);
3551 * Return the pointer from the list for a
3552 * specified anon index.
/*
 * anon_get_slot: return the address of the array element that holds anon
 * index an_idx.  Arrays of at most ANON_CHUNK_SIZE slots, or ones flagged
 * ANON_ALLOC_FORCE, are indexed directly through array_chunk; otherwise
 * array_chunk is indexed by chunk number (an_idx >> ANON_CHUNK_SHIFT) and
 * the element is found inside that chunk (an_idx & ANON_CHUNK_OFF).
 *
 * NOTE(review): this listing is a damaged extraction - the return type, the
 * declarations of ppp/app, the conditions guarding the chunk allocation and
 * the closing braces are not visible; confirm against the pristine file.
 */
3555 anon_get_slot(struct anon_hdr
*ahp
, ulong_t an_idx
)
3560 ASSERT(an_idx
< ahp
->size
);
3563 * Single level case.
3565 if ((ahp
->size
<= ANON_CHUNK_SIZE
) || (ahp
->flags
& ANON_ALLOC_FORCE
)) {
3566 return ((ulong_t
*)&ahp
->array_chunk
[an_idx
]);
3572 ppp
= &ahp
->array_chunk
[an_idx
>> ANON_CHUNK_SHIFT
];
/*
 * The chunk pointer is looked up again under serial_lock before a zeroed
 * PAGESIZE chunk is installed.
 * NOTE(review): the NULL tests around this sequence are not visible.
 */
3574 mutex_enter(&ahp
->serial_lock
);
3575 ppp
= &ahp
->array_chunk
[an_idx
>> ANON_CHUNK_SHIFT
];
3577 *ppp
= kmem_zalloc(PAGESIZE
, KM_SLEEP
);
3578 mutex_exit(&ahp
->serial_lock
);
3581 return ((ulong_t
*)&app
[an_idx
& ANON_CHUNK_OFF
]);
/*
 * anon_array_enter: mark the anon slot that represents an_idx busy and
 * record in sobj the mutex, condition variable and slot address used to do
 * so.  an_idx is first rounded down to the large-page boundary given by
 * amp->a_szc, so a whole large-page range is represented by its first slot.
 *
 * NOTE(review): this listing is a damaged extraction - the return type, the
 * local declarations, the mutex_enter/mutex_exit calls and the body of the
 * busy-wait loop are not visible.  The `&` / `->a_rwlock` pair below looks
 * like `&amp->a_rwlock` with the `&amp` collapsed by HTML-entity decoding -
 * confirm against the pristine file.
 */
3586 anon_array_enter(struct anon_map
*amp
, ulong_t an_idx
, anon_sync_obj_t
*sobj
)
3594 * Use szc to determine anon slot(s) to appear atomic.
3595 * If szc = 0, then lock the anon slot and mark it busy.
3596 * If szc > 0, then lock the range of slots by getting the
3597 * anon_array_lock for the first anon slot, and mark only the
3598 * first anon slot busy to represent whole range being busy.
3601 ASSERT(RW_READ_HELD(&
->a_rwlock
));
3602 an_idx
= P2ALIGN(an_idx
, page_get_pagecnt(amp
->a_szc
));
/* The lock and cv are picked from hashed arrays keyed on (amp, an_idx). */
3603 hash
= ANON_ARRAY_HASH(amp
, an_idx
);
3604 sobj
->sync_mutex
= mtx
= &anon_array_lock
[hash
].pad_mutex
;
3605 sobj
->sync_cv
= cv
= &anon_array_cv
[hash
];
3607 ap_slot
= anon_get_slot(amp
->ahp
, an_idx
);
/* Loop while another holder has the slot busy, then claim it. */
3608 while (ANON_ISBUSY(ap_slot
))
3610 ANON_SETBUSY(ap_slot
);
3611 sobj
->sync_data
= ap_slot
;
3616 anon_array_exit(anon_sync_obj_t
*sobj
)
3618 mutex_enter(sobj
->sync_mutex
);
3619 ASSERT(ANON_ISBUSY(sobj
->sync_data
));
3620 ANON_CLRBUSY(sobj
->sync_data
);
3621 if (CV_HAS_WAITERS(sobj
->sync_cv
))
3622 cv_broadcast(sobj
->sync_cv
);
3623 mutex_exit(sobj
->sync_mutex
);