4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2018 Joyent, Inc.
24 * Copyright 2023 Oxide Computer Company
25 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
29 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
30 * Use is subject to license terms.
33 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
34 /* All Rights Reserved */
37 * University Copyright- Copyright (c) 1982, 1986, 1988
38 * The Regents of the University of California
41 * University Acknowledgment- Portions of this document are derived from
42 * software developed by the University of California, Berkeley, and its
46 #include <sys/types.h>
47 #include <sys/t_lock.h>
48 #include <sys/param.h>
52 #include <sys/systm.h>
55 #include <sys/vnode.h>
57 #include <sys/vmparam.h>
58 #include <sys/vtrace.h>
59 #include <sys/cmn_err.h>
60 #include <sys/cpuvar.h>
63 #include <sys/debug.h>
64 #include <sys/callb.h>
65 #include <sys/mem_cage.h>
67 #include <sys/stdbool.h>
74 #include <vm/seg_kmem.h>
77 * FREE MEMORY MANAGEMENT
79 * Management of the pool of free pages is a tricky business. There are
80 * several critical threshold values which constrain our allocation of new
81 * pages and inform the rate of paging out of memory to swap. These threshold
82 * values, and the behaviour they induce, are described below in descending
83 * order of size -- and thus increasing order of severity!
85 * +---------------------------------------------------- physmem (all memory)
87 * | Ordinarily there are no particular constraints placed on page
88 * v allocation. The page scanner is not running and page_create_va()
89 * | will effectively grant all page requests (whether from the kernel
90 * | or from user processes) without artificial delay.
92 * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
94 * | When we have less than "lotsfree" pages, pageout_scanner() is
95 * v signalled by schedpaging() to begin looking for pages that can
96 * | be evicted to disk to bring us back above lotsfree. At this
97 * | stage there is still no constraint on allocation of free pages.
99 * | For small systems, we set a lower bound of 16MB for lotsfree;
100 * v this is the natural value for a system with 1GB memory. This is
101 * | to ensure that the pageout reserve pool contains at least 4MB
104 * | For systems with a large amount of memory, we constrain lotsfree
105 * | to be at most 2GB (with a pageout reserve of around 0.5GB), as
106 * v at some point the required slack relates more closely to the
107 * | rate at which paging can occur than to the total amount of memory.
109 * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
111 * | When we drop below desfree, a number of kernel facilities will
112 * v wait before allocating more memory, under the assumption that
113 * | pageout or reaping will make progress and free up some memory.
114 * | This behaviour is not especially coordinated; look for comparisons
115 * | of desfree and freemem.
117 * | In addition to various attempts at advisory caution, clock()
118 * | will wake up the thread that is ordinarily parked in sched().
119 * | This routine is responsible for the heavy-handed swapping out
120 * v of entire processes in an attempt to arrest the slide of free
121 * | memory. See comments in sched.c for more details.
123 * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
125 * | These two separate tunables have, by default, the same value.
126 * v Various parts of the kernel use minfree to signal the need for
127 * | more aggressive reclamation of memory, and sched() is more
128 * | aggressive at swapping processes out.
130 * | If free memory falls below throttlefree, page_create_va() will
131 * | use page_create_throttle() to begin holding most requests for
132 * | new pages while pageout and reaping free up memory. Sleeping
133 * v allocations (e.g., KM_SLEEP) are held here while we wait for
134 * | more memory. Non-sleeping allocations are generally allowed to
135 * | proceed, unless their priority is explicitly lowered with
136 * | KM_NORMALPRI (note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI)).
138 * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
140 * | When we hit throttlefree, the situation is already dire. The
141 * v system is generally paging out memory and swapping out entire
142 * | processes in order to free up memory for continued operation.
144 * | Unfortunately, evicting memory to disk generally requires short
145 * | term use of additional memory; e.g., allocation of buffers for
146 * | storage drivers, updating maps of free and used blocks, etc.
147 * | As such, pageout_reserve is the number of pages that we keep in
148 * | special reserve for use by pageout() and sched() and by any
149 * v other parts of the kernel that need to be working for those to
150 * | make forward progress such as the ZFS I/O pipeline.
152 * | When we are below pageout_reserve, we fail or hold any allocation
153 * | that has not explicitly requested access to the reserve pool.
154 * | Access to the reserve is generally granted via the KM_PUSHPAGE
155 * | flag, or by marking a thread T_PUSHPAGE such that all allocations
156 * | can implicitly tap the reserve. For more details, see the
157 * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
158 * | and VM_PUSHPAGE allocation flags, and page_create_throttle().
160 * +---------------------------------------------------------- no free memory
162 * | If we have arrived here, things are very bad indeed. It is
163 * v surprisingly difficult to tell if this condition is even fatal,
164 * | as enough memory may have been granted to pageout() and to the
165 * | ZFS I/O pipeline that requests for eviction that have already been
166 * | made will complete and free up memory some time soon.
168 * | If free memory does not materialise, the system generally remains
169 * | deadlocked. The pageout_deadman() below is run once per second
170 * | from clock(), seeking to limit the amount of time a single request
171 * v to page out can be blocked before the system panics to get a crash
172 * | dump and return to service.
174 * +-------------------------------------------------------------------------
178 * The following parameters control operation of the page replacement
179 * algorithm. They are initialized to 0, and then computed at boot time based
180 * on the size of the system; see setupclock(). If they are patched non-zero
181 * in a loaded vmunix they are left alone and may thus be changed per system
182 * using "mdb -kw" on the loaded system.
184 pgcnt_t slowscan
= 0;
185 pgcnt_t fastscan
= 0;
187 static pgcnt_t handspreadpages
= 0;
191 * Cached copy of the total number of pages in the system (total_pages).
194 * Divisor used to relate fastscan to looppages in setupclock().
196 static uint_t loopfraction
= 2;
197 static pgcnt_t looppages
;
/*
 * Scanner duty-cycle and scan-rate limits.
 *
 * min_percent_cpu / max_percent_cpu:
 *	Percentages used later in this file to derive min_pageout_nsec and
 *	max_pageout_nsec as NANOSEC * percent / 100 / SCHEDPAGING_HZ.
 *
 * maxfastscan:
 *	Upper bound applied to fastscan (MIN(looppages / loopfraction,
 *	maxfastscan)). When no boot-time override was recorded it is set
 *	from pageout_new_spread if that is non-zero, and from
 *	MAXHANDSPREADPAGES otherwise.
 *
 * maxslowscan:
 *	Upper bound applied to slowscan (MIN(fastscan / 10, maxslowscan)).
 */
199 static uint_t min_percent_cpu
= 4;
200 static uint_t max_percent_cpu
= 80;
201 static pgcnt_t maxfastscan
= 0;
202 static pgcnt_t maxslowscan
= 100;
204 #define MEGABYTES (1024ULL * 1024ULL)
207 * pageout_threshold_style:
208 * set to 1 to use the previous default threshold size calculation;
209 * i.e., each threshold is half of the next largest value.
211 uint_t pageout_threshold_style
= 0;
214 * The operator may override these tunables to request a different minimum or
215 * maximum lotsfree value, or to change the divisor we use for automatic
218 * By default, we make lotsfree 1/64th of the total memory in the machine. The
219 * minimum and maximum are specified in bytes, rather than pages; a zero value
220 * means the default values (below) are used.
222 uint_t lotsfree_fraction
= 64;
223 pgcnt_t lotsfree_min
= 0;
224 pgcnt_t lotsfree_max
= 0;
/*
 * Default lower and upper bounds for lotsfree, expressed in bytes (16MB and
 * 2048MB). The setup code later in this file converts them to pages with
 * btop() before passing them to tune() as the fallback values for
 * lotsfree_min and lotsfree_max.
 */
226 #define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES)
227 #define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES)
230 * If these tunables are set to non-zero values in /etc/system, and provided
231 * the value is not larger than the threshold above, the specified value will
232 * be used directly without any additional calculation or adjustment. The boot
233 * time value of these overrides is preserved in the "clockinit" struct. More
234 * detail is available in the comment at the top of the file.
239 pgcnt_t lotsfree
= 0;
240 pgcnt_t needfree
= 0;
241 pgcnt_t throttlefree
= 0;
242 pgcnt_t pageout_reserve
= 0;
249 uint64_t low_mem_scan
;
251 /* The maximum supported number of page_scanner() threads */
252 #define MAX_PSCAN_THREADS 16
255 * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
256 * number of nanoseconds in each wakeup cycle that gives the equivalent of some
257 * underlying %CPU duty cycle.
260 * nanoseconds/wakeup equivalent of min_percent_cpu.
263 * nanoseconds/wakeup equivalent of max_percent_cpu.
266 * Number of nanoseconds budgeted for each wakeup cycle.
267 * Computed each time around by schedpaging().
268 * Varies between min_pageout_nsec and max_pageout_nsec,
269 * depending on memory pressure.
271 static hrtime_t min_pageout_nsec
;
272 static hrtime_t max_pageout_nsec
;
273 static hrtime_t pageout_nsec
;
/*
 * One flag per possible page scanner thread (MAX_PSCAN_THREADS slots). Every
 * slot is set to true when the paging constants are recalculated and when
 * schedpaging() changes the number of scanner threads, as a request to
 * re-evaluate the clock hand positions.
 * NOTE(review): the code that reads and clears these flags is not visible in
 * this view -- presumably pageout_scanner(); confirm.
 */
275 static bool reset_hands
[MAX_PSCAN_THREADS
];
277 #define PAGES_POLL_MASK 1023
280 * Pageout scheduling.
282 * Schedpaging controls the rate at which the page out daemon runs by
283 * setting the global variables nscan and desscan SCHEDPAGING_HZ
284 * times a second. Nscan records the number of pages pageout has examined
285 * in its current pass; schedpaging() resets this value to zero each time
286 * it runs. Desscan records the number of pages pageout should examine
287 * in its next pass; schedpaging() sets this value based on the amount of
288 * currently available memory.
290 #define SCHEDPAGING_HZ 4
294 * The desired number of page scanner threads. For testing purposes, this
295 * value can be set in /etc/system or tuned directly with mdb(1). The
296 * system will bring the actual number of threads into line with the
297 * desired number. If set to an invalid value, the system will correct the
300 uint_t despagescanners
= 0;
303 * pageout_sample_lim:
304 * The limit on the number of samples needed to establish a value for new
305 * pageout parameters: fastscan, slowscan, pageout_new_spread, and
308 * pageout_sample_cnt:
309 * Current sample number. Once the sample gets large enough, set new
310 * values for handspreadpages, pageout_new_spread, fastscan and slowscan.
312 * pageout_sample_pages:
313 * The accumulated number of pages scanned during sampling.
315 * pageout_sample_etime:
316 * The accumulated nanoseconds for the sample.
319 * True while sampling is still in progress.
322 * Rate in pages/nanosecond, computed at the end of sampling.
324 * pageout_new_spread:
325 * Initially zero while the system scan rate is measured by
326 * pageout_scanner(), which then sets this value once per system boot after
327 * enough samples have been recorded (pageout_sample_cnt). Once set, this
328 * new value is used for fastscan and handspreadpages.
330 typedef hrtime_t hrrate_t
;
332 static uint64_t pageout_sample_lim
= 4;
333 static uint64_t pageout_sample_cnt
= 0;
334 static pgcnt_t pageout_sample_pages
= 0;
335 static hrtime_t pageout_sample_etime
= 0;
336 static bool pageout_sampling
= true;
337 static hrrate_t pageout_rate
= 0;
338 static pgcnt_t pageout_new_spread
= 0;
340 /* The current number of page scanner threads */
341 static uint_t n_page_scanners
= 1;
342 /* The number of page scanner threads that are actively scanning. */
343 static uint_t pageouts_running
;
346 * Record number of times a pageout_scanner() wakeup cycle finished because it
347 * timed out (exceeded its CPU budget), rather than because it visited
348 * its budgeted number of pages.
350 uint64_t pageout_timeouts
= 0;
353 static struct pageoutvmstats_str
{
354 ulong_t checkpage
[3];
356 #endif /* VM_STATS */
359 * Threads waiting for free memory use this condition variable and lock until
360 * memory becomes available.
362 kmutex_t memavail_lock
;
363 kcondvar_t memavail_cv
;
365 typedef enum pageout_hand
{
374 } checkpage_result_t
;
376 static checkpage_result_t
checkpage(page_t
*, pageout_hand_t
);
/*
 * Boot-time snapshot of the operator-settable paging tunables. The setup
 * code later in this file copies the current value of each tunable into the
 * matching ci_* member the first time it runs (guarded by ci_init), so that
 * subsequent recalculations can start from the values the operator supplied.
 * A zero member means no override was given for that tunable.
 * NOTE(review): several members assigned by that setup code (ci_init,
 * ci_lotsfree, ci_desfree, ci_minfree, ci_maxpgio, ci_fastscan, ci_slowscan)
 * are not visible in this view of the struct -- confirm against the full
 * declaration.
 */
378 static struct clockinit
{
380 pgcnt_t ci_lotsfree_min
;
381 pgcnt_t ci_lotsfree_max
;
385 pgcnt_t ci_throttlefree
;
386 pgcnt_t ci_pageout_reserve
;
388 pgcnt_t ci_maxfastscan
;
391 pgcnt_t ci_handspreadpages
;
392 uint_t ci_despagescanners
;
393 } clockinit
= { .ci_init
= false };
/*
 * Bound a page count between a minimum and a maximum.
 *
 * Callers in this file use it to keep lotsfree within
 * [lotsfree_min, lotsfree_max] and to keep the scanner thread count within
 * [1, MAX_PSCAN_THREADS].
 * NOTE(review): only the signature and the upper-bound test are visible in
 * this view; the function presumably returns minimum, maximum or value
 * unchanged -- confirm against the full body. All arguments are pgcnt_t, so
 * it must not be used on signed quantities (see the vavail remark in
 * schedpaging()).
 */
395 static inline pgcnt_t
396 clamp(pgcnt_t value
, pgcnt_t minimum
, pgcnt_t maximum
)
400 else if (value
> maximum
)
/*
 * Choose between an operator-supplied tunable value and a computed default.
 *
 *	initval		value recorded at boot (zero if none was supplied)
 *	initval_ceiling	limit that initval must stay below to be honoured
 *	defval		computed default
 *
 * The test below is true when initval is zero or is not below
 * initval_ceiling.
 * NOTE(review): the branch bodies are not visible in this view; presumably
 * defval is returned when the test is true and initval otherwise -- confirm.
 */
407 tune(pgcnt_t initval
, pgcnt_t initval_ceiling
, pgcnt_t defval
)
409 if (initval
== 0 || initval
>= initval_ceiling
)
416 * On large memory systems, multiple instances of the page scanner are run,
417 * each responsible for a separate region of memory. This speeds up page
418 * invalidation under low memory conditions.
420 * For testing purposes, despagescanners can be set in /etc/system or via
421 * mdb(1) and it will be used as a guide for how many page scanners to create;
422 * the value will be adjusted if it is not sensible. Otherwise, the number of
423 * page scanners is determined dynamically based on handspreadpages.
/*
 * Recompute despagescanners, the desired number of page scanner threads.
 *
 * Takes no arguments; the result is stored in the global despagescanners.
 * It reads pageout_new_spread, clockinit.ci_despagescanners, looppages and
 * handspreadpages. It is called after the paging constants are recalculated
 * and from schedpaging() before the thread count is adjusted.
 */
426 recalc_pagescanners(void)
430 /* If the initial calibration has not been done, take no action. */
431 if (pageout_new_spread
== 0)
435 * If `clockinit.ci_despagescanners` is non-zero, then a value for
436 * `despagescanners` was set during initial boot. In this case, if
437 * `despagescanners` has been reset to 0 then we want to revert to
438 * that initial boot value.
440 if (despagescanners
== 0)
441 despagescanners
= clockinit
.ci_despagescanners
;
443 if (despagescanners
!= 0) {
445 * We have a desired number of page scanners, either from
446 * /etc/system or set via mdb. Try and use it (it will be
447 * adjusted below if necessary).
449 des
= despagescanners
;
452 * Calculate the number of desired scanners based on the
453 * system's memory size.
455 * A 64GiB region size is used as the basis for calculating how
456 * many scanner threads should be created. For systems with up
457 * to 64GiB of RAM, a single thread is used; for very large
458 * memory systems the threads are limited to MAX_PSCAN_THREADS.
/* Ceiling division: one scanner per 64GiB of pages, or part thereof. */
460 des
= (looppages
- 1) / btop(64ULL << 30) + 1;
464 * Clamp the number of scanners so that we have no more than
465 * MAX_PSCAN_THREADS and so that each scanner covers at least 10% more
466 * than handspreadpages.
/*
 * NOTE(review): min_scanner_pages is zero when handspreadpages is zero, and
 * it is used as a divisor on the next statement -- confirm handspreadpages
 * is always non-zero by the time pageout_new_spread has been set.
 */
468 pgcnt_t min_scanner_pages
= handspreadpages
+ handspreadpages
/ 10;
469 pgcnt_t max_scanners
= looppages
/ min_scanner_pages
;
470 despagescanners
= clamp(des
, 1,
471 clamp(max_scanners
, 1, MAX_PSCAN_THREADS
));
475 * Set up the paging constants for the clock algorithm used by
476 * pageout_scanner(), and by the virtual memory system overall. See the
477 * comments at the top of this file for more information about the threshold
478 * values and system responses to memory pressure.
480 * This routine is called once by main() at startup, after the initial size of
481 * physical memory is determined. It may be called again later if memory is
482 * added to or removed from the system, or if new measurements of the page scan
483 * rate become available.
488 bool half
= (pageout_threshold_style
== 1);
491 looppages
= total_pages
;
494 * The operator may have provided specific values for some of the
495 * tunables via /etc/system. On our first call, we preserve those
496 * values so that they can be used for subsequent recalculations.
498 * A value of zero for any tunable means we will use the default
501 if (!clockinit
.ci_init
) {
502 clockinit
.ci_init
= true;
504 clockinit
.ci_lotsfree_min
= lotsfree_min
;
505 clockinit
.ci_lotsfree_max
= lotsfree_max
;
506 clockinit
.ci_lotsfree
= lotsfree
;
507 clockinit
.ci_desfree
= desfree
;
508 clockinit
.ci_minfree
= minfree
;
509 clockinit
.ci_throttlefree
= throttlefree
;
510 clockinit
.ci_pageout_reserve
= pageout_reserve
;
511 clockinit
.ci_maxpgio
= maxpgio
;
512 clockinit
.ci_maxfastscan
= maxfastscan
;
513 clockinit
.ci_fastscan
= fastscan
;
514 clockinit
.ci_slowscan
= slowscan
;
515 clockinit
.ci_handspreadpages
= handspreadpages
;
516 clockinit
.ci_despagescanners
= despagescanners
;
519 * The first call does not trigger a recalculation, only
526 * Configure paging threshold values. For more details on what each
527 * threshold signifies, see the comments at the top of this file.
529 lotsfree_max
= tune(clockinit
.ci_lotsfree_max
, looppages
,
530 btop(LOTSFREE_MAX_DEFAULT
));
531 lotsfree_min
= tune(clockinit
.ci_lotsfree_min
, lotsfree_max
,
532 btop(LOTSFREE_MIN_DEFAULT
));
534 lotsfree
= tune(clockinit
.ci_lotsfree
, looppages
,
535 clamp(looppages
/ lotsfree_fraction
, lotsfree_min
, lotsfree_max
));
537 desfree
= tune(clockinit
.ci_desfree
, lotsfree
,
540 minfree
= tune(clockinit
.ci_minfree
, desfree
,
541 half
? desfree
/ 2 : 3 * desfree
/ 4);
543 throttlefree
= tune(clockinit
.ci_throttlefree
, desfree
,
546 pageout_reserve
= tune(clockinit
.ci_pageout_reserve
, throttlefree
,
547 half
? throttlefree
/ 2 : 3 * throttlefree
/ 4);
550 * Maxpgio thresholds how much paging is acceptable.
551 * This figures that 2/3 busy on an arm is all that is
552 * tolerable for paging. We assume one operation per disk rev.
554 * XXX - Does not account for multiple swap devices.
556 if (clockinit
.ci_maxpgio
== 0) {
557 maxpgio
= (DISKRPM
* 2) / 3;
559 maxpgio
= clockinit
.ci_maxpgio
;
563 * The clock scan rate varies between fastscan and slowscan
564 * based on the amount of free memory available. Fastscan
565 * rate should be set based on the number of pages that can be
566 * scanned per sec using ~10% of processor time. Since this
567 * value depends on the processor, MMU, MHz etc., it is
568 * difficult to determine it in a generic manner for all
571 * Instead of trying to determine the number of pages scanned
572 * per sec for every processor, fastscan is set to be the smaller
573 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
574 * time is limited to ~4% of processor time.
576 * Setting fastscan to be 1/2 of memory allows pageout to scan
577 * all of memory in ~2 secs. This implies that user pages not
578 * accessed within 1 sec (assuming handspreadpages == fastscan)
579 * can be reclaimed when free memory is very low. Stealing pages
580 * not accessed within 1 sec seems reasonable and ensures that
581 * active user processes don't thrash.
583 * Smaller values of fastscan result in scanning fewer pages
584 * every second and consequently pageout may not be able to free
585 * sufficient memory to maintain the minimum threshold. Larger
586 * values of fastscan result in scanning a lot more pages which
587 * could lead to thrashing and higher CPU usage.
589 * Fastscan needs to be limited to a maximum value and should not
590 * scale with memory to prevent pageout from consuming too much
591 * time for scanning on slow CPUs and avoid thrashing, as a
592 * result of scanning too many pages, on faster CPUs.
593 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
594 * (the upper bound for fastscan) based on the average number
595 * of pages that can potentially be scanned in ~1 sec (using ~4%
596 * of the CPU) on some of the following machines that currently
599 * average memory scanned in ~1 sec
601 * 25 Mhz SS1+: 23 Meg
603 * 50 Mhz SC2000: 68 Meg
608 * When free memory falls just below lotsfree, the scan rate
609 * goes from 0 to slowscan (i.e., pageout starts running). This
610 * transition needs to be smooth and is achieved by ensuring that
611 * pageout scans a small number of pages to satisfy the transient
612 * memory demand. This is set to not exceed 100 pages/sec (25 per
613 * wakeup) since scanning that many pages has no noticeable impact
614 * on system performance.
616 * In addition to setting fastscan and slowscan, pageout is
617 * limited to using ~4% of the CPU. This results in increasing
618 * the time taken to scan all of memory, which in turn means that
619 * user processes have a better opportunity of preventing their
620 * pages from being stolen. This has a positive effect on
621 * interactive and overall system performance when memory demand
624 * Thus, the rate at which pages are scanned for replacement will
625 * vary linearly between slowscan and the number of pages that
626 * can be scanned using ~4% of processor time instead of varying
627 * linearly between slowscan and fastscan.
629 * Also, the processor time used by pageout will vary from ~1%
630 * at slowscan to ~4% at fastscan instead of varying between
631 * ~1% at slowscan and ~10% at fastscan.
633 * The values chosen for the various VM parameters (fastscan,
634 * handspreadpages, etc) are not universally true for all machines,
635 * but appear to be a good rule of thumb for the machines we've
636 * tested. They have the following ranges:
638 * cpu speed: 20 to 70 Mhz
639 * page size: 4K to 8K
640 * memory size: 16M to 5G
641 * page scan rate: 4000 - 17400 4K pages per sec
643 * The values need to be re-examined for machines which don't
644 * fall into the various ranges (e.g., slower or faster CPUs,
645 * smaller or larger pagesizes etc) shown above.
647 * On an MP machine, pageout is often unable to maintain the
648 * minimum paging thresholds under heavy load. This is due to
649 * the fact that user processes running on other CPUs can be
650 * dirtying memory at a much faster pace than pageout can find
651 * pages to free. The memory demands could be met by enabling
652 * more than one CPU to run the clock algorithm in such a manner
653 * that the various clock hands don't overlap. This also makes
654 * it more difficult to determine the values for fastscan, slowscan
655 * and handspreadpages.
657 * The swapper is currently used to free up memory when pageout
658 * is unable to meet memory demands by swapping out processes.
659 * In addition to freeing up memory, swapping also reduces the
660 * demand for memory by preventing user processes from running
661 * and thereby consuming memory.
663 if (clockinit
.ci_maxfastscan
== 0) {
664 if (pageout_new_spread
!= 0) {
665 maxfastscan
= pageout_new_spread
;
667 maxfastscan
= MAXHANDSPREADPAGES
;
670 maxfastscan
= clockinit
.ci_maxfastscan
;
673 if (clockinit
.ci_fastscan
== 0) {
674 fastscan
= MIN(looppages
/ loopfraction
, maxfastscan
);
676 fastscan
= clockinit
.ci_fastscan
;
679 if (fastscan
> looppages
/ loopfraction
) {
680 fastscan
= looppages
/ loopfraction
;
684 * Set slow scan time to 1/10 the fast scan time, but
685 * not to exceed maxslowscan.
687 if (clockinit
.ci_slowscan
== 0) {
688 slowscan
= MIN(fastscan
/ 10, maxslowscan
);
690 slowscan
= clockinit
.ci_slowscan
;
693 if (slowscan
> fastscan
/ 2) {
694 slowscan
= fastscan
/ 2;
698 * Handspreadpages is the distance (in pages) between front and back
699 * pageout daemon hands. The amount of time to reclaim a page
700 * once pageout examines it increases with this distance and
701 * decreases as the scan rate rises. It must be < the amount
702 * of pageable memory.
704 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
705 * to be "fastscan" results in the front hand being a few secs
706 * (varies based on the processor speed) ahead of the back hand
707 * at fastscan rates. This distance can be further reduced, if
708 * necessary, by increasing the processor time used by pageout
709 * to be more than ~4% and preferably not more than ~10%.
711 * As a result, user processes have a much better chance of
712 * referencing their pages before the back hand examines them.
713 * This also significantly lowers the number of reclaims from
714 * the freelist since pageout does not end up freeing pages which
715 * may be referenced a sec later.
717 if (clockinit
.ci_handspreadpages
== 0) {
718 handspreadpages
= fastscan
;
720 handspreadpages
= clockinit
.ci_handspreadpages
;
724 * Make sure that back hand follows front hand by at least
725 * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the
726 * back hand to look at a page during the same wakeup of the pageout
727 * daemon in which the front hand cleared its ref bit.
729 if (handspreadpages
>= looppages
) {
730 handspreadpages
= looppages
- 1;
734 * Establish the minimum and maximum length of time to be spent
735 * scanning pages per wakeup, limiting the scanner duty cycle. The
736 * input percentage values (0-100) must be converted to a fraction of
737 * the number of nanoseconds in a second of wall time, then further
738 * scaled down by the number of scanner wakeups in a second.
740 min_pageout_nsec
= MAX(1,
741 NANOSEC
* min_percent_cpu
/ 100 / SCHEDPAGING_HZ
);
742 max_pageout_nsec
= MAX(min_pageout_nsec
,
743 NANOSEC
* max_percent_cpu
/ 100 / SCHEDPAGING_HZ
);
746 * If not called for recalculation, return and skip the remaining
753 * Set a flag to re-evaluate the clock hand positions.
755 for (uint_t i
= 0; i
< MAX_PSCAN_THREADS
; i
++)
756 reset_hands
[i
] = true;
758 recalc_pagescanners();
761 static kmutex_t pageout_mutex
;
764 * Pool of available async pageout putpage requests.
766 static struct async_reqs
*push_req
;
767 static struct async_reqs
*req_freelist
; /* available req structs */
768 static struct async_reqs
*push_list
; /* pending reqs */
769 static kmutex_t push_lock
; /* protects req pool */
770 static kcondvar_t push_cv
;
773 * If pageout() is stuck on a single push for this many seconds,
774 * pageout_deadman() will assume the system has hit a memory deadlock. If set
775 * to 0, the deadman will have no effect.
777 * Note that we are only looking for stalls in the calls that pageout() makes
778 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging
779 * I/O, which should not take long unless the underlying strategy call blocks
780 * indefinitely for memory. The actual I/O request happens (or fails) later.
782 uint_t pageout_deadman_seconds
= 90;
784 static uint_t pageout_stucktime
= 0;
785 static bool pageout_pushing
= false;
786 static uint64_t pageout_pushcount
= 0;
787 static uint64_t pageout_pushcount_seen
= 0;
789 int async_list_size
= 8192;
791 static void pageout_scanner(void *);
794 * If a page is being shared more than "po_share" times
795 * then leave it alone- don't page it out.
/*
 * po_share starts at MIN_PO_SHARE (8). MAX_PO_SHARE is MIN_PO_SHARE shifted
 * left by 24 bits. schedpaging() tests po_share against MIN_PO_SHARE when
 * there are enough free pages.
 * NOTE(review): the code that raises po_share towards MAX_PO_SHARE, and the
 * statement schedpaging() runs when po_share exceeds MIN_PO_SHARE, are not
 * visible in this view -- confirm.
 */
797 #define MIN_PO_SHARE (8)
798 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
799 ulong_t po_share
= MIN_PO_SHARE
;
802 * Schedule rate for paging.
803 * Rate is linear interpolation between
804 * slowscan with lotsfree and fastscan when out of memory.
/*
 * Periodic paging scheduler.
 *
 * Each run:
 *   - compares freemem with the lotsfree thresholds and wakes the kernel
 *     cage thread when the cage is short of memory;
 *   - if pageout_mutex can be taken without blocking, recomputes desscan and
 *     pageout_nsec from the available memory, brings the number of scanner
 *     threads into line with despagescanners, and wakes the scanners when
 *     sampling is in progress or memory is low;
 *   - broadcasts memavail_cv when kmem_avail() reports available memory;
 *   - re-arms itself with timeout() for hz / SCHEDPAGING_HZ ticks later.
 *
 * arg is not interpreted here; it is passed unchanged to the next timeout().
 */
807 schedpaging(void *arg
)
811 if (freemem
< lotsfree
+ needfree
+ kmem_reapahead
)
814 if (freemem
< lotsfree
+ needfree
)
817 if (kcage_on
&& (kcage_freemem
< kcage_desfree
|| kcage_needfree
))
818 kcage_cageout_wakeup();
/* Only recompute the scan budget when the mutex is free; never block here. */
820 if (mutex_tryenter(&pageout_mutex
)) {
821 if (pageouts_running
!= 0)
824 /* No pageout scanner threads running. */
826 vavail
= freemem
- deficit
;
827 if (pageout_new_spread
!= 0)
829 /* Note that vavail is signed so don't use clamp() here */
832 if (vavail
> lotsfree
)
835 if (needfree
> 0 && pageout_new_spread
== 0) {
837 * If we've not yet collected enough samples to
838 * calculate a spread, use the old logic of kicking
839 * into high gear anytime needfree is non-zero.
841 desscan
= fastscan
/ SCHEDPAGING_HZ
;
844 * Once we've calculated a spread based on system
845 * memory and usage, just treat needfree as another
848 spgcnt_t faststmp
, slowstmp
, result
;
/*
 * Linear interpolation of the per-second scan rate: slowscan when
 * vavail == lotsfree, fastscan when vavail == 0. The result is divided by
 * SCHEDPAGING_HZ to give a per-wakeup page count.
 */
850 slowstmp
= slowscan
* vavail
;
851 faststmp
= fastscan
* (lotsfree
- vavail
);
852 result
= (slowstmp
+ faststmp
) /
853 nz(lotsfree
) / SCHEDPAGING_HZ
;
854 desscan
= (pgcnt_t
)result
;
/*
 * The time budget is interpolated the same way, from min_pageout_nsec at
 * vavail == lotsfree to max_pageout_nsec at vavail == 0.
 */
857 pageout_nsec
= min_pageout_nsec
+ (lotsfree
- vavail
) *
858 (max_pageout_nsec
- min_pageout_nsec
) / nz(lotsfree
);
860 DTRACE_PROBE2(schedpage__calc
, pgcnt_t
, desscan
, hrtime_t
,
863 if (pageout_new_spread
!= 0 && despagescanners
!= 0 &&
864 despagescanners
!= n_page_scanners
) {
866 * We have finished the pagescan initialisation and the
867 * desired number of page scanners has changed, either
868 * because sampling just finished, because of a memory
869 * DR, or because despagescanners has been modified on
870 * the fly (e.g. via mdb(1)).
872 uint_t curr_nscan
= n_page_scanners
;
875 /* Re-validate despagescanners */
876 recalc_pagescanners();
878 n_page_scanners
= despagescanners
;
880 for (i
= 0; i
< MAX_PSCAN_THREADS
; i
++)
881 reset_hands
[i
] = true;
883 /* If we need more scanners, start them now. */
/* The new thread's instance number is passed as its argument. */
884 for (i
= curr_nscan
; i
< n_page_scanners
; i
++) {
885 (void) lwp_kernel_create(proc_pageout
,
886 pageout_scanner
, (void *)(uintptr_t)i
,
887 TS_RUN
, curthread
->t_pri
);
891 * If the number of scanners has decreased, trigger a
892 * wakeup so that the excess threads will terminate.
894 if (n_page_scanners
< curr_nscan
) {
895 WAKE_PAGEOUT_SCANNER(reducing
);
899 if (pageout_sampling
) {
901 * We still need to measure the rate at which the
902 * system is able to scan pages of memory. Each of
903 * these initial samples is a scan of as much system
904 * memory as practical, regardless of whether or not we
905 * are experiencing memory pressure.
907 desscan
= total_pages
;
908 pageout_nsec
= max_pageout_nsec
;
910 WAKE_PAGEOUT_SCANNER(sampling
);
911 } else if (freemem
< lotsfree
+ needfree
) {
913 * We need more memory.
916 WAKE_PAGEOUT_SCANNER(lowmem
);
919 * There are enough free pages, no need to
920 * kick the scanner threads. And next time
921 * around, keep more of the `highly shared'
925 if (po_share
> MIN_PO_SHARE
)
929 mutex_exit(&pageout_mutex
);
933 * Signal threads waiting for available memory.
934 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
935 * in this case it is not needed - the waiters will be woken up during
936 * the next invocation of this function.
938 if (kmem_avail() > 0)
939 cv_broadcast(&memavail_cv
);
/* Re-arm: this routine runs SCHEDPAGING_HZ times per second. */
941 (void) timeout(schedpaging
, arg
, hz
/ SCHEDPAGING_HZ
);
945 ulong_t push_list_size
; /* # of requests on pageout queue */
948 * Paging out should always be enabled. This tunable exists to hold pageout
949 * for debugging purposes. If set to 0, pageout_scanner() will go back to
950 * sleep each time it is woken by schedpaging().
952 uint_t dopageout
= 1;
955 * The page out daemon, which runs as process 2.
957 * The daemon treats physical memory as a circular array of pages and scans
958 * the pages using a 'two-handed clock' algorithm. The front hand moves
959 * through the pages, clearing the reference bit. The back hand travels a
960 * distance (handspreadpages) behind the front hand, freeing the pages that
961 * have not been referenced in the time since the front hand passed. If
962 * modified, they are first written to their backing store before being
965 * In order to make page invalidation more responsive on machines with
966 * larger memory, multiple pageout_scanner threads may be created. In this
967 * case, each thread is given a segment of the memory "clock face" so that
968 * memory can be reclaimed more quickly. As long as there are at least lotsfree
969 * pages, then pageout_scanner threads are not run.
971 * There are multiple threads that act on behalf of the pageout process. A
972 * set of threads scans pages (pageout_scanner) and frees them up if they
973 * don't require any VOP_PUTPAGE operation. If a page must be written back
974 * to its backing store, the request is put on a list and the other
975 * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
976 * requests from the list, and processes them. Some filesystems may require
977 * resources for the VOP_PUTPAGE operations (like memory) and hence can
978 * block the pageout thread, but the scanner thread can still operate.
979 * There is still no guarantee that memory deadlocks cannot occur.
984 struct async_reqs
*arg
;
990 proc_pageout
= ttoproc(curthread
);
991 proc_pageout
->p_cstime
= 0;
992 proc_pageout
->p_stime
= 0;
993 proc_pageout
->p_cutime
= 0;
994 proc_pageout
->p_utime
= 0;
995 bcopy("pageout", PTOU(curproc
)->u_psargs
, 8);
996 bcopy("pageout", PTOU(curproc
)->u_comm
, 7);
998 mutex_init(&pageout_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
999 mutex_init(&push_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1002 * Allocate and initialize the async request structures for pageout.
1004 push_req
= (struct async_reqs
*)
1005 kmem_zalloc(async_list_size
* sizeof (struct async_reqs
), KM_SLEEP
);
1007 req_freelist
= push_req
;
1008 for (i
= 0; i
< async_list_size
- 1; i
++) {
1009 push_req
[i
].a_next
= &push_req
[i
+ 1];
1012 pageout_pri
= curthread
->t_pri
;
1014 /* Create the first pageout scanner thread. */
1015 (void) lwp_kernel_create(proc_pageout
, pageout_scanner
,
1016 (void *)0, /* this is instance 0, not NULL */
1017 TS_RUN
, pageout_pri
- 1);
1020 * kick off the pageout scheduler.
1025 * Create kernel cage thread.
1026 * The kernel cage thread is started under the pageout process
1027 * to take advantage of the less restricted page allocation
1028 * in page_create_throttle().
1030 kcage_cageout_init();
1033 * Limit pushes to avoid saturating pageout devices.
1035 max_pushes
= maxpgio
/ SCHEDPAGING_HZ
;
1036 CALLB_CPR_INIT(&cprinfo
, &push_lock
, callb_generic_cpr
, "pageout");
1039 mutex_enter(&push_lock
);
1041 while ((arg
= push_list
) == NULL
|| pushes
> max_pushes
) {
1042 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
1043 cv_wait(&push_cv
, &push_lock
);
1045 CALLB_CPR_SAFE_END(&cprinfo
, &push_lock
);
1047 push_list
= arg
->a_next
;
1049 pageout_pushing
= true;
1050 mutex_exit(&push_lock
);
1052 DTRACE_PROBE(pageout__push
);
1054 if (VOP_PUTPAGE(arg
->a_vp
, (offset_t
)arg
->a_off
,
1055 arg
->a_len
, arg
->a_flags
, arg
->a_cred
, NULL
) == 0) {
1059 /* vp held by checkpage() */
1062 mutex_enter(&push_lock
);
1063 pageout_pushing
= false;
1064 pageout_pushcount
++;
1065 arg
->a_next
= req_freelist
; /* back on freelist */
1068 mutex_exit(&push_lock
);
1073 pageout_sample_add(pgcnt_t count
, hrtime_t elapsed
)
1075 VERIFY(pageout_sampling
);
1078 * The global variables used below are only modified during initial
1079 * scanning when there is a single page scanner thread running.
1081 pageout_sample_pages
+= count
;
1082 pageout_sample_etime
+= elapsed
;
1083 pageout_sample_cnt
++;
1085 if (pageout_sample_cnt
>= pageout_sample_lim
) {
1087 * We have enough samples, set the spread.
1089 pageout_sampling
= false;
1090 pageout_rate
= (hrrate_t
)pageout_sample_pages
*
1091 (hrrate_t
)(NANOSEC
) / pageout_sample_etime
;
1092 pageout_new_spread
= pageout_rate
/ 10;
1096 static inline page_t
*
1097 wrapping_page_next(page_t
*cur
, page_t
*start
, page_t
*end
)
1101 return (page_nextn(cur
, 1));
1105 * Kernel thread that scans pages looking for ones to free
1108 pageout_scanner(void *a
)
1110 page_t
*fhand
, *bhand
, *fhandstart
;
1111 page_t
*regionstart
, *regionend
;
1113 callb_cpr_t cprinfo
;
1116 hrtime_t sample_start
, sample_end
;
1117 uint_t inst
= (uint_t
)(uintptr_t)a
;
1119 VERIFY3U(inst
, <, MAX_PSCAN_THREADS
);
1121 CALLB_CPR_INIT(&cprinfo
, &pageout_mutex
, callb_generic_cpr
, "poscan");
1122 mutex_enter(&pageout_mutex
);
1125 * The restart case does not attempt to point the hands at roughly
1126 * the right point on the assumption that after one circuit things
1127 * will have settled down, and restarts shouldn't be that often.
1129 reset_hands
[inst
] = true;
1132 mutex_exit(&pageout_mutex
);
1135 cv_signal_pageout();
1137 mutex_enter(&pageout_mutex
);
1139 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
1140 cv_wait(&proc_pageout
->p_cv
, &pageout_mutex
);
1141 CALLB_CPR_SAFE_END(&cprinfo
, &pageout_mutex
);
1143 mutex_exit(&pageout_mutex
);
1146 * Check if pageout has been disabled for debugging purposes.
1152 * One may reset the clock hands and scanned region for debugging
1153 * purposes. Hands will also be reset on first thread startup, if
1154 * the number of scanning threads (n_page_scanners) changes, or if
1155 * memory is added to, or removed from, the system.
1157 if (reset_hands
[inst
]) {
1160 reset_hands
[inst
] = false;
1162 if (inst
>= n_page_scanners
) {
1164 * The desired number of page scanners has been
1165 * reduced and this instance is no longer wanted.
1168 VERIFY3U(inst
, !=, 0);
1169 DTRACE_PROBE1(pageout__exit
, uint_t
, inst
);
1170 mutex_enter(&pageout_mutex
);
1172 mutex_exit(&pageout_mutex
);
1173 mutex_enter(&curproc
->p_lock
);
1178 first
= page_first();
1181 * Each scanner thread gets its own sector of the memory
1184 pgcnt_t span
, offset
;
1186 span
= looppages
/ n_page_scanners
;
1187 VERIFY3U(span
, >, handspreadpages
);
1189 offset
= inst
* span
;
1190 regionstart
= page_nextn(first
, offset
);
1191 if (inst
== n_page_scanners
- 1) {
1192 /* The last instance goes up to the last page */
1193 regionend
= page_nextn(first
, looppages
- 1);
1195 regionend
= page_nextn(regionstart
, span
- 1);
1198 bhand
= regionstart
;
1199 fhand
= page_nextn(bhand
, handspreadpages
);
1201 DTRACE_PROBE4(pageout__reset
, uint_t
, inst
,
1202 pgcnt_t
, regionstart
, pgcnt_t
, regionend
,
1207 * This CPU kstat is only incremented here and we're on this CPU, so no
1210 CPU_STATS_ADDQ(CPU
, vm
, pgrrun
, 1);
1213 * Keep track of the number of times we have scanned all the way around
1214 * the loop on this wakeup.
1219 * Track the number of pages visited during this scan so that we can
1220 * periodically measure our duty cycle.
1225 DTRACE_PROBE5(pageout__start
, uint_t
, inst
, pgcnt_t
, desscan
,
1226 hrtime_t
, pageout_nsec
, page_t
*, bhand
, page_t
*, fhand
);
1229 * Record the initial position of the front hand for this cycle so
1230 * that we can detect when the hand wraps around.
1234 sample_start
= gethrtime();
1237 * Scan the appropriate number of pages for a single duty cycle.
1239 while (nscan_cnt
< desscan
) {
1240 checkpage_result_t rvfront
, rvback
;
1242 if (!pageout_sampling
&& freemem
>= lotsfree
+ needfree
) {
1244 * We are not sampling and enough memory has become
1245 * available that scanning is no longer required.
1247 DTRACE_PROBE1(pageout__memfree
, uint_t
, inst
);
1251 DTRACE_PROBE2(pageout__loop
, uint_t
, inst
, pgcnt_t
, pcount
);
1254 * Periodically check to see if we have exceeded the CPU duty
1255 * cycle for a single wakeup.
1257 if ((pcount
& PAGES_POLL_MASK
) == PAGES_POLL_MASK
) {
1258 hrtime_t pageout_cycle_nsec
;
1260 pageout_cycle_nsec
= gethrtime() - sample_start
;
1261 if (pageout_cycle_nsec
>= pageout_nsec
) {
1262 atomic_inc_64(&pageout_timeouts
);
1263 DTRACE_PROBE1(pageout__timeout
, uint_t
, inst
);
1269 * If checkpage manages to add a page to the free list,
1270 * we give ourselves another couple of trips around the loop.
1272 if ((rvfront
= checkpage(fhand
, POH_FRONT
)) == CKP_FREED
) {
1275 if ((rvback
= checkpage(bhand
, POH_BACK
)) == CKP_FREED
) {
1282 * This CPU kstat is only incremented here and we're on this
1285 CPU_STATS_ADDQ(CPU
, vm
, scan
, 1);
1288 * Don't include ineligible pages in the number scanned.
1290 if (rvfront
!= CKP_INELIGIBLE
|| rvback
!= CKP_INELIGIBLE
)
1296 bhand
= wrapping_page_next(bhand
, regionstart
, regionend
);
1297 fhand
= wrapping_page_next(fhand
, regionstart
, regionend
);
1300 * The front hand has wrapped around during this wakeup.
1302 if (fhand
== fhandstart
) {
1304 DTRACE_PROBE2(pageout__hand__wrap
, uint_t
, inst
,
1308 * This CPU kstat is only incremented here and we're
1309 * on this CPU, so no lock.
1311 CPU_STATS_ADDQ(CPU
, vm
, rev
, 1);
1315 * Extremely unlikely, but it happens.
1316 * We went around the loop at least once
1317 * and didn't get far enough.
1318 * If we are still skipping `highly shared'
1319 * pages, skip fewer of them. Otherwise,
1320 * give up till the next clock tick.
1322 if (po_share
< MAX_PO_SHARE
) {
1331 sample_end
= gethrtime();
1332 atomic_add_long(&nscan
, nscan_cnt
);
1334 DTRACE_PROBE4(pageout__end
, uint_t
, inst
, uint_t
, laps
,
1335 pgcnt_t
, nscan_cnt
, pgcnt_t
, pcount
)
1338 * Continue accumulating samples until we have enough to get a
1339 * reasonable value for average scan rate.
1341 if (pageout_sampling
) {
1342 VERIFY3U(inst
, ==, 0);
1343 pageout_sample_add(pcount
, sample_end
- sample_start
);
1345 * If, after the sample just added, we have finished sampling,
1346 * set up the paging constants.
1348 if (!pageout_sampling
)
1356 * The pageout deadman is run once per second by clock().
1359 pageout_deadman(void)
1361 if (panicstr
!= NULL
) {
1363 * There is no pageout after panic.
1368 if (pageout_deadman_seconds
== 0) {
1370 * The deadman is not enabled.
1375 if (!pageout_pushing
) {
1380 * We are pushing a page. Check to see if it is the same call we saw
1381 * last time we looked:
1383 if (pageout_pushcount
!= pageout_pushcount_seen
) {
1385 * It is a different call from the last check, so we are not
1391 if (++pageout_stucktime
>= pageout_deadman_seconds
) {
1392 panic("pageout_deadman: stuck pushing the same page for %d "
1393 "seconds (freemem is %lu)", pageout_deadman_seconds
,
1401 * Reset our tracking state to reflect that we are not stuck:
1403 pageout_stucktime
= 0;
1404 pageout_pushcount_seen
= pageout_pushcount
;
1408 * Look at the page at hand. If it is locked (e.g., for physical i/o),
1409 * system (u., page table) or free, then leave it alone. Otherwise,
1410 * if we are running the front hand, turn off the page's reference bit.
1411 * If the proc is over maxrss, we take it. If running the back hand,
1412 * check whether the page has been reclaimed. If not, free the page,
1413 * pushing it to disk first if necessary.
1416 * CKP_INELIGIBLE if the page is not a candidate at all,
1417 * CKP_NOT_FREED if the page was not freed, or
1418 * CKP_FREED if we freed it.
1420 static checkpage_result_t
1421 checkpage(page_t
*pp
, pageout_hand_t whichhand
)
1430 * - associated with the kernel vnode since
1431 * they are always "exclusively" locked.
1433 * - that are shared more than po_share'd times
1434 * - its already locked
1436 * NOTE: These optimizations assume that reads are atomic.
1439 if (PP_ISKAS(pp
) || PAGE_LOCKED(pp
) || PP_ISFREE(pp
) ||
1440 pp
->p_lckcnt
!= 0 || pp
->p_cowcnt
!= 0 ||
1441 hat_page_checkshare(pp
, po_share
)) {
1442 return (CKP_INELIGIBLE
);
1445 if (!page_trylock(pp
, SE_EXCL
)) {
1447 * Skip the page if we can't acquire the "exclusive" lock.
1449 return (CKP_INELIGIBLE
);
1450 } else if (PP_ISFREE(pp
)) {
1452 * It became free between the above check and our actually
1453 * locking the page. Oh well, there will be other pages.
1456 return (CKP_INELIGIBLE
);
1460 * Reject pages that cannot be freed. The page_struct_lock
1461 * need not be acquired to examine these
1462 * fields since the page has an "exclusive" lock.
1464 if (pp
->p_lckcnt
!= 0 || pp
->p_cowcnt
!= 0) {
1466 return (CKP_INELIGIBLE
);
1470 * Maintain statistics for what we are freeing
1472 if (pp
->p_vnode
!= NULL
) {
1473 if (pp
->p_vnode
->v_flag
& VVMEXEC
)
1476 if (!IS_SWAPFSVP(pp
->p_vnode
))
1481 * Turn off REF and MOD bits with the front hand.
1482 * The back hand examines the REF bit and always considers
1483 * SHARED pages as referenced.
1485 if (whichhand
== POH_FRONT
) {
1486 pagesync_flag
= HAT_SYNC_ZERORM
;
1488 pagesync_flag
= HAT_SYNC_DONTZERO
| HAT_SYNC_STOPON_REF
|
1489 HAT_SYNC_STOPON_SHARED
;
1492 ppattr
= hat_pagesync(pp
, pagesync_flag
);
1496 * If page is referenced; make unreferenced but reclaimable.
1497 * If this page is not referenced, then it must be reclaimable
1498 * and we can add it to the free list.
1500 if (ppattr
& P_REF
) {
1501 DTRACE_PROBE2(pageout__isref
, page_t
*, pp
,
1502 pageout_hand_t
, whichhand
);
1504 if (whichhand
== POH_FRONT
) {
1506 * Checking of rss or madvise flags needed here...
1508 * If not "well-behaved", fall through into the code
1509 * for not referenced.
1515 * Somebody referenced the page since the front
1516 * hand went by, so it's not a candidate for
1520 return (CKP_NOT_FREED
);
1523 VM_STAT_ADD(pageoutvmstats
.checkpage
[0]);
1526 * If large page, attempt to demote it. If successfully demoted,
1527 * retry the checkpage.
1529 if (pp
->p_szc
!= 0) {
1530 if (!page_try_demote_pages(pp
)) {
1531 VM_STAT_ADD(pageoutvmstats
.checkpage
[1]);
1533 return (CKP_INELIGIBLE
);
1536 ASSERT(pp
->p_szc
== 0);
1537 VM_STAT_ADD(pageoutvmstats
.checkpage
[2]);
1540 * Since page_try_demote_pages() could have unloaded some
1541 * mappings it makes sense to reload ppattr.
1543 ppattr
= hat_page_getattr(pp
, P_MOD
| P_REF
);
1547 * If the page is currently dirty, we have to arrange to have it
1548 * cleaned before it can be freed.
1550 * XXX - ASSERT(pp->p_vnode != NULL);
1552 if ((ppattr
& P_MOD
) && pp
->p_vnode
!= NULL
) {
1553 struct vnode
*vp
= pp
->p_vnode
;
1554 u_offset_t offset
= pp
->p_offset
;
1557 * XXX - Test for process being swapped out or about to exit?
1558 * [Can't get back to process(es) using the page.]
1562 * Hold the vnode before releasing the page lock to
1563 * prevent it from being freed and re-used by some
1570 * Queue I/O request for the pageout thread.
1572 if (!queue_io_request(vp
, offset
)) {
1574 return (CKP_NOT_FREED
);
1580 * Now we unload all the translations and put the page back on to the
1581 * free list. If the page was used (referenced or modified) after the
1582 * pagesync but before it was unloaded we catch it and handle the page
1585 DTRACE_PROBE2(pageout__free
, page_t
*, pp
, pageout_hand_t
, whichhand
);
1586 (void) hat_pageunload(pp
, HAT_FORCE_PGUNLOAD
);
1587 ppattr
= hat_page_getattr(pp
, P_MOD
| P_REF
);
1588 if ((ppattr
& P_REF
) || ((ppattr
& P_MOD
) && pp
->p_vnode
!= NULL
)) {
1592 VN_DISPOSE(pp
, B_FREE
, 0, kcred
);
1594 CPU_STATS_ADD_K(vm
, dfree
, 1);
1598 CPU_STATS_ADD_K(vm
, execfree
, 1);
1600 CPU_STATS_ADD_K(vm
, fsfree
, 1);
1603 CPU_STATS_ADD_K(vm
, anonfree
, 1);
1610 * Queue async i/o request from pageout_scanner and segment swapout
1611 * routines on one common list. This ensures that pageout devices (swap)
1612 * are not saturated by pageout_scanner or swapout requests.
1613 * The pageout thread empties this list by initiating i/o operations.
1616 queue_io_request(vnode_t
*vp
, u_offset_t off
)
1618 struct async_reqs
*arg
;
1621 * If we cannot allocate an async request struct,
1624 mutex_enter(&push_lock
);
1625 if ((arg
= req_freelist
) == NULL
) {
1626 mutex_exit(&push_lock
);
1629 req_freelist
= arg
->a_next
; /* adjust freelist */
1634 arg
->a_len
= PAGESIZE
;
1635 arg
->a_flags
= B_ASYNC
| B_FREE
;
1636 arg
->a_cred
= kcred
; /* always held */
1639 * Add to list of pending write requests.
1641 arg
->a_next
= push_list
;
1644 if (req_freelist
== NULL
) {
1646 * No free async requests left. The lock is held so we
1647 * might as well signal the pusher thread now.
1649 cv_signal(&push_cv
);
1651 mutex_exit(&push_lock
);
1656 * Wake up pageout to initiate i/o if push_list is not empty.
1661 if (push_list
!= NULL
) {
1662 mutex_enter(&push_lock
);
1663 cv_signal(&push_cv
);
1664 mutex_exit(&push_lock
);