4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
39 #include <sys/types.h>
40 #include <sys/t_lock.h>
41 #include <sys/param.h>
45 #include <sys/systm.h>
48 #include <sys/vnode.h>
50 #include <sys/vmparam.h>
51 #include <sys/vtrace.h>
52 #include <sys/cmn_err.h>
53 #include <sys/cpuvar.h>
56 #include <sys/debug.h>
57 #include <sys/callb.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/mem_cage.h>
67 #include <vm/seg_kmem.h>
69 static int checkpage(page_t
*, int);
72 * The following parameters control operation of the page replacement
73 * algorithm. They are initialized to 0, and then computed at boot time
74 * based on the size of the system. If they are patched non-zero in
75 * a loaded vmunix they are left alone and may thus be changed per system
76 * using adb on the loaded system.
81 static pgcnt_t handspreadpages
= 0;
82 static int loopfraction
= 2;
83 static pgcnt_t looppages
;
84 static int min_percent_cpu
= 4;
85 static int max_percent_cpu
= 80;
86 static pgcnt_t maxfastscan
= 0;
87 static pgcnt_t maxslowscan
= 100;
94 pgcnt_t throttlefree
= 0;
95 pgcnt_t pageout_reserve
= 0;
102 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
103 * are the number of ticks in each wakeup cycle that gives the
104 * equivalent of some underlying %CPU duty cycle.
105 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
106 * awakened every 25 clock ticks. So, converting from %CPU to ticks
107 * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
108 * So, for example, 4% == 1 tick and 80% == 20 ticks.
111 * ticks/wakeup equivalent of min_percent_cpu.
114 * ticks/wakeup equivalent of max_percent_cpu.
117 * Number of clock ticks budgeted for each wakeup cycle.
118 * Computed each time around by schedpaging().
119 * Varies between min_pageout_ticks .. max_pageout_ticks,
120 * depending on memory pressure.
123 * Timestamp of the last time pageout_scanner woke up and started
124 * (or resumed) scanning for not recently referenced pages.
127 static clock_t min_pageout_ticks
;
128 static clock_t max_pageout_ticks
;
129 static clock_t pageout_ticks
;
130 static clock_t pageout_lbolt
;
132 static uint_t reset_hands
;
134 #define PAGES_POLL_MASK 1023
137 * pageout_sample_lim:
138 * The limit on the number of samples needed to establish a value
139 * for new pageout parameters, fastscan, slowscan, and handspreadpages.
141 * pageout_sample_cnt:
142 * Current sample number. Once the sample gets large enough,
143 * set new values for handspreadpages, fastscan and slowscan.
145 * pageout_sample_pages:
146 * The accumulated number of pages scanned during sampling.
148 * pageout_sample_ticks:
149 * The accumulated clock ticks for the sample.
152 * Rate in pages/second, computed at the end of sampling.
154 * pageout_new_spread:
155 * The new value to use for fastscan and handspreadpages.
156 * Calculated after enough samples have been taken.
159 typedef hrtime_t hrrate_t
;
161 static uint64_t pageout_sample_lim
= 4;
162 static uint64_t pageout_sample_cnt
= 0;
163 static pgcnt_t pageout_sample_pages
= 0;
164 static hrrate_t pageout_rate
= 0;
165 static pgcnt_t pageout_new_spread
= 0;
167 static clock_t pageout_cycle_ticks
;
168 static hrtime_t sample_start
, sample_end
;
169 static hrtime_t pageout_sample_etime
= 0;
172 * Record number of times a pageout_scanner wakeup cycle finished because it
173 * timed out (exceeded its CPU budget), rather than because it visited
174 * its budgeted number of pages.
176 uint64_t pageout_timeouts
= 0;
179 static struct pageoutvmstats_str
{
180 ulong_t checkpage
[3];
182 #endif /* VM_STATS */
185 * Threads waiting for free memory use this condition variable and lock until
186 * memory becomes available.
188 kmutex_t memavail_lock
;
189 kcondvar_t memavail_cv
;
192 * The size of the clock loop.
194 #define LOOPPAGES total_pages
197 * Set up the paging constants for the clock algorithm.
198 * Called after the system is initialized and the amount of memory
199 * and number of paging devices is known.
201 * lotsfree is 1/64 of memory, but at least 512K.
202 * desfree is 1/2 of lotsfree.
203 * minfree is 1/2 of desfree.
205 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
207 * lotsfree = btop(512K)
208 * desfree = btop(200K)
209 * minfree = btop(100K)
210 * throttlefree = INT_MIN
211 * max_percent_cpu = 4
214 setupclock(int recalc
)
217 static spgcnt_t init_lfree
, init_dfree
, init_mfree
;
218 static spgcnt_t init_tfree
, init_preserve
, init_mpgio
;
219 static spgcnt_t init_mfscan
, init_fscan
, init_sscan
, init_hspages
;
221 looppages
= LOOPPAGES
;
224 * setupclock can now be called to recalculate the paging
225 * parameters in the case of dynamic addition of memory.
226 * So to make sure we make the proper calculations, if such a
227 * situation should arise, we save away the initial values
228 * of each parameter so we can recall them when needed. This
229 * way we don't lose the settings an admin might have made
230 * through the /etc/system file.
234 init_lfree
= lotsfree
;
235 init_dfree
= desfree
;
236 init_mfree
= minfree
;
237 init_tfree
= throttlefree
;
238 init_preserve
= pageout_reserve
;
239 init_mpgio
= maxpgio
;
240 init_mfscan
= maxfastscan
;
241 init_fscan
= fastscan
;
242 init_sscan
= slowscan
;
243 init_hspages
= handspreadpages
;
247 * Set up thresholds for paging:
251 * Lotsfree is threshold where paging daemon turns on.
253 if (init_lfree
== 0 || init_lfree
>= looppages
)
254 lotsfree
= MAX(looppages
/ 64, btop(512 * 1024));
256 lotsfree
= init_lfree
;
259 * Desfree is amount of memory desired free.
260 * If less than this for extended period, start swapping.
262 if (init_dfree
== 0 || init_dfree
>= lotsfree
)
263 desfree
= lotsfree
/ 2;
265 desfree
= init_dfree
;
268 * Minfree is minimal amount of free memory which is tolerable.
270 if (init_mfree
== 0 || init_mfree
>= desfree
)
271 minfree
= desfree
/ 2;
273 minfree
= init_mfree
;
276 * Throttlefree is the point at which we start throttling
277 * PG_WAIT requests until enough memory becomes available.
279 if (init_tfree
== 0 || init_tfree
>= desfree
)
280 throttlefree
= minfree
;
282 throttlefree
= init_tfree
;
285 * Pageout_reserve is the number of pages that we keep in
286 * stock for pageout's own use. Having a few such pages
287 * provides insurance against system deadlock due to
288 * pageout needing pages. When freemem < pageout_reserve,
289 * non-blocking allocations are denied to any threads
290 * other than pageout and sched. (At some point we might
291 * want to consider a per-thread flag like T_PUSHING_PAGES
292 * to indicate that a thread is part of the page-pushing
293 * dance (e.g. an interrupt thread) and thus is entitled
294 * to the same special dispensation we accord pageout.)
296 if (init_preserve
== 0 || init_preserve
>= throttlefree
)
297 pageout_reserve
= throttlefree
/ 2;
299 pageout_reserve
= init_preserve
;
302 * Maxpgio thresholds how much paging is acceptable.
303 * This figures that 2/3 busy on an arm is all that is
304 * tolerable for paging. We assume one operation per disk rev.
306 * XXX - Does not account for multiple swap devices.
309 maxpgio
= (DISKRPM
* 2) / 3;
311 maxpgio
= init_mpgio
;
314 * The clock scan rate varies between fastscan and slowscan
315 * based on the amount of free memory available. Fastscan
316 * rate should be set based on the number of pages that can be
317 * scanned per sec using ~10% of processor time. Since this
318 * value depends on the processor, MMU, Mhz etc., it is
319 * difficult to determine it in a generic manner for all
322 * Instead of trying to determine the number of pages scanned
323 * per sec for every processor, fastscan is set to be the smaller
324 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
325 * time is limited to ~4% of processor time.
327 * Setting fastscan to be 1/2 of memory allows pageout to scan
328 * all of memory in ~2 secs. This implies that user pages not
329 * accessed within 1 sec (assuming, handspreadpages == fastscan)
330 * can be reclaimed when free memory is very low. Stealing pages
331 * not accessed within 1 sec seems reasonable and ensures that
332 * active user processes don't thrash.
334 * Smaller values of fastscan result in scanning fewer pages
335 * every second and consequently pageout may not be able to free
336 * sufficient memory to maintain the minimum threshold. Larger
337 * values of fastscan result in scanning a lot more pages which
338 * could lead to thrashing and higher CPU usage.
340 * Fastscan needs to be limited to a maximum value and should not
341 * scale with memory to prevent pageout from consuming too much
342 * time for scanning on slow CPU's and avoid thrashing, as a
343 * result of scanning too many pages, on faster CPU's.
344 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
345 * (the upper bound for fastscan) based on the average number
346 * of pages that can potentially be scanned in ~1 sec (using ~4%
347 * of the CPU) on some of the following machines that currently
350 * average memory scanned in ~1 sec
352 * 25 Mhz SS1+: 23 Meg
354 * 50 Mhz SC2000: 68 Meg
359 * When free memory falls just below lotsfree, the scan rate
360 * goes from 0 to slowscan (i.e., pageout starts running). This
361 * transition needs to be smooth and is achieved by ensuring that
362 * pageout scans a small number of pages to satisfy the transient
363 * memory demand. This is set to not exceed 100 pages/sec (25 per
364 * wakeup) since scanning that many pages has no noticeable impact
365 * on system performance.
367 * In addition to setting fastscan and slowscan, pageout is
368 * limited to using ~4% of the CPU. This results in increasing
369 * the time taken to scan all of memory, which in turn means that
370 * user processes have a better opportunity of preventing their
371 * pages from being stolen. This has a positive effect on
372 * interactive and overall system performance when memory demand
375 * Thus, the rate at which pages are scanned for replacement will
376 * vary linearly between slowscan and the number of pages that
377 * can be scanned using ~4% of processor time instead of varying
378 * linearly between slowscan and fastscan.
380 * Also, the processor time used by pageout will vary from ~1%
381 * at slowscan to ~4% at fastscan instead of varying between
382 * ~1% at slowscan and ~10% at fastscan.
384 * The values chosen for the various VM parameters (fastscan,
385 * handspreadpages, etc) are not universally true for all machines,
386 * but appear to be a good rule of thumb for the machines we've
387 * tested. They have the following ranges:
389 * cpu speed: 20 to 70 Mhz
390 * page size: 4K to 8K
391 * memory size: 16M to 5G
392 * page scan rate: 4000 - 17400 4K pages per sec
394 * The values need to be re-examined for machines which don't
395 * fall into the various ranges (e.g., slower or faster CPUs,
396 * smaller or larger pagesizes etc) shown above.
398 * On an MP machine, pageout is often unable to maintain the
399 * minimum paging thresholds under heavy load. This is due to
400 * the fact that user processes running on other CPU's can be
401 * dirtying memory at a much faster pace than pageout can find
402 * pages to free. The memory demands could be met by enabling
403 * more than one CPU to run the clock algorithm in such a manner
404 * that the various clock hands don't overlap. This also makes
405 * it more difficult to determine the values for fastscan, slowscan
406 * and handspreadpages.
408 * The swapper is currently used to free up memory when pageout
409 * is unable to meet memory demands by swapping out processes.
410 * In addition to freeing up memory, swapping also reduces the
411 * demand for memory by preventing user processes from running
412 * and thereby consuming memory.
414 if (init_mfscan
== 0) {
415 if (pageout_new_spread
!= 0)
416 maxfastscan
= pageout_new_spread
;
418 maxfastscan
= MAXHANDSPREADPAGES
;
420 maxfastscan
= init_mfscan
;
423 fastscan
= MIN(looppages
/ loopfraction
, maxfastscan
);
425 fastscan
= init_fscan
;
426 if (fastscan
> looppages
/ loopfraction
)
427 fastscan
= looppages
/ loopfraction
;
430 * Set slow scan time to 1/10 the fast scan time, but
431 * not to exceed maxslowscan.
434 slowscan
= MIN(fastscan
/ 10, maxslowscan
);
436 slowscan
= init_sscan
;
437 if (slowscan
> fastscan
/ 2)
438 slowscan
= fastscan
/ 2;
441 * Handspreadpages is distance (in pages) between front and back
442 * pageout daemon hands. The amount of time to reclaim a page
443 * once pageout examines it increases with this distance and
444 * decreases as the scan rate rises. It must be < the amount
445 * of pageable memory.
447 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
448 * to be "fastscan" results in the front hand being a few secs
449 * (varies based on the processor speed) ahead of the back hand
450 * at fastscan rates. This distance can be further reduced, if
451 * necessary, by increasing the processor time used by pageout
452 * to be more than ~4% and preferably not more than ~10%.
454 * As a result, user processes have a much better chance of
455 * referencing their pages before the back hand examines them.
456 * This also significantly lowers the number of reclaims from
457 * the freelist since pageout does not end up freeing pages which
458 * may be referenced a sec later.
460 if (init_hspages
== 0)
461 handspreadpages
= fastscan
;
463 handspreadpages
= init_hspages
;
466 * Make sure that back hand follows front hand by at least
467 * 1/RATETOSCHEDPAGING seconds. Without this test, it is possible
468 * for the back hand to look at a page during the same wakeup of
469 * the pageout daemon in which the front hand cleared its ref bit.
471 if (handspreadpages
>= looppages
)
472 handspreadpages
= looppages
- 1;
475 * If we have been called to recalculate the parameters,
476 * set a flag to re-evaluate the clock hand pointers.
483 * Pageout scheduling.
485 * Schedpaging controls the rate at which the page out daemon runs by
486 * setting the global variables nscan and desscan RATETOSCHEDPAGING
487 * times a second. Nscan records the number of pages pageout has examined
488 * in its current pass; schedpaging resets this value to zero each time
489 * it runs. Desscan records the number of pages pageout should examine
490 * in its next pass; schedpaging sets this value based on the amount of
491 * currently available memory.
494 #define RATETOSCHEDPAGING 4 /* hz that is */
496 static kmutex_t pageout_mutex
; /* held while pageout or schedpaging running */
499 * Pool of available async pageout putpage requests.
501 static struct async_reqs
*push_req
;
502 static struct async_reqs
*req_freelist
; /* available req structs */
503 static struct async_reqs
*push_list
; /* pending reqs */
504 static kmutex_t push_lock
; /* protects req pool */
505 static kcondvar_t push_cv
;
507 static int async_list_size
= 256; /* number of async request structs */
509 static void pageout_scanner(void);
512 * If a page is being shared more than "po_share" times
513 * then leave it alone- don't page it out.
515 #define MIN_PO_SHARE (8)
516 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
517 ulong_t po_share
= MIN_PO_SHARE
;
520 * Schedule rate for paging.
521 * Rate is linear interpolation between
522 * slowscan with lotsfree and fastscan when out of memory.
525 schedpaging(void *arg
)
529 if (freemem
< lotsfree
+ needfree
+ kmem_reapahead
)
532 if (freemem
< lotsfree
+ needfree
)
535 if (kcage_on
&& (kcage_freemem
< kcage_desfree
|| kcage_needfree
))
536 kcage_cageout_wakeup();
538 if (mutex_tryenter(&pageout_mutex
)) {
539 /* pageout() not running */
541 vavail
= freemem
- deficit
;
542 if (pageout_new_spread
!= 0)
546 if (vavail
> lotsfree
)
550 * Fix for 1161438 (CRS SPR# 73922). All variables
551 * in the original calculation for desscan were 32 bit signed
552 * ints. As freemem approaches 0x0 on a system with 1 Gig or
553 * more of memory, the calculation can overflow. When this
554 * happens, desscan becomes negative and pageout_scanner()
557 if ((needfree
) && (pageout_new_spread
== 0)) {
559 * If we've not yet collected enough samples to
560 * calculate a spread, use the old logic of kicking
561 * into high gear anytime needfree is non-zero.
563 desscan
= fastscan
/ RATETOSCHEDPAGING
;
566 * Once we've calculated a spread based on system
567 * memory and usage, just treat needfree as another
570 spgcnt_t faststmp
, slowstmp
, result
;
572 slowstmp
= slowscan
* vavail
;
573 faststmp
= fastscan
* (lotsfree
- vavail
);
574 result
= (slowstmp
+ faststmp
) /
575 nz(lotsfree
) / RATETOSCHEDPAGING
;
576 desscan
= (pgcnt_t
)result
;
579 pageout_ticks
= min_pageout_ticks
+ (lotsfree
- vavail
) *
580 (max_pageout_ticks
- min_pageout_ticks
) / nz(lotsfree
);
582 if (freemem
< lotsfree
+ needfree
||
583 pageout_sample_cnt
< pageout_sample_lim
) {
584 TRACE_1(TR_FAC_VM
, TR_PAGEOUT_CV_SIGNAL
,
585 "pageout_cv_signal:freemem %ld", freemem
);
586 cv_signal(&proc_pageout
->p_cv
);
589 * There are enough free pages, no need to
590 * kick the scanner thread. And next time
591 * around, keep more of the `highly shared'
595 if (po_share
> MIN_PO_SHARE
) {
599 mutex_exit(&pageout_mutex
);
603 * Signal threads waiting for available memory.
604 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
605 * in this case it is not needed - the waiters will be woken up during
606 * the next invocation of this function.
608 if (kmem_avail() > 0)
609 cv_broadcast(&memavail_cv
);
611 (void) timeout(schedpaging
, arg
, hz
/ RATETOSCHEDPAGING
);
615 ulong_t push_list_size
; /* # of requests on pageout queue */
620 int dopageout
= 1; /* must be non-zero to turn page stealing on */
623 * The page out daemon, which runs as process 2.
625 * As long as there are at least lotsfree pages,
626 * this process is not run. When the number of free
627 * pages stays in the range desfree to lotsfree,
628 * this daemon runs through the pages in the loop
629 * at a rate determined in schedpaging(). Pageout manages
630 * two hands on the clock. The front hand moves through
631 * memory, clearing the reference bit,
632 * and stealing pages from procs that are over maxrss.
633 * The back hand travels a distance behind the front hand,
634 * freeing the pages that have not been referenced in the time
635 * since the front hand passed. If modified, they are pushed to
636 * swap before being freed.
638 * There are 2 threads that act on behalf of the pageout process.
639 * One thread scans pages (pageout_scanner) and frees them up if
640 * they don't require any fop_putpage operation. If a page must be
641 * written back to its backing store, the request is put on a list
642 * and the other (pageout) thread is signaled. The pageout thread
643 * grabs fop_putpage requests from the list, and processes them.
644 * Some filesystems may require resources for the fop_putpage
645 * operations (like memory) and hence can block the pageout
646 * thread, but the scanner thread can still operate. There is still
647 * no guarantee that memory deadlocks cannot occur.
649 * For now, this thing is in very rough form.
654 struct async_reqs
*arg
;
660 proc_pageout
= ttoproc(curthread
);
661 proc_pageout
->p_cstime
= 0;
662 proc_pageout
->p_stime
= 0;
663 proc_pageout
->p_cutime
= 0;
664 proc_pageout
->p_utime
= 0;
665 bcopy("pageout", PTOU(curproc
)->u_psargs
, 8);
666 bcopy("pageout", PTOU(curproc
)->u_comm
, 7);
669 * Create pageout scanner thread
671 mutex_init(&pageout_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
672 mutex_init(&push_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
675 * Allocate and initialize the async request structures
678 push_req
= (struct async_reqs
*)
679 kmem_zalloc(async_list_size
* sizeof (struct async_reqs
), KM_SLEEP
);
681 req_freelist
= push_req
;
682 for (i
= 0; i
< async_list_size
- 1; i
++)
683 push_req
[i
].a_next
= &push_req
[i
+ 1];
685 pageout_pri
= curthread
->t_pri
;
687 /* Create the pageout scanner thread. */
688 (void) lwp_kernel_create(proc_pageout
, pageout_scanner
, NULL
, TS_RUN
,
692 * kick off pageout scheduler.
697 * Create kernel cage thread.
698 * The kernel cage thread is started under the pageout process
699 * to take advantage of the less restricted page allocation
700 * in page_create_throttle().
702 kcage_cageout_init();
705 * Limit pushes to avoid saturating pageout devices.
707 max_pushes
= maxpgio
/ RATETOSCHEDPAGING
;
708 CALLB_CPR_INIT(&cprinfo
, &push_lock
, callb_generic_cpr
, "pageout");
711 mutex_enter(&push_lock
);
713 while ((arg
= push_list
) == NULL
|| pushes
> max_pushes
) {
714 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
715 cv_wait(&push_cv
, &push_lock
);
717 CALLB_CPR_SAFE_END(&cprinfo
, &push_lock
);
719 push_list
= arg
->a_next
;
721 mutex_exit(&push_lock
);
723 if (fop_putpage(arg
->a_vp
, (offset_t
)arg
->a_off
,
724 arg
->a_len
, arg
->a_flags
, arg
->a_cred
, NULL
) == 0) {
728 /* vp held by checkpage() */
731 mutex_enter(&push_lock
);
732 arg
->a_next
= req_freelist
; /* back on freelist */
735 mutex_exit(&push_lock
);
740 * Kernel thread that scans pages looking for ones to free
743 pageout_scanner(void)
745 struct page
*fronthand
, *backhand
;
751 CALLB_CPR_INIT(&cprinfo
, &pageout_mutex
, callb_generic_cpr
, "poscan");
752 mutex_enter(&pageout_mutex
);
755 * The restart case does not attempt to point the hands at roughly
756 * the right point on the assumption that after one circuit things
757 * will have settled down - and restarts shouldn't be that often.
761 * Set the two clock hands to be separated by a reasonable amount,
762 * but no more than 360 degrees apart.
764 backhand
= page_first();
765 if (handspreadpages
>= total_pages
)
766 fronthand
= page_nextn(backhand
, total_pages
- 1);
768 fronthand
= page_nextn(backhand
, handspreadpages
);
770 min_pageout_ticks
= MAX(1,
771 ((hz
* min_percent_cpu
) / 100) / RATETOSCHEDPAGING
);
772 max_pageout_ticks
= MAX(min_pageout_ticks
,
773 ((hz
* max_percent_cpu
) / 100) / RATETOSCHEDPAGING
);
778 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
779 cv_wait(&proc_pageout
->p_cv
, &pageout_mutex
);
780 CALLB_CPR_SAFE_END(&cprinfo
, &pageout_mutex
);
788 backhand
= page_first();
789 if (handspreadpages
>= total_pages
)
790 fronthand
= page_nextn(backhand
, total_pages
- 1);
792 fronthand
= page_nextn(backhand
, handspreadpages
);
795 CPU_STATS_ADDQ(CPU
, vm
, pgrrun
, 1);
798 TRACE_4(TR_FAC_VM
, TR_PAGEOUT_START
,
799 "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
800 freemem
, lotsfree
, nscan
, desscan
);
803 if (pageout_sample_cnt
< pageout_sample_lim
) {
804 nscan_limit
= total_pages
;
806 nscan_limit
= desscan
;
808 pageout_lbolt
= ddi_get_lbolt();
809 sample_start
= gethrtime();
812 * Scan the appropriate number of pages for a single duty cycle.
813 * However, stop scanning as soon as there is enough free memory.
814 * For a short while, we will be sampling the performance of the
815 * scanner and need to keep running just to get sample data, in
816 * which case we keep going and don't pay attention to whether
817 * or not there is enough free memory.
820 while (nscan
< nscan_limit
&& (freemem
< lotsfree
+ needfree
||
821 pageout_sample_cnt
< pageout_sample_lim
)) {
825 * Check to see if we have exceeded our %CPU budget
826 * for this wakeup, but not on every single page visited,
827 * just every once in a while.
829 if ((pcount
& PAGES_POLL_MASK
) == PAGES_POLL_MASK
) {
830 pageout_cycle_ticks
= ddi_get_lbolt() - pageout_lbolt
;
831 if (pageout_cycle_ticks
>= pageout_ticks
) {
838 * If checkpage manages to add a page to the free list,
839 * we give ourselves another couple of trips around the loop.
841 if ((rvfront
= checkpage(fronthand
, FRONT
)) == 1)
843 if ((rvback
= checkpage(backhand
, BACK
)) == 1)
849 * protected by pageout_mutex instead of cpu_stat_lock
851 CPU_STATS_ADDQ(CPU
, vm
, scan
, 1);
854 * Don't include ineligible pages in the number scanned.
856 if (rvfront
!= -1 || rvback
!= -1)
859 backhand
= page_next(backhand
);
862 * backhand update and wraparound check are done separately
863 * because lint barks when it finds an empty "if" body
866 if ((fronthand
= page_next(fronthand
)) == page_first()) {
867 TRACE_2(TR_FAC_VM
, TR_PAGEOUT_HAND_WRAP
,
868 "pageout_hand_wrap:freemem %ld whichhand %d",
872 * protected by pageout_mutex instead of cpu_stat_lock
874 CPU_STATS_ADDQ(CPU
, vm
, rev
, 1);
877 * Extremely unlikely, but it happens.
878 * We went around the loop at least once
879 * and didn't get far enough.
880 * If we are still skipping `highly shared'
881 * pages, skip fewer of them. Otherwise,
882 * give up till the next clock tick.
884 if (po_share
< MAX_PO_SHARE
) {
888 * Really a "goto loop", but
889 * if someone is TRACing, at least
890 * make records to show where we
899 sample_end
= gethrtime();
901 TRACE_5(TR_FAC_VM
, TR_PAGEOUT_END
,
902 "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
903 freemem
, lotsfree
, nscan
, desscan
, count
);
905 if (pageout_sample_cnt
< pageout_sample_lim
) {
906 pageout_sample_pages
+= pcount
;
907 pageout_sample_etime
+= sample_end
- sample_start
;
908 ++pageout_sample_cnt
;
910 if (pageout_sample_cnt
>= pageout_sample_lim
&&
911 pageout_new_spread
== 0) {
912 pageout_rate
= (hrrate_t
)pageout_sample_pages
*
913 (hrrate_t
)(NANOSEC
) / pageout_sample_etime
;
914 pageout_new_spread
= pageout_rate
/ 10;
922 * Look at the page at hand. If it is locked (e.g., for physical i/o),
923 * system (u., page table) or free, then leave it alone. Otherwise,
924 * if we are running the front hand, turn off the page's reference bit.
925 * If the proc is over maxrss, we take it. If running the back hand,
926 * check whether the page has been reclaimed. If not, free the page,
927 * pushing it to disk first if necessary.
930 * -1 if the page is not a candidate at all,
935 checkpage(struct page
*pp
, int whichhand
)
944 * - associated with the kernel vnode since
945 * they are always "exclusively" locked.
947 * - that are shared more than po_share'd times
948 * - that are already locked
950 * NOTE: These optimizations assume that reads are atomic.
953 if (PP_ISKAS(pp
) || PAGE_LOCKED(pp
) || PP_ISFREE(pp
) ||
954 pp
->p_lckcnt
!= 0 || pp
->p_cowcnt
!= 0 ||
955 hat_page_checkshare(pp
, po_share
)) {
959 if (!page_trylock(pp
, SE_EXCL
)) {
961 * Skip the page if we can't acquire the "exclusive" lock.
964 } else if (PP_ISFREE(pp
)) {
966 * It became free between the above check and our actually
967 * locking the page. Oh, well there will be other pages.
974 * Reject pages that cannot be freed. The page_struct_lock
975 * need not be acquired to examine these
976 * fields since the page has an "exclusive" lock.
978 if (pp
->p_lckcnt
!= 0 || pp
->p_cowcnt
!= 0) {
984 * Maintain statistics for what we are freeing
987 if (pp
->p_vnode
!= NULL
) {
988 if (pp
->p_vnode
->v_flag
& VVMEXEC
)
991 if (!IS_SWAPFSVP(pp
->p_vnode
))
996 * Turn off REF and MOD bits with the front hand.
997 * The back hand examines the REF bit and always considers
998 * SHARED pages as referenced.
1000 if (whichhand
== FRONT
)
1001 pagesync_flag
= HAT_SYNC_ZERORM
;
1003 pagesync_flag
= HAT_SYNC_DONTZERO
| HAT_SYNC_STOPON_REF
|
1004 HAT_SYNC_STOPON_SHARED
;
1006 ppattr
= hat_pagesync(pp
, pagesync_flag
);
1010 * If page is referenced; make unreferenced but reclaimable.
1011 * If this page is not referenced, then it must be reclaimable
1012 * and we can add it to the free list.
1014 if (ppattr
& P_REF
) {
1015 TRACE_2(TR_FAC_VM
, TR_PAGEOUT_ISREF
,
1016 "pageout_isref:pp %p whichhand %d", pp
, whichhand
);
1017 if (whichhand
== FRONT
) {
1019 * Checking of rss or madvise flags needed here...
1021 * If not "well-behaved", fall through into the code
1022 * for not referenced.
1027 * Somebody referenced the page since the front
1028 * hand went by, so it's not a candidate for
1035 VM_STAT_ADD(pageoutvmstats
.checkpage
[0]);
1038 * If large page, attempt to demote it. If successfully demoted,
1039 * retry the checkpage.
1041 if (pp
->p_szc
!= 0) {
1042 if (!page_try_demote_pages(pp
)) {
1043 VM_STAT_ADD(pageoutvmstats
.checkpage
[1]);
1047 ASSERT(pp
->p_szc
== 0);
1048 VM_STAT_ADD(pageoutvmstats
.checkpage
[2]);
1050 * since page_try_demote_pages() could have unloaded some
1051 * mappings it makes sense to reload ppattr.
1053 ppattr
= hat_page_getattr(pp
, P_MOD
| P_REF
);
1057 * If the page is currently dirty, we have to arrange
1058 * to have it cleaned before it can be freed.
1060 * XXX - ASSERT(pp->p_vnode != NULL);
1062 if ((ppattr
& P_MOD
) && pp
->p_vnode
) {
1063 struct vnode
*vp
= pp
->p_vnode
;
1064 uoff_t offset
= pp
->p_offset
;
1067 * XXX - Test for process being swapped out or about to exit?
1068 * [Can't get back to process(es) using the page.]
1072 * Hold the vnode before releasing the page lock to
1073 * prevent it from being freed and re-used by some
1080 * Queue i/o request for the pageout thread.
1082 if (!queue_io_request(vp
, offset
)) {
1090 * Now we unload all the translations,
1091 * and put the page back on to the free list.
1092 * If the page was used (referenced or modified) after
1093 * the pagesync but before it was unloaded we catch it
1094 * and handle the page properly.
1096 TRACE_2(TR_FAC_VM
, TR_PAGEOUT_FREE
,
1097 "pageout_free:pp %p whichhand %d", pp
, whichhand
);
1098 (void) hat_pageunload(pp
, HAT_FORCE_PGUNLOAD
);
1099 ppattr
= hat_page_getattr(pp
, P_MOD
| P_REF
);
1100 if ((ppattr
& P_REF
) || ((ppattr
& P_MOD
) && pp
->p_vnode
))
1103 VN_DISPOSE(pp
, B_FREE
, 0, kcred
);
1105 CPU_STATS_ADD_K(vm
, dfree
, 1);
1109 CPU_STATS_ADD_K(vm
, execfree
, 1);
1111 CPU_STATS_ADD_K(vm
, fsfree
, 1);
1114 CPU_STATS_ADD_K(vm
, anonfree
, 1);
1117 return (1); /* freed a page! */
1121 * Queue async i/o request from pageout_scanner and segment swapout
1122 * routines on one common list. This ensures that pageout devices (swap)
1123 * are not saturated by pageout_scanner or swapout requests.
1124 * The pageout thread empties this list by initiating i/o operations.
1127 queue_io_request(vnode_t
*vp
, uoff_t off
)
1129 struct async_reqs
*arg
;
1132 * If we cannot allocate an async request struct,
1135 mutex_enter(&push_lock
);
1136 if ((arg
= req_freelist
) == NULL
) {
1137 mutex_exit(&push_lock
);
1140 req_freelist
= arg
->a_next
; /* adjust freelist */
1145 arg
->a_len
= PAGESIZE
;
1146 arg
->a_flags
= B_ASYNC
| B_FREE
;
1147 arg
->a_cred
= kcred
; /* always held */
1150 * Add to list of pending write requests.
1152 arg
->a_next
= push_list
;
1155 if (req_freelist
== NULL
) {
1157 * No free async requests left. The lock is held so we
1158 * might as well signal the pusher thread now.
1160 cv_signal(&push_cv
);
1162 mutex_exit(&push_lock
);
1167 * Wakeup pageout to initiate i/o if push_list is not empty.
1172 if (push_list
!= NULL
) {
1173 mutex_enter(&push_lock
);
1174 cv_signal(&push_cv
);
1175 mutex_exit(&push_lock
);