Merge commit '7e934d3acc051b7ee3ef0d11571fd1225800a607'
[unleashed.git] / kernel / os / vm_pageout.c
blobd4acbdafe294e1602db4b5688706a8332ce33cb1
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
39 #include <sys/types.h>
40 #include <sys/t_lock.h>
41 #include <sys/param.h>
42 #include <sys/buf.h>
43 #include <sys/uio.h>
44 #include <sys/proc.h>
45 #include <sys/systm.h>
46 #include <sys/mman.h>
47 #include <sys/cred.h>
48 #include <sys/vnode.h>
49 #include <sys/vm.h>
50 #include <sys/vmparam.h>
51 #include <sys/vtrace.h>
52 #include <sys/cmn_err.h>
53 #include <sys/cpuvar.h>
54 #include <sys/user.h>
55 #include <sys/kmem.h>
56 #include <sys/debug.h>
57 #include <sys/callb.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/time.h>
61 #include <vm/hat.h>
62 #include <vm/as.h>
63 #include <vm/seg.h>
64 #include <vm/page.h>
65 #include <vm/pvn.h>
66 #include <vm/seg_kmem.h>
/* Examine one page on behalf of a clock hand (FRONT or BACK). */
static int checkpage(page_t *, int);

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time
 * based on the size of the system.  If they are patched non-zero in
 * a loaded vmunix they are left alone and may thus be changed per system
 * using adb on the loaded system.
 */
pgcnt_t slowscan = 0;		/* scan rate when freemem is near lotsfree */
pgcnt_t fastscan = 0;		/* scan rate when out of memory */

static pgcnt_t handspreadpages = 0;	/* pages between front and back hand */
static int loopfraction = 2;	/* fastscan is capped at looppages / this */
static pgcnt_t looppages;	/* size of the clock loop, set by setupclock */
static int min_percent_cpu = 4;	/* basis for min_pageout_ticks */
static int max_percent_cpu = 80;	/* basis for max_pageout_ticks */
static pgcnt_t maxfastscan = 0;	/* upper bound for a computed fastscan */
static pgcnt_t maxslowscan = 100;	/* upper bound for a computed slowscan */

/* Free-memory thresholds and I/O limit; see setupclock() for derivation. */
pgcnt_t maxpgio = 0;
pgcnt_t minfree = 0;
pgcnt_t desfree = 0;
pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
pgcnt_t throttlefree = 0;
pgcnt_t pageout_reserve = 0;

pgcnt_t deficit;		/* subtracted from freemem by schedpaging() */
pgcnt_t nscan;			/* pages examined in the current pass */
pgcnt_t desscan;		/* pages to examine in the next pass */
/*
 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 * are the number of ticks in each wakeup cycle that gives the
 * equivalent of some underlying %CPU duty cycle.
 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
 * awakened every 25 clock ticks.  So, converting from %CPU to ticks
 * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 * So, for example, 4% == 1 tick and 80% == 20 ticks.
 *
 * min_pageout_ticks:
 *	ticks/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_ticks:
 *	ticks/wakeup equivalent of max_percent_cpu.
 *
 * pageout_ticks:
 *	Number of clock ticks budgeted for each wakeup cycle.
 *	Computed each time around by schedpaging().
 *	Varies between min_pageout_ticks .. max_pageout_ticks,
 *	depending on memory pressure.
 *
 * pageout_lbolt:
 *	Timestamp of the last time pageout_scanner woke up and started
 *	(or resumed) scanning for not recently referenced pages.
 */

static clock_t min_pageout_ticks;
static clock_t max_pageout_ticks;
static clock_t pageout_ticks;
static clock_t pageout_lbolt;

/* Set by setupclock(recalc != 0); tells the scanner to reposition its hands. */
static uint_t reset_hands;

/* The scanner checks its tick budget once every PAGES_POLL_MASK + 1 pages. */
#define	PAGES_POLL_MASK	1023
/*
 * pageout_sample_lim:
 *	The limit on the number of samples needed to establish a value
 *	for new pageout parameters, fastscan, slowscan, and handspreadpages.
 *
 * pageout_sample_cnt:
 *	Current sample number.  Once the sample gets large enough,
 *	set new values for handspreadpages, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *	The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *	The accumulated elapsed time (gethrtime() deltas) for the samples.
 *
 * pageout_rate:
 *	Rate in pages/second, computed at the end of sampling.
 *
 * pageout_new_spread:
 *	The new value to use for fastscan and handspreadpages.
 *	Calculated after enough samples have been taken.
 */

typedef hrtime_t hrrate_t;

static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;

/* Ticks consumed so far in the current scanner wakeup cycle. */
static clock_t pageout_cycle_ticks;
/* gethrtime() stamps taken around each scanning pass. */
static hrtime_t sample_start, sample_end;
static hrtime_t pageout_sample_etime = 0;
/*
 * Record number of times a pageout_scanner wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
/*
 * checkpage() counters:
 *	[0] passes that found the page unreferenced,
 *	[1] large pages that could not be demoted,
 *	[2] large pages successfully demoted.
 */
static struct pageoutvmstats_str {
	ulong_t checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t memavail_lock;
kcondvar_t memavail_cv;

/*
 * The size of the clock loop.
 */
#define	LOOPPAGES	total_pages
196 * Set up the paging constants for the clock algorithm.
197 * Called after the system is initialized and the amount of memory
198 * and number of paging devices is known.
200 * lotsfree is 1/64 of memory, but at least 512K.
201 * desfree is 1/2 of lotsfree.
202 * minfree is 1/2 of desfree.
204 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
206 * lotsfree = btop(512K)
207 * desfree = btop(200K)
208 * minfree = btop(100K)
209 * throttlefree = INT_MIN
210 * max_percent_cpu = 4
212 void
213 setupclock(int recalc)
216 static spgcnt_t init_lfree, init_dfree, init_mfree;
217 static spgcnt_t init_tfree, init_preserve, init_mpgio;
218 static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
220 looppages = LOOPPAGES;
223 * setupclock can now be called to recalculate the paging
224 * parameters in the case of dynamic addition of memory.
225 * So to make sure we make the proper calculations, if such a
226 * situation should arise, we save away the initial values
227 * of each parameter so we can recall them when needed. This
228 * way we don't lose the settings an admin might have made
229 * through the /etc/system file.
232 if (!recalc) {
233 init_lfree = lotsfree;
234 init_dfree = desfree;
235 init_mfree = minfree;
236 init_tfree = throttlefree;
237 init_preserve = pageout_reserve;
238 init_mpgio = maxpgio;
239 init_mfscan = maxfastscan;
240 init_fscan = fastscan;
241 init_sscan = slowscan;
242 init_hspages = handspreadpages;
246 * Set up thresholds for paging:
250 * Lotsfree is threshold where paging daemon turns on.
252 if (init_lfree == 0 || init_lfree >= looppages)
253 lotsfree = MAX(looppages / 64, btop(512 * 1024));
254 else
255 lotsfree = init_lfree;
258 * Desfree is amount of memory desired free.
259 * If less than this for extended period, start swapping.
261 if (init_dfree == 0 || init_dfree >= lotsfree)
262 desfree = lotsfree / 2;
263 else
264 desfree = init_dfree;
267 * Minfree is minimal amount of free memory which is tolerable.
269 if (init_mfree == 0 || init_mfree >= desfree)
270 minfree = desfree / 2;
271 else
272 minfree = init_mfree;
275 * Throttlefree is the point at which we start throttling
276 * PG_WAIT requests until enough memory becomes available.
278 if (init_tfree == 0 || init_tfree >= desfree)
279 throttlefree = minfree;
280 else
281 throttlefree = init_tfree;
284 * Pageout_reserve is the number of pages that we keep in
285 * stock for pageout's own use. Having a few such pages
286 * provides insurance against system deadlock due to
287 * pageout needing pages. When freemem < pageout_reserve,
288 * non-blocking allocations are denied to any threads
289 * other than pageout and sched. (At some point we might
290 * want to consider a per-thread flag like T_PUSHING_PAGES
291 * to indicate that a thread is part of the page-pushing
292 * dance (e.g. an interrupt thread) and thus is entitled
293 * to the same special dispensation we accord pageout.)
295 if (init_preserve == 0 || init_preserve >= throttlefree)
296 pageout_reserve = throttlefree / 2;
297 else
298 pageout_reserve = init_preserve;
301 * Maxpgio thresholds how much paging is acceptable.
302 * This figures that 2/3 busy on an arm is all that is
303 * tolerable for paging. We assume one operation per disk rev.
305 * XXX - Does not account for multiple swap devices.
307 if (init_mpgio == 0)
308 maxpgio = (DISKRPM * 2) / 3;
309 else
310 maxpgio = init_mpgio;
313 * The clock scan rate varies between fastscan and slowscan
314 * based on the amount of free memory available. Fastscan
315 * rate should be set based on the number pages that can be
316 * scanned per sec using ~10% of processor time. Since this
317 * value depends on the processor, MMU, Mhz etc., it is
318 * difficult to determine it in a generic manner for all
319 * architectures.
321 * Instead of trying to determine the number of pages scanned
322 * per sec for every processor, fastscan is set to be the smaller
323 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
324 * time is limited to ~4% of processor time.
326 * Setting fastscan to be 1/2 of memory allows pageout to scan
327 * all of memory in ~2 secs. This implies that user pages not
328 * accessed within 1 sec (assuming, handspreadpages == fastscan)
329 * can be reclaimed when free memory is very low. Stealing pages
330 * not accessed within 1 sec seems reasonable and ensures that
331 * active user processes don't thrash.
333 * Smaller values of fastscan result in scanning fewer pages
334 * every second and consequently pageout may not be able to free
335 * sufficient memory to maintain the minimum threshold. Larger
336 * values of fastscan result in scanning a lot more pages which
337 * could lead to thrashing and higher CPU usage.
339 * Fastscan needs to be limited to a maximum value and should not
340 * scale with memory to prevent pageout from consuming too much
341 * time for scanning on slow CPU's and avoid thrashing, as a
342 * result of scanning too many pages, on faster CPU's.
343 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
344 * (the upper bound for fastscan) based on the average number
345 * of pages that can potentially be scanned in ~1 sec (using ~4%
346 * of the CPU) on some of the following machines that currently
347 * run Solaris 2.x:
349 * average memory scanned in ~1 sec
351 * 25 Mhz SS1+: 23 Meg
352 * LX: 37 Meg
353 * 50 Mhz SC2000: 68 Meg
355 * 40 Mhz 486: 26 Meg
356 * 66 Mhz 486: 42 Meg
358 * When free memory falls just below lotsfree, the scan rate
359 * goes from 0 to slowscan (i.e., pageout starts running). This
360 * transition needs to be smooth and is achieved by ensuring that
361 * pageout scans a small number of pages to satisfy the transient
362 * memory demand. This is set to not exceed 100 pages/sec (25 per
363 * wakeup) since scanning that many pages has no noticible impact
364 * on system performance.
366 * In addition to setting fastscan and slowscan, pageout is
367 * limited to using ~4% of the CPU. This results in increasing
368 * the time taken to scan all of memory, which in turn means that
369 * user processes have a better opportunity of preventing their
370 * pages from being stolen. This has a positive effect on
371 * interactive and overall system performance when memory demand
372 * is high.
374 * Thus, the rate at which pages are scanned for replacement will
375 * vary linearly between slowscan and the number of pages that
376 * can be scanned using ~4% of processor time instead of varying
377 * linearly between slowscan and fastscan.
379 * Also, the processor time used by pageout will vary from ~1%
380 * at slowscan to ~4% at fastscan instead of varying between
381 * ~1% at slowscan and ~10% at fastscan.
383 * The values chosen for the various VM parameters (fastscan,
384 * handspreadpages, etc) are not universally true for all machines,
385 * but appear to be a good rule of thumb for the machines we've
386 * tested. They have the following ranges:
388 * cpu speed: 20 to 70 Mhz
389 * page size: 4K to 8K
390 * memory size: 16M to 5G
391 * page scan rate: 4000 - 17400 4K pages per sec
393 * The values need to be re-examined for machines which don't
394 * fall into the various ranges (e.g., slower or faster CPUs,
395 * smaller or larger pagesizes etc) shown above.
397 * On an MP machine, pageout is often unable to maintain the
398 * minimum paging thresholds under heavy load. This is due to
399 * the fact that user processes running on other CPU's can be
400 * dirtying memory at a much faster pace than pageout can find
401 * pages to free. The memory demands could be met by enabling
402 * more than one CPU to run the clock algorithm in such a manner
403 * that the various clock hands don't overlap. This also makes
404 * it more difficult to determine the values for fastscan, slowscan
405 * and handspreadpages.
407 * The swapper is currently used to free up memory when pageout
408 * is unable to meet memory demands by swapping out processes.
409 * In addition to freeing up memory, swapping also reduces the
410 * demand for memory by preventing user processes from running
411 * and thereby consuming memory.
413 if (init_mfscan == 0) {
414 if (pageout_new_spread != 0)
415 maxfastscan = pageout_new_spread;
416 else
417 maxfastscan = MAXHANDSPREADPAGES;
418 } else {
419 maxfastscan = init_mfscan;
421 if (init_fscan == 0)
422 fastscan = MIN(looppages / loopfraction, maxfastscan);
423 else
424 fastscan = init_fscan;
425 if (fastscan > looppages / loopfraction)
426 fastscan = looppages / loopfraction;
429 * Set slow scan time to 1/10 the fast scan time, but
430 * not to exceed maxslowscan.
432 if (init_sscan == 0)
433 slowscan = MIN(fastscan / 10, maxslowscan);
434 else
435 slowscan = init_sscan;
436 if (slowscan > fastscan / 2)
437 slowscan = fastscan / 2;
440 * Handspreadpages is distance (in pages) between front and back
441 * pageout daemon hands. The amount of time to reclaim a page
442 * once pageout examines it increases with this distance and
443 * decreases as the scan rate rises. It must be < the amount
444 * of pageable memory.
446 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
447 * to be "fastscan" results in the front hand being a few secs
448 * (varies based on the processor speed) ahead of the back hand
449 * at fastscan rates. This distance can be further reduced, if
450 * necessary, by increasing the processor time used by pageout
451 * to be more than ~4% and preferrably not more than ~10%.
453 * As a result, user processes have a much better chance of
454 * referencing their pages before the back hand examines them.
455 * This also significantly lowers the number of reclaims from
456 * the freelist since pageout does not end up freeing pages which
457 * may be referenced a sec later.
459 if (init_hspages == 0)
460 handspreadpages = fastscan;
461 else
462 handspreadpages = init_hspages;
465 * Make sure that back hand follows front hand by at least
466 * 1/RATETOSCHEDPAGING seconds. Without this test, it is possible
467 * for the back hand to look at a page during the same wakeup of
468 * the pageout daemon in which the front hand cleared its ref bit.
470 if (handspreadpages >= looppages)
471 handspreadpages = looppages - 1;
474 * If we have been called to recalculate the parameters,
475 * set a flag to re-evaluate the clock hand pointers.
477 if (recalc)
478 reset_hands = 1;
/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan RATETOSCHEDPAGING
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging sets this value based on the amount of
 * currently available memory.
 */

#define	RATETOSCHEDPAGING	4		/* hz that is */

static kmutex_t pageout_mutex;	/* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;	/* base of the allocated array */
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;		/* pageout() waits here for work */

static int async_list_size = 256;	/* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone- don't page it out.
 * schedpaging() halves po_share (down to MIN_PO_SHARE) when memory is
 * plentiful; pageout_scanner() doubles it (up to MAX_PO_SHARE) when a
 * full lap of the clock frees nothing.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t po_share = MIN_PO_SHARE;
519 * Schedule rate for paging.
520 * Rate is linear interpolation between
521 * slowscan with lotsfree and fastscan when out of memory.
523 static void
524 schedpaging(void *arg)
526 spgcnt_t vavail;
528 if (freemem < lotsfree + needfree + kmem_reapahead)
529 kmem_reap();
531 if (freemem < lotsfree + needfree)
532 seg_preap();
534 if (mutex_tryenter(&pageout_mutex)) {
535 /* pageout() not running */
536 nscan = 0;
537 vavail = freemem - deficit;
538 if (pageout_new_spread != 0)
539 vavail -= needfree;
540 if (vavail < 0)
541 vavail = 0;
542 if (vavail > lotsfree)
543 vavail = lotsfree;
546 * Fix for 1161438 (CRS SPR# 73922). All variables
547 * in the original calculation for desscan were 32 bit signed
548 * ints. As freemem approaches 0x0 on a system with 1 Gig or
549 * more of memory, the calculation can overflow. When this
550 * happens, desscan becomes negative and pageout_scanner()
551 * stops paging out.
553 if ((needfree) && (pageout_new_spread == 0)) {
555 * If we've not yet collected enough samples to
556 * calculate a spread, use the old logic of kicking
557 * into high gear anytime needfree is non-zero.
559 desscan = fastscan / RATETOSCHEDPAGING;
560 } else {
562 * Once we've calculated a spread based on system
563 * memory and usage, just treat needfree as another
564 * form of deficit.
566 spgcnt_t faststmp, slowstmp, result;
568 slowstmp = slowscan * vavail;
569 faststmp = fastscan * (lotsfree - vavail);
570 result = (slowstmp + faststmp) /
571 nz(lotsfree) / RATETOSCHEDPAGING;
572 desscan = (pgcnt_t)result;
575 pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
576 (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
578 if (freemem < lotsfree + needfree ||
579 pageout_sample_cnt < pageout_sample_lim) {
580 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
581 "pageout_cv_signal:freemem %ld", freemem);
582 cv_signal(&proc_pageout->p_cv);
583 } else {
585 * There are enough free pages, no need to
586 * kick the scanner thread. And next time
587 * around, keep more of the `highly shared'
588 * pages.
590 cv_signal_pageout();
591 if (po_share > MIN_PO_SHARE) {
592 po_share >>= 1;
595 mutex_exit(&pageout_mutex);
599 * Signal threads waiting for available memory.
600 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
601 * in this case it is not needed - the waiters will be waken up during
602 * the next invocation of this function.
604 if (kmem_avail() > 0)
605 cv_broadcast(&memavail_cv);
607 (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
pgcnt_t pushes;		/* successful putpages since pageout() last waited */
ulong_t push_list_size;		/* # of requests on pageout queue */

/* Values for the "whichhand" argument of checkpage(). */
#define	FRONT	1
#define	BACK	2

int dopageout = 1;	/* must be non-zero to turn page stealing on */
619 * The page out daemon, which runs as process 2.
621 * As long as there are at least lotsfree pages,
622 * this process is not run. When the number of free
623 * pages stays in the range desfree to lotsfree,
624 * this daemon runs through the pages in the loop
625 * at a rate determined in schedpaging(). Pageout manages
626 * two hands on the clock. The front hand moves through
627 * memory, clearing the reference bit,
628 * and stealing pages from procs that are over maxrss.
629 * The back hand travels a distance behind the front hand,
630 * freeing the pages that have not been referenced in the time
631 * since the front hand passed. If modified, they are pushed to
632 * swap before being freed.
634 * There are 2 threads that act on behalf of the pageout process.
635 * One thread scans pages (pageout_scanner) and frees them up if
636 * they don't require any fop_putpage operation. If a page must be
637 * written back to its backing store, the request is put on a list
638 * and the other (pageout) thread is signaled. The pageout thread
639 * grabs fop_putpage requests from the list, and processes them.
640 * Some filesystems may require resources for the fop_putpage
641 * operations (like memory) and hence can block the pageout
642 * thread, but the scanner thread can still operate. There is still
643 * no guarantee that memory deadlocks cannot occur.
645 * For now, this thing is in very rough form.
647 void
648 pageout()
650 struct async_reqs *arg;
651 pri_t pageout_pri;
652 int i;
653 pgcnt_t max_pushes;
654 callb_cpr_t cprinfo;
656 proc_pageout = ttoproc(curthread);
657 proc_pageout->p_cstime = 0;
658 proc_pageout->p_stime = 0;
659 proc_pageout->p_cutime = 0;
660 proc_pageout->p_utime = 0;
661 bcopy("pageout", PTOU(curproc)->u_psargs, 8);
662 bcopy("pageout", PTOU(curproc)->u_comm, 7);
665 * Create pageout scanner thread
667 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
668 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
671 * Allocate and initialize the async request structures
672 * for pageout.
674 push_req = (struct async_reqs *)
675 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
677 req_freelist = push_req;
678 for (i = 0; i < async_list_size - 1; i++)
679 push_req[i].a_next = &push_req[i + 1];
681 pageout_pri = curthread->t_pri;
683 /* Create the pageout scanner thread. */
684 (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
685 pageout_pri - 1);
688 * kick off pageout scheduler.
690 schedpaging(NULL);
693 * Limit pushes to avoid saturating pageout devices.
695 max_pushes = maxpgio / RATETOSCHEDPAGING;
696 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
698 for (;;) {
699 mutex_enter(&push_lock);
701 while ((arg = push_list) == NULL || pushes > max_pushes) {
702 CALLB_CPR_SAFE_BEGIN(&cprinfo);
703 cv_wait(&push_cv, &push_lock);
704 pushes = 0;
705 CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
707 push_list = arg->a_next;
708 arg->a_next = NULL;
709 mutex_exit(&push_lock);
711 if (fop_putpage(arg->a_vp, (offset_t)arg->a_off,
712 arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
713 pushes++;
716 /* vp held by checkpage() */
717 VN_RELE(arg->a_vp);
719 mutex_enter(&push_lock);
720 arg->a_next = req_freelist; /* back on freelist */
721 req_freelist = arg;
722 push_list_size--;
723 mutex_exit(&push_lock);
728 * Kernel thread that scans pages looking for ones to free
730 static void
731 pageout_scanner(void)
733 struct page *fronthand, *backhand;
734 uint_t count;
735 callb_cpr_t cprinfo;
736 pgcnt_t nscan_limit;
737 pgcnt_t pcount;
739 CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
740 mutex_enter(&pageout_mutex);
743 * The restart case does not attempt to point the hands at roughly
744 * the right point on the assumption that after one circuit things
745 * will have settled down - and restarts shouldn't be that often.
749 * Set the two clock hands to be separated by a reasonable amount,
750 * but no more than 360 degrees apart.
752 backhand = page_first();
753 if (handspreadpages >= total_pages)
754 fronthand = page_nextn(backhand, total_pages - 1);
755 else
756 fronthand = page_nextn(backhand, handspreadpages);
758 min_pageout_ticks = MAX(1,
759 ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
760 max_pageout_ticks = MAX(min_pageout_ticks,
761 ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
763 loop:
764 cv_signal_pageout();
766 CALLB_CPR_SAFE_BEGIN(&cprinfo);
767 cv_wait(&proc_pageout->p_cv, &pageout_mutex);
768 CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
770 if (!dopageout)
771 goto loop;
773 if (reset_hands) {
774 reset_hands = 0;
776 backhand = page_first();
777 if (handspreadpages >= total_pages)
778 fronthand = page_nextn(backhand, total_pages - 1);
779 else
780 fronthand = page_nextn(backhand, handspreadpages);
783 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
784 count = 0;
786 TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
787 "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
788 freemem, lotsfree, nscan, desscan);
790 pcount = 0;
791 if (pageout_sample_cnt < pageout_sample_lim) {
792 nscan_limit = total_pages;
793 } else {
794 nscan_limit = desscan;
796 pageout_lbolt = ddi_get_lbolt();
797 sample_start = gethrtime();
800 * Scan the appropriate number of pages for a single duty cycle.
801 * However, stop scanning as soon as there is enough free memory.
802 * For a short while, we will be sampling the performance of the
803 * scanner and need to keep running just to get sample data, in
804 * which case we keep going and don't pay attention to whether
805 * or not there is enough free memory.
808 while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
809 pageout_sample_cnt < pageout_sample_lim)) {
810 int rvfront, rvback;
813 * Check to see if we have exceeded our %CPU budget
814 * for this wakeup, but not on every single page visited,
815 * just every once in a while.
817 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
818 pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
819 if (pageout_cycle_ticks >= pageout_ticks) {
820 ++pageout_timeouts;
821 break;
826 * If checkpage manages to add a page to the free list,
827 * we give ourselves another couple of trips around the loop.
829 if ((rvfront = checkpage(fronthand, FRONT)) == 1)
830 count = 0;
831 if ((rvback = checkpage(backhand, BACK)) == 1)
832 count = 0;
834 ++pcount;
837 * protected by pageout_mutex instead of cpu_stat_lock
839 CPU_STATS_ADDQ(CPU, vm, scan, 1);
842 * Don't include ineligible pages in the number scanned.
844 if (rvfront != -1 || rvback != -1)
845 nscan++;
847 backhand = page_next(backhand);
850 * backhand update and wraparound check are done separately
851 * because lint barks when it finds an empty "if" body
854 if ((fronthand = page_next(fronthand)) == page_first()) {
855 TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
856 "pageout_hand_wrap:freemem %ld whichhand %d",
857 freemem, FRONT);
860 * protected by pageout_mutex instead of cpu_stat_lock
862 CPU_STATS_ADDQ(CPU, vm, rev, 1);
863 if (++count > 1) {
865 * Extremely unlikely, but it happens.
866 * We went around the loop at least once
867 * and didn't get far enough.
868 * If we are still skipping `highly shared'
869 * pages, skip fewer of them. Otherwise,
870 * give up till the next clock tick.
872 if (po_share < MAX_PO_SHARE) {
873 po_share <<= 1;
874 } else {
876 * Really a "goto loop", but
877 * if someone is TRACing, at least
878 * make records to show where we
879 * are.
881 break;
887 sample_end = gethrtime();
889 TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
890 "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
891 freemem, lotsfree, nscan, desscan, count);
893 if (pageout_sample_cnt < pageout_sample_lim) {
894 pageout_sample_pages += pcount;
895 pageout_sample_etime += sample_end - sample_start;
896 ++pageout_sample_cnt;
898 if (pageout_sample_cnt >= pageout_sample_lim &&
899 pageout_new_spread == 0) {
900 pageout_rate = (hrrate_t)pageout_sample_pages *
901 (hrrate_t)(NANOSEC) / pageout_sample_etime;
902 pageout_new_spread = pageout_rate / 10;
903 setupclock(1);
906 goto loop;
910 * Look at the page at hand. If it is locked (e.g., for physical i/o),
911 * system (u., page table) or free, then leave it alone. Otherwise,
912 * if we are running the front hand, turn off the page's reference bit.
913 * If the proc is over maxrss, we take it. If running the back hand,
914 * check whether the page has been reclaimed. If not, free the page,
915 * pushing it to disk first if necessary.
917 * Return values:
918 * -1 if the page is not a candidate at all,
919 * 0 if not freed, or
920 * 1 if we freed it.
922 static int
923 checkpage(struct page *pp, int whichhand)
925 int ppattr;
926 int isfs = 0;
927 int isexec = 0;
928 int pagesync_flag;
931 * Skip pages:
932 * - associated with the kernel vnode since
933 * they are always "exclusively" locked.
934 * - that are free
935 * - that are shared more than po_share'd times
936 * - its already locked
938 * NOTE: These optimizations assume that reads are atomic.
941 if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
942 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
943 hat_page_checkshare(pp, po_share)) {
944 return (-1);
947 if (!page_trylock(pp, SE_EXCL)) {
949 * Skip the page if we can't acquire the "exclusive" lock.
951 return (-1);
952 } else if (PP_ISFREE(pp)) {
954 * It became free between the above check and our actually
955 * locking the page. Oh, well there will be other pages.
957 page_unlock(pp);
958 return (-1);
962 * Reject pages that cannot be freed. The page_struct_lock
963 * need not be acquired to examine these
964 * fields since the page has an "exclusive" lock.
966 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
967 page_unlock(pp);
968 return (-1);
972 * Maintain statistics for what we are freeing
975 if (pp->p_vnode != NULL) {
976 if (pp->p_vnode->v_flag & VVMEXEC)
977 isexec = 1;
979 if (!IS_SWAPFSVP(pp->p_vnode))
980 isfs = 1;
984 * Turn off REF and MOD bits with the front hand.
985 * The back hand examines the REF bit and always considers
986 * SHARED pages as referenced.
988 if (whichhand == FRONT)
989 pagesync_flag = HAT_SYNC_ZERORM;
990 else
991 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
992 HAT_SYNC_STOPON_SHARED;
994 ppattr = hat_pagesync(pp, pagesync_flag);
996 recheck:
998 * If page is referenced; make unreferenced but reclaimable.
999 * If this page is not referenced, then it must be reclaimable
1000 * and we can add it to the free list.
1002 if (ppattr & P_REF) {
1003 TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
1004 "pageout_isref:pp %p whichhand %d", pp, whichhand);
1005 if (whichhand == FRONT) {
1007 * Checking of rss or madvise flags needed here...
1009 * If not "well-behaved", fall through into the code
1010 * for not referenced.
1012 hat_clrref(pp);
1015 * Somebody referenced the page since the front
1016 * hand went by, so it's not a candidate for
1017 * freeing up.
1019 page_unlock(pp);
1020 return (0);
1023 VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1026 * If large page, attempt to demote it. If successfully demoted,
1027 * retry the checkpage.
1029 if (pp->p_szc != 0) {
1030 if (!page_try_demote_pages(pp)) {
1031 VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1032 page_unlock(pp);
1033 return (-1);
1035 ASSERT(pp->p_szc == 0);
1036 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1038 * since page_try_demote_pages() could have unloaded some
1039 * mappings it makes sense to reload ppattr.
1041 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1045 * If the page is currently dirty, we have to arrange
1046 * to have it cleaned before it can be freed.
1048 * XXX - ASSERT(pp->p_vnode != NULL);
1050 if ((ppattr & P_MOD) && pp->p_vnode) {
1051 struct vnode *vp = pp->p_vnode;
1052 uoff_t offset = pp->p_offset;
1055 * XXX - Test for process being swapped out or about to exit?
1056 * [Can't get back to process(es) using the page.]
1060 * Hold the vnode before releasing the page lock to
1061 * prevent it from being freed and re-used by some
1062 * other thread.
1064 VN_HOLD(vp);
1065 page_unlock(pp);
1068 * Queue i/o request for the pageout thread.
1070 if (!queue_io_request(vp, offset)) {
1071 VN_RELE(vp);
1072 return (0);
1074 return (1);
1078 * Now we unload all the translations,
1079 * and put the page back on to the free list.
1080 * If the page was used (referenced or modified) after
1081 * the pagesync but before it was unloaded we catch it
1082 * and handle the page properly.
1084 TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
1085 "pageout_free:pp %p whichhand %d", pp, whichhand);
1086 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1087 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1088 if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
1089 goto recheck;
1091 VN_DISPOSE(pp, B_FREE, 0, kcred);
1093 CPU_STATS_ADD_K(vm, dfree, 1);
1095 if (isfs) {
1096 if (isexec) {
1097 CPU_STATS_ADD_K(vm, execfree, 1);
1098 } else {
1099 CPU_STATS_ADD_K(vm, fsfree, 1);
1101 } else {
1102 CPU_STATS_ADD_K(vm, anonfree, 1);
1105 return (1); /* freed a page! */
1109 * Queue async i/o request from pageout_scanner and segment swapout
1110 * routines on one common list. This ensures that pageout devices (swap)
1111 * are not saturated by pageout_scanner or swapout requests.
1112 * The pageout thread empties this list by initiating i/o operations.
1115 queue_io_request(vnode_t *vp, uoff_t off)
1117 struct async_reqs *arg;
1120 * If we cannot allocate an async request struct,
1121 * skip this page.
1123 mutex_enter(&push_lock);
1124 if ((arg = req_freelist) == NULL) {
1125 mutex_exit(&push_lock);
1126 return (0);
1128 req_freelist = arg->a_next; /* adjust freelist */
1129 push_list_size++;
1131 arg->a_vp = vp;
1132 arg->a_off = off;
1133 arg->a_len = PAGESIZE;
1134 arg->a_flags = B_ASYNC | B_FREE;
1135 arg->a_cred = kcred; /* always held */
1138 * Add to list of pending write requests.
1140 arg->a_next = push_list;
1141 push_list = arg;
1143 if (req_freelist == NULL) {
1145 * No free async requests left. The lock is held so we
1146 * might as well signal the pusher thread now.
1148 cv_signal(&push_cv);
1150 mutex_exit(&push_lock);
1151 return (1);
1155 * Wakeup pageout to initiate i/o if push_list is not empty.
1157 void
1158 cv_signal_pageout()
1160 if (push_list != NULL) {
1161 mutex_enter(&push_lock);
1162 cv_signal(&push_cv);
1163 mutex_exit(&push_lock);