kernel/os/vm_pageout.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38
  39 #include <sys/types.h>
  40 #include <sys/t_lock.h>
  41 #include <sys/param.h>
  42 #include <sys/buf.h>
  43 #include <sys/uio.h>
  44 #include <sys/proc.h>
  45 #include <sys/systm.h>
  46 #include <sys/mman.h>
  47 #include <sys/cred.h>
  48 #include <sys/vnode.h>
  49 #include <sys/vm.h>
  50 #include <sys/vmparam.h>
  51 #include <sys/vtrace.h>
  52 #include <sys/cmn_err.h>
  53 #include <sys/cpuvar.h>
  54 #include <sys/user.h>
  55 #include <sys/kmem.h>
  56 #include <sys/debug.h>
  57 #include <sys/callb.h>
  58 #include <sys/tnf_probe.h>
  59 #include <sys/mem_cage.h>
  60 #include <sys/time.h>
  61
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/page.h>
  66 #include <vm/pvn.h>
  67 #include <vm/seg_kmem.h>
  68
/* Forward declaration of a file-local helper. */
static int checkpage(page_t *, int);

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time
 * based on the size of the system.  If they are patched non-zero in
 * a loaded vmunix they are left alone and may thus be changed per system
 * using adb on the loaded system.
 *
 * setupclock() is the routine that fills in the zero-valued ones.
 */
pgcnt_t         slowscan = 0;	/* scan rate used when lotsfree is available */
pgcnt_t         fastscan = 0;	/* scan rate used when out of memory */

/* distance (in pages) between the front and back clock hands */
static pgcnt_t  handspreadpages = 0;
/* fastscan is capped at looppages / loopfraction */
static int      loopfraction = 2;
/* size of the clock loop; set from LOOPPAGES by setupclock() */
static pgcnt_t  looppages;
/* %CPU figures that pageout_scanner() converts to min/max_pageout_ticks */
static int      min_percent_cpu = 4;
static int      max_percent_cpu = 80;
/* upper bounds applied to the default fastscan and slowscan */
static pgcnt_t  maxfastscan = 0;
static pgcnt_t  maxslowscan = 100;

pgcnt_t maxpgio = 0;		/* how much paging I/O is acceptable */
pgcnt_t minfree = 0;		/* minimal tolerable amount of free memory */
pgcnt_t desfree = 0;		/* amount of memory desired free */
pgcnt_t lotsfree = 0;		/* threshold where the paging daemon turns on */
pgcnt_t needfree = 0;		/* added to lotsfree by schedpaging()'s checks */
pgcnt_t throttlefree = 0;	/* point at which PG_WAIT requests throttle */
pgcnt_t pageout_reserve = 0;	/* pages kept in stock for pageout's own use */

pgcnt_t deficit;	/* subtracted from freemem by schedpaging() */
pgcnt_t nscan;		/* pages examined in the current pass */
pgcnt_t desscan;	/* pages pageout should examine in its next pass */
 100
 101 /*
 102  * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 103  * are the number of ticks in each wakeup cycle that gives the
 104  * equivalent of some underlying %CPU duty cycle.
 105  * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is
 106  * awakened every 25 clock ticks.  So, converting from %CPU to ticks
  107  * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 108  * So, for example, 4% == 1 tick and 80% == 20 ticks.
 109  *
 110  * min_pageout_ticks:
 111  *     ticks/wakeup equivalent of min_percent_cpu.
 112  *
 113  * max_pageout_ticks:
 114  *     ticks/wakeup equivalent of max_percent_cpu.
 115  *
 116  * pageout_ticks:
 117  *     Number of clock ticks budgeted for each wakeup cycle.
 118  *     Computed each time around by schedpaging().
 119  *     Varies between min_pageout_ticks .. max_pageout_ticks,
 120  *     depending on memory pressure.
 121  *
 122  * pageout_lbolt:
 123  *     Timestamp of the last time pageout_scanner woke up and started
 124  *     (or resumed) scanning for not recently referenced pages.
 125  */
 126
static clock_t  min_pageout_ticks;	/* ticks/wakeup for min_percent_cpu */
static clock_t  max_pageout_ticks;	/* ticks/wakeup for max_percent_cpu */
static clock_t  pageout_ticks;		/* budget set by schedpaging() */
static clock_t  pageout_lbolt;		/* lbolt when the scanner last woke */

/*
 * Set by setupclock() when it is asked to recalculate; pageout_scanner()
 * clears it and repositions both clock hands.
 */
static uint_t   reset_hands;

/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * a mask applied to a page counter to poll periodically -- confirm at the
 * point of use.
 */
#define PAGES_POLL_MASK 1023
 135
 136 /*
 137  * pageout_sample_lim:
 138  *     The limit on the number of samples needed to establish a value
 139  *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
 140  *
 141  * pageout_sample_cnt:
 142  *     Current sample number.  Once the sample gets large enough,
 143  *     set new values for handspreadpages, fastscan and slowscan.
 144  *
 145  * pageout_sample_pages:
 146  *     The accumulated number of pages scanned during sampling.
 147  *
  148  * pageout_sample_etime:
  149  *     The accumulated elapsed time (an hrtime_t) for the sample.
 150  *
 151  * pageout_rate:
 152  *     Rate in pages/nanosecond, computed at the end of sampling.
 153  *
 154  * pageout_new_spread:
 155  *     The new value to use for fastscan and handspreadpages.
 156  *     Calculated after enough samples have been taken.
 157  */
 158
/* Type used to hold pageout_rate. */
typedef hrtime_t hrrate_t;

/* While pageout_sample_cnt < pageout_sample_lim the scanner is sampling. */
static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t  pageout_sample_pages = 0;	/* pages scanned while sampling */
static hrrate_t pageout_rate = 0;
/* When non-zero, setupclock() uses this as the default maxfastscan. */
static pgcnt_t  pageout_new_spread = 0;

/* NOTE(review): not referenced in the visible part of this file. */
static clock_t  pageout_cycle_ticks;
/* sample_start is taken from gethrtime() as the scanner begins a pass. */
static hrtime_t sample_start, sample_end;
/* NOTE(review): presumably the sum of (sample_end - sample_start) -- confirm. */
static hrtime_t pageout_sample_etime = 0;
 170
/*
 * Record number of times a pageout_scanner wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

/*
 * Counters compiled in only when VM_STATS is defined.
 * NOTE(review): presumably one slot per checkpage() outcome -- confirm
 * against checkpage(), which is outside this excerpt.
 */
#ifdef VM_STATS
static struct pageoutvmstats_str {
        ulong_t checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */
 183
/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.  schedpaging() broadcasts on memavail_cv
 * whenever kmem_avail() reports memory available.
 */
kmutex_t        memavail_lock;
kcondvar_t      memavail_cv;

/*
 * The size of the clock loop.  setupclock() copies this into looppages.
 */
#define LOOPPAGES       total_pages
 195
 196 /*
 197  * Set up the paging constants for the clock algorithm.
 198  * Called after the system is initialized and the amount of memory
 199  * and number of paging devices is known.
 200  *
 201  * lotsfree is 1/64 of memory, but at least 512K.
 202  * desfree is 1/2 of lotsfree.
 203  * minfree is 1/2 of desfree.
 204  *
 205  * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
 206  *
 207  *      lotsfree = btop(512K)
 208  *      desfree = btop(200K)
 209  *      minfree = btop(100K)
 210  *      throttlefree = INT_MIN
 211  *      max_percent_cpu = 4
 212  */
 213 void
 214 setupclock(int recalc)
 215 {
 216
 217         static spgcnt_t init_lfree, init_dfree, init_mfree;
 218         static spgcnt_t init_tfree, init_preserve, init_mpgio;
 219         static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
 220
 221         looppages = LOOPPAGES;
 222
 223         /*
 224          * setupclock can now be called to recalculate the paging
 225          * parameters in the case of dynamic addition of memory.
 226          * So to make sure we make the proper calculations, if such a
 227          * situation should arise, we save away the initial values
 228          * of each parameter so we can recall them when needed. This
 229          * way we don't lose the settings an admin might have made
 230          * through the /etc/system file.
 231          */
 232
 233         if (!recalc) {
 234                 init_lfree = lotsfree;
 235                 init_dfree = desfree;
 236                 init_mfree = minfree;
 237                 init_tfree = throttlefree;
 238                 init_preserve = pageout_reserve;
 239                 init_mpgio = maxpgio;
 240                 init_mfscan = maxfastscan;
 241                 init_fscan = fastscan;
 242                 init_sscan = slowscan;
 243                 init_hspages = handspreadpages;
 244         }
 245
 246         /*
 247          * Set up thresholds for paging:
 248          */
 249
 250         /*
 251          * Lotsfree is threshold where paging daemon turns on.
 252          */
 253         if (init_lfree == 0 || init_lfree >= looppages)
 254                 lotsfree = MAX(looppages / 64, btop(512 * 1024));
 255         else
 256                 lotsfree = init_lfree;
 257
 258         /*
 259          * Desfree is amount of memory desired free.
 260          * If less than this for extended period, start swapping.
 261          */
 262         if (init_dfree == 0 || init_dfree >= lotsfree)
 263                 desfree = lotsfree / 2;
 264         else
 265                 desfree = init_dfree;
 266
 267         /*
 268          * Minfree is minimal amount of free memory which is tolerable.
 269          */
 270         if (init_mfree == 0 || init_mfree >= desfree)
 271                 minfree = desfree / 2;
 272         else
 273                 minfree = init_mfree;
 274
 275         /*
 276          * Throttlefree is the point at which we start throttling
 277          * PG_WAIT requests until enough memory becomes available.
 278          */
 279         if (init_tfree == 0 || init_tfree >= desfree)
 280                 throttlefree = minfree;
 281         else
 282                 throttlefree = init_tfree;
 283
 284         /*
 285          * Pageout_reserve is the number of pages that we keep in
 286          * stock for pageout's own use.  Having a few such pages
 287          * provides insurance against system deadlock due to
 288          * pageout needing pages.  When freemem < pageout_reserve,
 289          * non-blocking allocations are denied to any threads
 290          * other than pageout and sched.  (At some point we might
 291          * want to consider a per-thread flag like T_PUSHING_PAGES
 292          * to indicate that a thread is part of the page-pushing
 293          * dance (e.g. an interrupt thread) and thus is entitled
 294          * to the same special dispensation we accord pageout.)
 295          */
 296         if (init_preserve == 0 || init_preserve >= throttlefree)
 297                 pageout_reserve = throttlefree / 2;
 298         else
 299                 pageout_reserve = init_preserve;
 300
 301         /*
 302          * Maxpgio thresholds how much paging is acceptable.
 303          * This figures that 2/3 busy on an arm is all that is
 304          * tolerable for paging.  We assume one operation per disk rev.
 305          *
 306          * XXX - Does not account for multiple swap devices.
 307          */
 308         if (init_mpgio == 0)
 309                 maxpgio = (DISKRPM * 2) / 3;
 310         else
 311                 maxpgio = init_mpgio;
 312
 313         /*
 314          * The clock scan rate varies between fastscan and slowscan
 315          * based on the amount of free memory available.  Fastscan
 316          * rate should be set based on the number pages that can be
 317          * scanned per sec using ~10% of processor time.  Since this
 318          * value depends on the processor, MMU, Mhz etc., it is
 319          * difficult to determine it in a generic manner for all
 320          * architectures.
 321          *
 322          * Instead of trying to determine the number of pages scanned
 323          * per sec for every processor, fastscan is set to be the smaller
 324          * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
 325          * time is limited to ~4% of processor time.
 326          *
 327          * Setting fastscan to be 1/2 of memory allows pageout to scan
 328          * all of memory in ~2 secs.  This implies that user pages not
 329          * accessed within 1 sec (assuming, handspreadpages == fastscan)
 330          * can be reclaimed when free memory is very low.  Stealing pages
 331          * not accessed within 1 sec seems reasonable and ensures that
 332          * active user processes don't thrash.
 333          *
 334          * Smaller values of fastscan result in scanning fewer pages
 335          * every second and consequently pageout may not be able to free
 336          * sufficient memory to maintain the minimum threshold.  Larger
 337          * values of fastscan result in scanning a lot more pages which
 338          * could lead to thrashing and higher CPU usage.
 339          *
 340          * Fastscan needs to be limited to a maximum value and should not
 341          * scale with memory to prevent pageout from consuming too much
 342          * time for scanning on slow CPU's and avoid thrashing, as a
 343          * result of scanning too many pages, on faster CPU's.
 344          * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
 345          * (the upper bound for fastscan) based on the average number
 346          * of pages that can potentially be scanned in ~1 sec (using ~4%
 347          * of the CPU) on some of the following machines that currently
 348          * run Solaris 2.x:
 349          *
 350          *                      average memory scanned in ~1 sec
 351          *
 352          *      25 Mhz SS1+:            23 Meg
 353          *      LX:                     37 Meg
 354          *      50 Mhz SC2000:          68 Meg
 355          *
 356          *      40 Mhz 486:             26 Meg
 357          *      66 Mhz 486:             42 Meg
 358          *
 359          * When free memory falls just below lotsfree, the scan rate
 360          * goes from 0 to slowscan (i.e., pageout starts running).  This
 361          * transition needs to be smooth and is achieved by ensuring that
 362          * pageout scans a small number of pages to satisfy the transient
 363          * memory demand.  This is set to not exceed 100 pages/sec (25 per
 364          * wakeup) since scanning that many pages has no noticible impact
 365          * on system performance.
 366          *
 367          * In addition to setting fastscan and slowscan, pageout is
 368          * limited to using ~4% of the CPU.  This results in increasing
 369          * the time taken to scan all of memory, which in turn means that
 370          * user processes have a better opportunity of preventing their
 371          * pages from being stolen.  This has a positive effect on
 372          * interactive and overall system performance when memory demand
 373          * is high.
 374          *
 375          * Thus, the rate at which pages are scanned for replacement will
 376          * vary linearly between slowscan and the number of pages that
 377          * can be scanned using ~4% of processor time instead of varying
 378          * linearly between slowscan and fastscan.
 379          *
 380          * Also, the processor time used by pageout will vary from ~1%
 381          * at slowscan to ~4% at fastscan instead of varying between
 382          * ~1% at slowscan and ~10% at fastscan.
 383          *
 384          * The values chosen for the various VM parameters (fastscan,
 385          * handspreadpages, etc) are not universally true for all machines,
 386          * but appear to be a good rule of thumb for the machines we've
 387          * tested.  They have the following ranges:
 388          *
 389          *      cpu speed:      20 to 70 Mhz
 390          *      page size:      4K to 8K
 391          *      memory size:    16M to 5G
 392          *      page scan rate: 4000 - 17400 4K pages per sec
 393          *
 394          * The values need to be re-examined for machines which don't
 395          * fall into the various ranges (e.g., slower or faster CPUs,
 396          * smaller or larger pagesizes etc) shown above.
 397          *
 398          * On an MP machine, pageout is often unable to maintain the
 399          * minimum paging thresholds under heavy load.  This is due to
 400          * the fact that user processes running on other CPU's can be
 401          * dirtying memory at a much faster pace than pageout can find
 402          * pages to free.  The memory demands could be met by enabling
 403          * more than one CPU to run the clock algorithm in such a manner
 404          * that the various clock hands don't overlap.  This also makes
 405          * it more difficult to determine the values for fastscan, slowscan
 406          * and handspreadpages.
 407          *
 408          * The swapper is currently used to free up memory when pageout
 409          * is unable to meet memory demands by swapping out processes.
 410          * In addition to freeing up memory, swapping also reduces the
 411          * demand for memory by preventing user processes from running
 412          * and thereby consuming memory.
 413          */
 414         if (init_mfscan == 0) {
 415                 if (pageout_new_spread != 0)
 416                         maxfastscan = pageout_new_spread;
 417                 else
 418                         maxfastscan = MAXHANDSPREADPAGES;
 419         } else {
 420                 maxfastscan = init_mfscan;
 421         }
 422         if (init_fscan == 0)
 423                 fastscan = MIN(looppages / loopfraction, maxfastscan);
 424         else
 425                 fastscan = init_fscan;
 426         if (fastscan > looppages / loopfraction)
 427                 fastscan = looppages / loopfraction;
 428
 429         /*
 430          * Set slow scan time to 1/10 the fast scan time, but
 431          * not to exceed maxslowscan.
 432          */
 433         if (init_sscan == 0)
 434                 slowscan = MIN(fastscan / 10, maxslowscan);
 435         else
 436                 slowscan = init_sscan;
 437         if (slowscan > fastscan / 2)
 438                 slowscan = fastscan / 2;
 439
 440         /*
 441          * Handspreadpages is distance (in pages) between front and back
 442          * pageout daemon hands.  The amount of time to reclaim a page
 443          * once pageout examines it increases with this distance and
 444          * decreases as the scan rate rises. It must be < the amount
 445          * of pageable memory.
 446          *
 447          * Since pageout is limited to ~4% of the CPU, setting handspreadpages
 448          * to be "fastscan" results in the front hand being a few secs
 449          * (varies based on the processor speed) ahead of the back hand
 450          * at fastscan rates.  This distance can be further reduced, if
 451          * necessary, by increasing the processor time used by pageout
 452          * to be more than ~4% and preferrably not more than ~10%.
 453          *
 454          * As a result, user processes have a much better chance of
 455          * referencing their pages before the back hand examines them.
 456          * This also significantly lowers the number of reclaims from
 457          * the freelist since pageout does not end up freeing pages which
 458          * may be referenced a sec later.
 459          */
 460         if (init_hspages == 0)
 461                 handspreadpages = fastscan;
 462         else
 463                 handspreadpages = init_hspages;
 464
 465         /*
 466          * Make sure that back hand follows front hand by at least
 467          * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
 468          * for the back hand to look at a page during the same wakeup of
 469          * the pageout daemon in which the front hand cleared its ref bit.
 470          */
 471         if (handspreadpages >= looppages)
 472                 handspreadpages = looppages - 1;
 473
 474         /*
 475          * If we have been called to recalculate the parameters,
 476          * set a flag to re-evaluate the clock hand pointers.
 477          */
 478         if (recalc)
 479                 reset_hands = 1;
 480 }
 481
/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan RATETOSCHEDPAGING
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging sets this value based on the amount of
 * currently available memory.
 */

#define RATETOSCHEDPAGING       4               /* hz that is */

static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 * pageout() allocates the pool as one array of async_list_size entries
 * and threads the entries onto req_freelist.
 */
static struct async_reqs *push_req;	/* base of the allocated array */
static struct async_reqs *req_freelist; /* available req structs */
static struct async_reqs *push_list;    /* pending reqs */
static kmutex_t push_lock;              /* protects req pool */
static kcondvar_t push_cv;		/* pageout() waits here for work */

static int async_list_size = 256;       /* number of async request structs */

/* Entry point of the scanner thread created by pageout(). */
static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone- don't page it out.
 *
 * schedpaging() halves po_share, but never below MIN_PO_SHARE, each time
 * it finds enough free memory.
 */
#define MIN_PO_SHARE    (8)
#define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
ulong_t po_share = MIN_PO_SHARE;
 518
 519 /*
 520  * Schedule rate for paging.
 521  * Rate is linear interpolation between
 522  * slowscan with lotsfree and fastscan when out of memory.
 523  */
 524 static void
 525 schedpaging(void *arg)
 526 {
 527         spgcnt_t vavail;
 528
 529         if (freemem < lotsfree + needfree + kmem_reapahead)
 530                 kmem_reap();
 531
 532         if (freemem < lotsfree + needfree)
 533                 seg_preap();
 534
 535         if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
 536                 kcage_cageout_wakeup();
 537
 538         if (mutex_tryenter(&pageout_mutex)) {
 539                 /* pageout() not running */
 540                 nscan = 0;
 541                 vavail = freemem - deficit;
 542                 if (pageout_new_spread != 0)
 543                         vavail -= needfree;
 544                 if (vavail < 0)
 545                         vavail = 0;
 546                 if (vavail > lotsfree)
 547                         vavail = lotsfree;
 548
 549                 /*
 550                  * Fix for 1161438 (CRS SPR# 73922).  All variables
 551                  * in the original calculation for desscan were 32 bit signed
 552                  * ints.  As freemem approaches 0x0 on a system with 1 Gig or
 553                  * more of memory, the calculation can overflow.  When this
 554                  * happens, desscan becomes negative and pageout_scanner()
 555                  * stops paging out.
 556                  */
 557                 if ((needfree) && (pageout_new_spread == 0)) {
 558                         /*
 559                          * If we've not yet collected enough samples to
 560                          * calculate a spread, use the old logic of kicking
 561                          * into high gear anytime needfree is non-zero.
 562                          */
 563                         desscan = fastscan / RATETOSCHEDPAGING;
 564                 } else {
 565                         /*
 566                          * Once we've calculated a spread based on system
 567                          * memory and usage, just treat needfree as another
 568                          * form of deficit.
 569                          */
 570                         spgcnt_t faststmp, slowstmp, result;
 571
 572                         slowstmp = slowscan * vavail;
 573                         faststmp = fastscan * (lotsfree - vavail);
 574                         result = (slowstmp + faststmp) /
 575                             nz(lotsfree) / RATETOSCHEDPAGING;
 576                         desscan = (pgcnt_t)result;
 577                 }
 578
 579                 pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
 580                     (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
 581
 582                 if (freemem < lotsfree + needfree ||
 583                     pageout_sample_cnt < pageout_sample_lim) {
 584                         TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
 585                             "pageout_cv_signal:freemem %ld", freemem);
 586                         cv_signal(&proc_pageout->p_cv);
 587                 } else {
 588                         /*
 589                          * There are enough free pages, no need to
 590                          * kick the scanner thread.  And next time
 591                          * around, keep more of the `highly shared'
 592                          * pages.
 593                          */
 594                         cv_signal_pageout();
 595                         if (po_share > MIN_PO_SHARE) {
 596                                 po_share >>= 1;
 597                         }
 598                 }
 599                 mutex_exit(&pageout_mutex);
 600         }
 601
 602         /*
 603          * Signal threads waiting for available memory.
 604          * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
 605          * in this case it is not needed - the waiters will be waken up during
 606          * the next invocation of this function.
 607          */
 608         if (kmem_avail() > 0)
 609                 cv_broadcast(&memavail_cv);
 610
 611         (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
 612 }
 613
/*
 * Successful putpage operations counted by pageout(); it zeroes the count
 * each time it wakes from waiting on push_cv.
 */
pgcnt_t         pushes;
ulong_t         push_list_size;         /* # of requests on pageout queue */

/*
 * NOTE(review): presumably identify the front and back clock hands to
 * checkpage(); their use is outside this excerpt -- confirm.
 */
#define FRONT   1
#define BACK    2

/* When zero, pageout_scanner() goes straight back to sleep on each wakeup. */
int dopageout = 1;      /* must be non-zero to turn page stealing on */
 621
 622 /*
 623  * The page out daemon, which runs as process 2.
 624  *
 625  * As long as there are at least lotsfree pages,
 626  * this process is not run.  When the number of free
 627  * pages stays in the range desfree to lotsfree,
 628  * this daemon runs through the pages in the loop
 629  * at a rate determined in schedpaging().  Pageout manages
 630  * two hands on the clock.  The front hand moves through
 631  * memory, clearing the reference bit,
 632  * and stealing pages from procs that are over maxrss.
 633  * The back hand travels a distance behind the front hand,
 634  * freeing the pages that have not been referenced in the time
 635  * since the front hand passed.  If modified, they are pushed to
 636  * swap before being freed.
 637  *
 638  * There are 2 threads that act on behalf of the pageout process.
 639  * One thread scans pages (pageout_scanner) and frees them up if
 640  * they don't require any fop_putpage operation. If a page must be
 641  * written back to its backing store, the request is put on a list
 642  * and the other (pageout) thread is signaled. The pageout thread
 643  * grabs fop_putpage requests from the list, and processes them.
 644  * Some filesystems may require resources for the fop_putpage
 645  * operations (like memory) and hence can block the pageout
 646  * thread, but the scanner thread can still operate. There is still
 647  * no guarantee that memory deadlocks cannot occur.
 648  *
 649  * For now, this thing is in very rough form.
 650  */
 651 void
 652 pageout()
 653 {
 654         struct async_reqs *arg;
 655         pri_t pageout_pri;
 656         int i;
 657         pgcnt_t max_pushes;
 658         callb_cpr_t cprinfo;
 659
 660         proc_pageout = ttoproc(curthread);
 661         proc_pageout->p_cstime = 0;
 662         proc_pageout->p_stime =  0;
 663         proc_pageout->p_cutime =  0;
 664         proc_pageout->p_utime = 0;
 665         bcopy("pageout", PTOU(curproc)->u_psargs, 8);
 666         bcopy("pageout", PTOU(curproc)->u_comm, 7);
 667
 668         /*
 669          * Create pageout scanner thread
 670          */
 671         mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
 672         mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
 673
 674         /*
 675          * Allocate and initialize the async request structures
 676          * for pageout.
 677          */
 678         push_req = (struct async_reqs *)
 679             kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
 680
 681         req_freelist = push_req;
 682         for (i = 0; i < async_list_size - 1; i++)
 683                 push_req[i].a_next = &push_req[i + 1];
 684
 685         pageout_pri = curthread->t_pri;
 686
 687         /* Create the pageout scanner thread. */
 688         (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
 689             pageout_pri - 1);
 690
 691         /*
 692          * kick off pageout scheduler.
 693          */
 694         schedpaging(NULL);
 695
 696         /*
 697          * Create kernel cage thread.
 698          * The kernel cage thread is started under the pageout process
 699          * to take advantage of the less restricted page allocation
 700          * in page_create_throttle().
 701          */
 702         kcage_cageout_init();
 703
 704         /*
 705          * Limit pushes to avoid saturating pageout devices.
 706          */
 707         max_pushes = maxpgio / RATETOSCHEDPAGING;
 708         CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
 709
 710         for (;;) {
 711                 mutex_enter(&push_lock);
 712
 713                 while ((arg = push_list) == NULL || pushes > max_pushes) {
 714                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 715                         cv_wait(&push_cv, &push_lock);
 716                         pushes = 0;
 717                         CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
 718                 }
 719                 push_list = arg->a_next;
 720                 arg->a_next = NULL;
 721                 mutex_exit(&push_lock);
 722
 723                 if (fop_putpage(arg->a_vp, (offset_t)arg->a_off,
 724                     arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
 725                         pushes++;
 726                 }
 727
 728                 /* vp held by checkpage() */
 729                 VN_RELE(arg->a_vp);
 730
 731                 mutex_enter(&push_lock);
 732                 arg->a_next = req_freelist;     /* back on freelist */
 733                 req_freelist = arg;
 734                 push_list_size--;
 735                 mutex_exit(&push_lock);
 736         }
 737 }
 738
 739 /*
 740  * Kernel thread that scans pages looking for ones to free
 741  */
 742 static void
 743 pageout_scanner(void)
 744 {
 745         struct page *fronthand, *backhand;
 746         uint_t count;
 747         callb_cpr_t cprinfo;
 748         pgcnt_t nscan_limit;
 749         pgcnt_t pcount;
 750
 751         CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
 752         mutex_enter(&pageout_mutex);
 753
 754         /*
 755          * The restart case does not attempt to point the hands at roughly
 756          * the right point on the assumption that after one circuit things
 757          * will have settled down - and restarts shouldn't be that often.
 758          */
 759
 760         /*
 761          * Set the two clock hands to be separated by a reasonable amount,
 762          * but no more than 360 degrees apart.
 763          */
 764         backhand = page_first();
 765         if (handspreadpages >= total_pages)
 766                 fronthand = page_nextn(backhand, total_pages - 1);
 767         else
 768                 fronthand = page_nextn(backhand, handspreadpages);
 769
 770         min_pageout_ticks = MAX(1,
 771             ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
 772         max_pageout_ticks = MAX(min_pageout_ticks,
 773             ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
 774
 775 loop:
 776         cv_signal_pageout();
 777
 778         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 779         cv_wait(&proc_pageout->p_cv, &pageout_mutex);
 780         CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
 781
 782         if (!dopageout)
 783                 goto loop;
 784
 785         if (reset_hands) {
 786                 reset_hands = 0;
 787
 788                 backhand = page_first();
 789                 if (handspreadpages >= total_pages)
 790                         fronthand = page_nextn(backhand, total_pages - 1);
 791                 else
 792                         fronthand = page_nextn(backhand, handspreadpages);
 793         }
 794
 795         CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
 796         count = 0;
 797
 798         TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
 799             "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
 800             freemem, lotsfree, nscan, desscan);
 801
 802         pcount = 0;
 803         if (pageout_sample_cnt < pageout_sample_lim) {
 804                 nscan_limit = total_pages;
 805         } else {
 806                 nscan_limit = desscan;
 807         }
 808         pageout_lbolt = ddi_get_lbolt();
 809         sample_start = gethrtime();
 810
 811         /*
 812          * Scan the appropriate number of pages for a single duty cycle.
 813          * However, stop scanning as soon as there is enough free memory.
 814          * For a short while, we will be sampling the performance of the
 815          * scanner and need to keep running just to get sample data, in
 816          * which case we keep going and don't pay attention to whether
 817          * or not there is enough free memory.
 818          */
 819
 820         while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
 821             pageout_sample_cnt < pageout_sample_lim)) {
 822                 int rvfront, rvback;
 823
 824                 /*
 825                  * Check to see if we have exceeded our %CPU budget
 826                  * for this wakeup, but not on every single page visited,
 827                  * just every once in a while.
 828                  */
 829                 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
 830                         pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
 831                         if (pageout_cycle_ticks >= pageout_ticks) {
 832                                 ++pageout_timeouts;
 833                                 break;
 834                         }
 835                 }
 836
 837                 /*
 838                  * If checkpage manages to add a page to the free list,
 839                  * we give ourselves another couple of trips around the loop.
 840                  */
 841                 if ((rvfront = checkpage(fronthand, FRONT)) == 1)
 842                         count = 0;
 843                 if ((rvback = checkpage(backhand, BACK)) == 1)
 844                         count = 0;
 845
 846                 ++pcount;
 847
 848                 /*
 849                  * protected by pageout_mutex instead of cpu_stat_lock
 850                  */
 851                 CPU_STATS_ADDQ(CPU, vm, scan, 1);
 852
 853                 /*
 854                  * Don't include ineligible pages in the number scanned.
 855                  */
 856                 if (rvfront != -1 || rvback != -1)
 857                         nscan++;
 858
 859                 backhand = page_next(backhand);
 860
 861                 /*
 862                  * backhand update and wraparound check are done separately
 863                  * because lint barks when it finds an empty "if" body
 864                  */
 865
 866                 if ((fronthand = page_next(fronthand)) == page_first()) {
 867                         TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
 868                             "pageout_hand_wrap:freemem %ld whichhand %d",
 869                             freemem, FRONT);
 870
 871                         /*
 872                          * protected by pageout_mutex instead of cpu_stat_lock
 873                          */
 874                         CPU_STATS_ADDQ(CPU, vm, rev, 1);
 875                         if (++count > 1) {
 876                                 /*
 877                                  * Extremely unlikely, but it happens.
 878                                  * We went around the loop at least once
 879                                  * and didn't get far enough.
 880                                  * If we are still skipping `highly shared'
 881                                  * pages, skip fewer of them.  Otherwise,
 882                                  * give up till the next clock tick.
 883                                  */
 884                                 if (po_share < MAX_PO_SHARE) {
 885                                         po_share <<= 1;
 886                                 } else {
 887                                         /*
 888                                          * Really a "goto loop", but
 889                                          * if someone is TRACing,  at least
 890                                          * make records to show where we
 891                                          * are.
 892                                          */
 893                                         break;
 894                                 }
 895                         }
 896                 }
 897         }
 898
 899         sample_end = gethrtime();
 900
 901         TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
 902             "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
 903             freemem, lotsfree, nscan, desscan, count);
 904
 905         if (pageout_sample_cnt < pageout_sample_lim) {
 906                 pageout_sample_pages += pcount;
 907                 pageout_sample_etime += sample_end - sample_start;
 908                 ++pageout_sample_cnt;
 909         }
 910         if (pageout_sample_cnt >= pageout_sample_lim &&
 911             pageout_new_spread == 0) {
 912                 pageout_rate = (hrrate_t)pageout_sample_pages *
 913                     (hrrate_t)(NANOSEC) / pageout_sample_etime;
 914                 pageout_new_spread = pageout_rate / 10;
 915                 setupclock(1);
 916         }
 917
 918         goto loop;
 919 }
 920
 921 /*
 922  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 923  * system (u., page table) or free, then leave it alone.  Otherwise,
 924  * if we are running the front hand, turn off the page's reference bit.
 925  * If the proc is over maxrss, we take it.  If running the back hand,
 926  * check whether the page has been reclaimed.  If not, free the page,
 927  * pushing it to disk first if necessary.
 928  *
 929  * Return values:
 930  *      -1 if the page is not a candidate at all,
 931  *       0 if not freed, or
 932  *       1 if we freed it.
 933  */
 934 static int
 935 checkpage(struct page *pp, int whichhand)
 936 {
 937         int ppattr;
 938         int isfs = 0;
 939         int isexec = 0;
 940         int pagesync_flag;
 941
 942         /*
 943          * Skip pages:
 944          *      - associated with the kernel vnode since
 945          *          they are always "exclusively" locked.
 946          *      - that are free
 947          *      - that are shared more than po_share'd times
 948          *      - its already locked
 949          *
 950          * NOTE:  These optimizations assume that reads are atomic.
 951          */
 952
 953         if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
 954             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
 955             hat_page_checkshare(pp, po_share)) {
 956                 return (-1);
 957         }
 958
 959         if (!page_trylock(pp, SE_EXCL)) {
 960                 /*
 961                  * Skip the page if we can't acquire the "exclusive" lock.
 962                  */
 963                 return (-1);
 964         } else if (PP_ISFREE(pp)) {
 965                 /*
 966                  * It became free between the above check and our actually
 967                  * locking the page.  Oh, well there will be other pages.
 968                  */
 969                 page_unlock(pp);
 970                 return (-1);
 971         }
 972
 973         /*
 974          * Reject pages that cannot be freed. The page_struct_lock
 975          * need not be acquired to examine these
 976          * fields since the page has an "exclusive" lock.
 977          */
 978         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
 979                 page_unlock(pp);
 980                 return (-1);
 981         }
 982
 983         /*
 984          * Maintain statistics for what we are freeing
 985          */
 986
 987         if (pp->p_vnode != NULL) {
 988                 if (pp->p_vnode->v_flag & VVMEXEC)
 989                         isexec = 1;
 990
 991                 if (!IS_SWAPFSVP(pp->p_vnode))
 992                         isfs = 1;
 993         }
 994
 995         /*
 996          * Turn off REF and MOD bits with the front hand.
 997          * The back hand examines the REF bit and always considers
 998          * SHARED pages as referenced.
 999          */
1000         if (whichhand == FRONT)
1001                 pagesync_flag = HAT_SYNC_ZERORM;
1002         else
1003                 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1004                     HAT_SYNC_STOPON_SHARED;
1005
1006         ppattr = hat_pagesync(pp, pagesync_flag);
1007
1008 recheck:
1009         /*
1010          * If page is referenced; make unreferenced but reclaimable.
1011          * If this page is not referenced, then it must be reclaimable
1012          * and we can add it to the free list.
1013          */
1014         if (ppattr & P_REF) {
1015                 TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
1016                     "pageout_isref:pp %p whichhand %d", pp, whichhand);
1017                 if (whichhand == FRONT) {
1018                         /*
1019                          * Checking of rss or madvise flags needed here...
1020                          *
1021                          * If not "well-behaved", fall through into the code
1022                          * for not referenced.
1023                          */
1024                         hat_clrref(pp);
1025                 }
1026                 /*
1027                  * Somebody referenced the page since the front
1028                  * hand went by, so it's not a candidate for
1029                  * freeing up.
1030                  */
1031                 page_unlock(pp);
1032                 return (0);
1033         }
1034
1035         VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1036
1037         /*
1038          * If large page, attempt to demote it. If successfully demoted,
1039          * retry the checkpage.
1040          */
1041         if (pp->p_szc != 0) {
1042                 if (!page_try_demote_pages(pp)) {
1043                         VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1044                         page_unlock(pp);
1045                         return (-1);
1046                 }
1047                 ASSERT(pp->p_szc == 0);
1048                 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1049                 /*
1050                  * since page_try_demote_pages() could have unloaded some
1051                  * mappings it makes sense to reload ppattr.
1052                  */
1053                 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1054         }
1055
1056         /*
1057          * If the page is currently dirty, we have to arrange
1058          * to have it cleaned before it can be freed.
1059          *
1060          * XXX - ASSERT(pp->p_vnode != NULL);
1061          */
1062         if ((ppattr & P_MOD) && pp->p_vnode) {
1063                 struct vnode *vp = pp->p_vnode;
1064                 uoff_t offset = pp->p_offset;
1065
1066                 /*
1067                  * XXX - Test for process being swapped out or about to exit?
1068                  * [Can't get back to process(es) using the page.]
1069                  */
1070
1071                 /*
1072                  * Hold the vnode before releasing the page lock to
1073                  * prevent it from being freed and re-used by some
1074                  * other thread.
1075                  */
1076                 VN_HOLD(vp);
1077                 page_unlock(pp);
1078
1079                 /*
1080                  * Queue i/o request for the pageout thread.
1081                  */
1082                 if (!queue_io_request(vp, offset)) {
1083                         VN_RELE(vp);
1084                         return (0);
1085                 }
1086                 return (1);
1087         }
1088
1089         /*
1090          * Now we unload all the translations,
1091          * and put the page back on to the free list.
1092          * If the page was used (referenced or modified) after
1093          * the pagesync but before it was unloaded we catch it
1094          * and handle the page properly.
1095          */
1096         TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
1097             "pageout_free:pp %p whichhand %d", pp, whichhand);
1098         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1099         ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1100         if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
1101                 goto recheck;
1102
1103         VN_DISPOSE(pp, B_FREE, 0, kcred);
1104
1105         CPU_STATS_ADD_K(vm, dfree, 1);
1106
1107         if (isfs) {
1108                 if (isexec) {
1109                         CPU_STATS_ADD_K(vm, execfree, 1);
1110                 } else {
1111                         CPU_STATS_ADD_K(vm, fsfree, 1);
1112                 }
1113         } else {
1114                 CPU_STATS_ADD_K(vm, anonfree, 1);
1115         }
1116
1117         return (1);             /* freed a page! */
1118 }
1119
1120 /*
1121  * Queue async i/o request from pageout_scanner and segment swapout
1122  * routines on one common list.  This ensures that pageout devices (swap)
1123  * are not saturated by pageout_scanner or swapout requests.
1124  * The pageout thread empties this list by initiating i/o operations.
1125  */
1126 int
1127 queue_io_request(vnode_t *vp, uoff_t off)
1128 {
1129         struct async_reqs *arg;
1130
1131         /*
1132          * If we cannot allocate an async request struct,
1133          * skip this page.
1134          */
1135         mutex_enter(&push_lock);
1136         if ((arg = req_freelist) == NULL) {
1137                 mutex_exit(&push_lock);
1138                 return (0);
1139         }
1140         req_freelist = arg->a_next;             /* adjust freelist */
1141         push_list_size++;
1142
1143         arg->a_vp = vp;
1144         arg->a_off = off;
1145         arg->a_len = PAGESIZE;
1146         arg->a_flags = B_ASYNC | B_FREE;
1147         arg->a_cred = kcred;            /* always held */
1148
1149         /*
1150          * Add to list of pending write requests.
1151          */
1152         arg->a_next = push_list;
1153         push_list = arg;
1154
1155         if (req_freelist == NULL) {
1156                 /*
1157                  * No free async requests left. The lock is held so we
1158                  * might as well signal the pusher thread now.
1159                  */
1160                 cv_signal(&push_cv);
1161         }
1162         mutex_exit(&push_lock);
1163         return (1);
1164 }
1165
1166 /*
1167  * Wakeup pageout to initiate i/o if push_list is not empty.
1168  */
1169 void
1170 cv_signal_pageout()
1171 {
1172         if (push_list != NULL) {
1173                 mutex_enter(&push_lock);
1174                 cv_signal(&push_cv);
1175                 mutex_exit(&push_lock);
1176         }
1177 }