kernel/os/vm_pageout.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38
  39 #include <sys/types.h>
  40 #include <sys/t_lock.h>
  41 #include <sys/param.h>
  42 #include <sys/buf.h>
  43 #include <sys/uio.h>
  44 #include <sys/proc.h>
  45 #include <sys/systm.h>
  46 #include <sys/mman.h>
  47 #include <sys/cred.h>
  48 #include <sys/vnode.h>
  49 #include <sys/vm.h>
  50 #include <sys/vmparam.h>
  51 #include <sys/vtrace.h>
  52 #include <sys/cmn_err.h>
  53 #include <sys/cpuvar.h>
  54 #include <sys/user.h>
  55 #include <sys/kmem.h>
  56 #include <sys/debug.h>
  57 #include <sys/callb.h>
  58 #include <sys/tnf_probe.h>
  59 #include <sys/time.h>
  60
  61 #include <vm/hat.h>
  62 #include <vm/as.h>
  63 #include <vm/seg.h>
  64 #include <vm/page.h>
  65 #include <vm/pvn.h>
  66 #include <vm/seg_kmem.h>
  67
   68 static int checkpage(page_t *, int);
   69
   70 /*
   71  * The following parameters control operation of the page replacement
   72  * algorithm.  They are initialized to 0, and then computed at boot time
   73  * based on the size of the system.  If they are patched non-zero in
   74  * a loaded vmunix they are left alone and may thus be changed per system
   75  * using adb on the loaded system.
       *
       * setupclock() is the routine that fills in the computed values; the
       * trailing comments below summarize how each variable is used there
       * and in schedpaging().
   76  */
   77 pgcnt_t         slowscan = 0;           /* scan rate at lotsfree */
   78 pgcnt_t         fastscan = 0;           /* scan rate when out of memory */
   79
   80 static pgcnt_t  handspreadpages = 0;    /* pages between the two hands */
   81 static int      loopfraction = 2;       /* fastscan <= looppages / this */
   82 static pgcnt_t  looppages;              /* set from LOOPPAGES */
   83 static int      min_percent_cpu = 4;    /* basis for min_pageout_ticks */
   84 static int      max_percent_cpu = 80;   /* basis for max_pageout_ticks */
   85 static pgcnt_t  maxfastscan = 0;        /* cap on the default fastscan */
   86 static pgcnt_t  maxslowscan = 100;      /* cap on the default slowscan */
   87
   88 pgcnt_t maxpgio = 0;            /* how much paging is acceptable */
   89 pgcnt_t minfree = 0;            /* minimal tolerable free memory */
   90 pgcnt_t desfree = 0;            /* amount of memory desired free */
   91 pgcnt_t lotsfree = 0;           /* threshold where pageout turns on */
   92 pgcnt_t needfree = 0;           /* added to lotsfree in freemem tests */
   93 pgcnt_t throttlefree = 0;       /* start throttling PG_WAIT requests */
   94 pgcnt_t pageout_reserve = 0;    /* pages kept for pageout's own use */
   95
   96 pgcnt_t deficit;        /* subtracted from freemem in schedpaging() */
   97 pgcnt_t nscan;          /* pages examined in the current pass */
   98 pgcnt_t desscan;        /* pages to examine in the next pass */
  99
 100 /*
 101  * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 102  * are the number of ticks in each wakeup cycle that gives the
 103  * equivalent of some underlying %CPU duty cycle.
  104  * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
  105  * awakened every 25 clock ticks.  So, converting from %CPU to ticks
  106  * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 107  * So, for example, 4% == 1 tick and 80% == 20 ticks.
 108  *
 109  * min_pageout_ticks:
 110  *     ticks/wakeup equivalent of min_percent_cpu.
 111  *
 112  * max_pageout_ticks:
 113  *     ticks/wakeup equivalent of max_percent_cpu.
 114  *
 115  * pageout_ticks:
 116  *     Number of clock ticks budgeted for each wakeup cycle.
 117  *     Computed each time around by schedpaging().
 118  *     Varies between min_pageout_ticks .. max_pageout_ticks,
 119  *     depending on memory pressure.
 120  *
 121  * pageout_lbolt:
 122  *     Timestamp of the last time pageout_scanner woke up and started
 123  *     (or resumed) scanning for not recently referenced pages.
 124  */
 125
  126 static clock_t  min_pageout_ticks;
  127 static clock_t  max_pageout_ticks;
  128 static clock_t  pageout_ticks;
  129 static clock_t  pageout_lbolt;
  130
      /*
       * Set to 1 by setupclock() when it is called with a non-zero recalc
       * argument; pageout_scanner() clears it and repositions both clock
       * hands the next time it wakes up.
       */
  131 static uint_t   reset_hands;
  132
      /*
       * NOTE(review): no use of PAGES_POLL_MASK is visible in this part of
       * the file; confirm its meaning against the scanner loop.
       */
  133 #define PAGES_POLL_MASK 1023
 134
 135 /*
 136  * pageout_sample_lim:
 137  *     The limit on the number of samples needed to establish a value
 138  *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
 139  *
 140  * pageout_sample_cnt:
 141  *     Current sample number.  Once the sample gets large enough,
 142  *     set new values for handspreadpages, fastscan and slowscan.
 143  *
 144  * pageout_sample_pages:
 145  *     The accumulated number of pages scanned during sampling.
 146  *
  147  * pageout_sample_etime:
  148  *     The accumulated elapsed time (an hrtime_t) for the sample.
 149  *
 150  * pageout_rate:
 151  *     Rate in pages/nanosecond, computed at the end of sampling.
 152  *
 153  * pageout_new_spread:
 154  *     The new value to use for fastscan and handspreadpages.
 155  *     Calculated after enough samples have been taken.
 156  */
 157
  158 typedef hrtime_t hrrate_t;      /* type of pageout_rate */
  159
  160 static uint64_t pageout_sample_lim = 4;
  161 static uint64_t pageout_sample_cnt = 0;
  162 static pgcnt_t  pageout_sample_pages = 0;
  163 static hrrate_t pageout_rate = 0;
  164 static pgcnt_t  pageout_new_spread = 0;
  165
      /*
       * sample_start is taken with gethrtime() each time pageout_scanner()
       * begins a scan.  NOTE(review): the uses of pageout_cycle_ticks,
       * sample_end and pageout_sample_etime are not visible in this part of
       * the file; confirm against the scanner loop.
       */
  166 static clock_t  pageout_cycle_ticks;
  167 static hrtime_t sample_start, sample_end;
  168 static hrtime_t pageout_sample_etime = 0;
 169
 170 /*
 171  * Record number of times a pageout_scanner wakeup cycle finished because it
 172  * timed out (exceeded its CPU budget), rather than because it visited
 173  * its budgeted number of pages.
 174  */
  175 uint64_t pageout_timeouts = 0;
  176
      /*
       * Optional statistics, compiled in only when VM_STATS is defined.
       * NOTE(review): presumably bumped by checkpage(); the updates are not
       * visible in this part of the file.
       */
  177 #ifdef VM_STATS
  178 static struct pageoutvmstats_str {
  179         ulong_t checkpage[3];
  180 } pageoutvmstats;
  181 #endif /* VM_STATS */
  182
  183 /*
  184  * Threads waiting for free memory use this condition variable and lock until
  185  * memory becomes available.
       *
       * schedpaging() broadcasts on memavail_cv, without taking memavail_lock,
       * each time it runs and kmem_avail() is greater than zero.
  186  */
  187 kmutex_t        memavail_lock;
  188 kcondvar_t      memavail_cv;
  189
  190 /*
  191  * The size of the clock loop.
       * setupclock() copies this value into looppages on every call.
  192  */
  193 #define LOOPPAGES       total_pages
 194
  195 /*
  196  * Set up the paging constants for the clock algorithm.
  197  * Called after the system is initialized and the amount of memory
  198  * and number of paging devices is known.
  199  *
       * If recalc is zero, the current (possibly administrator-patched)
       * values of the tunables are first saved as the initial settings.
       * If recalc is non-zero, the parameters are recomputed from those
       * previously saved settings and reset_hands is set so that the clock
       * hand pointers are re-evaluated.
       *
       * A saved setting of zero means "compute a default"; a non-zero saved
       * setting is used as-is unless it falls outside the limit checked for
       * that parameter below.
       *
  200  * lotsfree is 1/64 of memory, but at least 512K.
  201  * desfree is 1/2 of lotsfree.
  202  * minfree is 1/2 of desfree.
  203  *
  204  * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
  205  *
  206  *      lotsfree = btop(512K)
  207  *      desfree = btop(200K)
  208  *      minfree = btop(100K)
  209  *      throttlefree = INT_MIN
  210  *      max_percent_cpu = 4
  211  */
  212 void
  213 setupclock(int recalc)
  214 {
  215
  216         static spgcnt_t init_lfree, init_dfree, init_mfree;
  217         static spgcnt_t init_tfree, init_preserve, init_mpgio;
  218         static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
  219
  220         looppages = LOOPPAGES;
  221
  222         /*
  223          * setupclock can now be called to recalculate the paging
  224          * parameters in the case of dynamic addition of memory.
  225          * So to make sure we make the proper calculations, if such a
  226          * situation should arise, we save away the initial values
  227          * of each parameter so we can recall them when needed. This
  228          * way we don't lose the settings an admin might have made
  229          * through the /etc/system file.
  230          */
  231
  232         if (!recalc) {
  233                 init_lfree = lotsfree;
  234                 init_dfree = desfree;
  235                 init_mfree = minfree;
  236                 init_tfree = throttlefree;
  237                 init_preserve = pageout_reserve;
  238                 init_mpgio = maxpgio;
  239                 init_mfscan = maxfastscan;
  240                 init_fscan = fastscan;
  241                 init_sscan = slowscan;
  242                 init_hspages = handspreadpages;
  243         }
  244
  245         /*
  246          * Set up thresholds for paging:
  247          */
  248
  249         /*
  250          * Lotsfree is threshold where paging daemon turns on.
  251          */
  252         if (init_lfree == 0 || init_lfree >= looppages)
  253                 lotsfree = MAX(looppages / 64, btop(512 * 1024));
  254         else
  255                 lotsfree = init_lfree;
  256
  257         /*
  258          * Desfree is amount of memory desired free.
  259          * If less than this for extended period, start swapping.
  260          */
  261         if (init_dfree == 0 || init_dfree >= lotsfree)
  262                 desfree = lotsfree / 2;
  263         else
  264                 desfree = init_dfree;
  265
  266         /*
  267          * Minfree is minimal amount of free memory which is tolerable.
  268          */
  269         if (init_mfree == 0 || init_mfree >= desfree)
  270                 minfree = desfree / 2;
  271         else
  272                 minfree = init_mfree;
  273
  274         /*
  275          * Throttlefree is the point at which we start throttling
  276          * PG_WAIT requests until enough memory becomes available.
  277          */
  278         if (init_tfree == 0 || init_tfree >= desfree)
  279                 throttlefree = minfree;
  280         else
  281                 throttlefree = init_tfree;
  282
  283         /*
  284          * Pageout_reserve is the number of pages that we keep in
  285          * stock for pageout's own use.  Having a few such pages
  286          * provides insurance against system deadlock due to
  287          * pageout needing pages.  When freemem < pageout_reserve,
  288          * non-blocking allocations are denied to any threads
  289          * other than pageout and sched.  (At some point we might
  290          * want to consider a per-thread flag like T_PUSHING_PAGES
  291          * to indicate that a thread is part of the page-pushing
  292          * dance (e.g. an interrupt thread) and thus is entitled
  293          * to the same special dispensation we accord pageout.)
  294          */
  295         if (init_preserve == 0 || init_preserve >= throttlefree)
  296                 pageout_reserve = throttlefree / 2;
  297         else
  298                 pageout_reserve = init_preserve;
  299
  300         /*
  301          * Maxpgio thresholds how much paging is acceptable.
  302          * This figures that 2/3 busy on an arm is all that is
  303          * tolerable for paging.  We assume one operation per disk rev.
  304          *
  305          * XXX - Does not account for multiple swap devices.
  306          */
  307         if (init_mpgio == 0)
  308                 maxpgio = (DISKRPM * 2) / 3;
  309         else
  310                 maxpgio = init_mpgio;
  311
  312         /*
  313          * The clock scan rate varies between fastscan and slowscan
  314          * based on the amount of free memory available.  Fastscan
  315          * rate should be set based on the number pages that can be
  316          * scanned per sec using ~10% of processor time.  Since this
  317          * value depends on the processor, MMU, Mhz etc., it is
  318          * difficult to determine it in a generic manner for all
  319          * architectures.
  320          *
  321          * Instead of trying to determine the number of pages scanned
  322          * per sec for every processor, fastscan is set to be the smaller
  323          * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
  324          * time is limited to ~4% of processor time.
  325          *
  326          * Setting fastscan to be 1/2 of memory allows pageout to scan
  327          * all of memory in ~2 secs.  This implies that user pages not
  328          * accessed within 1 sec (assuming, handspreadpages == fastscan)
  329          * can be reclaimed when free memory is very low.  Stealing pages
  330          * not accessed within 1 sec seems reasonable and ensures that
  331          * active user processes don't thrash.
  332          *
  333          * Smaller values of fastscan result in scanning fewer pages
  334          * every second and consequently pageout may not be able to free
  335          * sufficient memory to maintain the minimum threshold.  Larger
  336          * values of fastscan result in scanning a lot more pages which
  337          * could lead to thrashing and higher CPU usage.
  338          *
  339          * Fastscan needs to be limited to a maximum value and should not
  340          * scale with memory to prevent pageout from consuming too much
  341          * time for scanning on slow CPU's and avoid thrashing, as a
  342          * result of scanning too many pages, on faster CPU's.
  343          * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
  344          * (the upper bound for fastscan) based on the average number
  345          * of pages that can potentially be scanned in ~1 sec (using ~4%
  346          * of the CPU) on some of the following machines that currently
  347          * run Solaris 2.x:
  348          *
  349          *                      average memory scanned in ~1 sec
  350          *
  351          *      25 Mhz SS1+:            23 Meg
  352          *      LX:                     37 Meg
  353          *      50 Mhz SC2000:          68 Meg
  354          *
  355          *      40 Mhz 486:             26 Meg
  356          *      66 Mhz 486:             42 Meg
  357          *
  358          * When free memory falls just below lotsfree, the scan rate
  359          * goes from 0 to slowscan (i.e., pageout starts running).  This
  360          * transition needs to be smooth and is achieved by ensuring that
  361          * pageout scans a small number of pages to satisfy the transient
  362          * memory demand.  This is set to not exceed 100 pages/sec (25 per
  363          * wakeup) since scanning that many pages has no noticeable impact
  364          * on system performance.
  365          *
  366          * In addition to setting fastscan and slowscan, pageout is
  367          * limited to using ~4% of the CPU.  This results in increasing
  368          * the time taken to scan all of memory, which in turn means that
  369          * user processes have a better opportunity of preventing their
  370          * pages from being stolen.  This has a positive effect on
  371          * interactive and overall system performance when memory demand
  372          * is high.
  373          *
  374          * Thus, the rate at which pages are scanned for replacement will
  375          * vary linearly between slowscan and the number of pages that
  376          * can be scanned using ~4% of processor time instead of varying
  377          * linearly between slowscan and fastscan.
  378          *
  379          * Also, the processor time used by pageout will vary from ~1%
  380          * at slowscan to ~4% at fastscan instead of varying between
  381          * ~1% at slowscan and ~10% at fastscan.
  382          *
  383          * The values chosen for the various VM parameters (fastscan,
  384          * handspreadpages, etc) are not universally true for all machines,
  385          * but appear to be a good rule of thumb for the machines we've
  386          * tested.  They have the following ranges:
  387          *
  388          *      cpu speed:      20 to 70 Mhz
  389          *      page size:      4K to 8K
  390          *      memory size:    16M to 5G
  391          *      page scan rate: 4000 - 17400 4K pages per sec
  392          *
  393          * The values need to be re-examined for machines which don't
  394          * fall into the various ranges (e.g., slower or faster CPUs,
  395          * smaller or larger pagesizes etc) shown above.
  396          *
  397          * On an MP machine, pageout is often unable to maintain the
  398          * minimum paging thresholds under heavy load.  This is due to
  399          * the fact that user processes running on other CPU's can be
  400          * dirtying memory at a much faster pace than pageout can find
  401          * pages to free.  The memory demands could be met by enabling
  402          * more than one CPU to run the clock algorithm in such a manner
  403          * that the various clock hands don't overlap.  This also makes
  404          * it more difficult to determine the values for fastscan, slowscan
  405          * and handspreadpages.
  406          *
  407          * The swapper is currently used to free up memory when pageout
  408          * is unable to meet memory demands by swapping out processes.
  409          * In addition to freeing up memory, swapping also reduces the
  410          * demand for memory by preventing user processes from running
  411          * and thereby consuming memory.
  412          */
  413         if (init_mfscan == 0) {
  414                 if (pageout_new_spread != 0)
  415                         maxfastscan = pageout_new_spread;
  416                 else
  417                         maxfastscan = MAXHANDSPREADPAGES;
  418         } else {
  419                 maxfastscan = init_mfscan;
  420         }
  421         if (init_fscan == 0)
  422                 fastscan = MIN(looppages / loopfraction, maxfastscan);
  423         else
  424                 fastscan = init_fscan;
              /* Even a preset fastscan is capped at looppages / loopfraction. */
  425         if (fastscan > looppages / loopfraction)
  426                 fastscan = looppages / loopfraction;
  427
  428         /*
  429          * Set slow scan time to 1/10 the fast scan time, but
  430          * not to exceed maxslowscan.
  431          */
  432         if (init_sscan == 0)
  433                 slowscan = MIN(fastscan / 10, maxslowscan);
  434         else
  435                 slowscan = init_sscan;
              /* Whatever its origin, slowscan never exceeds fastscan / 2. */
  436         if (slowscan > fastscan / 2)
  437                 slowscan = fastscan / 2;
  438
  439         /*
  440          * Handspreadpages is distance (in pages) between front and back
  441          * pageout daemon hands.  The amount of time to reclaim a page
  442          * once pageout examines it increases with this distance and
  443          * decreases as the scan rate rises. It must be < the amount
  444          * of pageable memory.
  445          *
  446          * Since pageout is limited to ~4% of the CPU, setting handspreadpages
  447          * to be "fastscan" results in the front hand being a few secs
  448          * (varies based on the processor speed) ahead of the back hand
  449          * at fastscan rates.  This distance can be further reduced, if
  450          * necessary, by increasing the processor time used by pageout
  451          * to be more than ~4% and preferably not more than ~10%.
  452          *
  453          * As a result, user processes have a much better chance of
  454          * referencing their pages before the back hand examines them.
  455          * This also significantly lowers the number of reclaims from
  456          * the freelist since pageout does not end up freeing pages which
  457          * may be referenced a sec later.
  458          */
  459         if (init_hspages == 0)
  460                 handspreadpages = fastscan;
  461         else
  462                 handspreadpages = init_hspages;
  463
  464         /*
  465          * Make sure that back hand follows front hand by at least
  466          * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
  467          * for the back hand to look at a page during the same wakeup of
  468          * the pageout daemon in which the front hand cleared its ref bit.
  469          */
  470         if (handspreadpages >= looppages)
  471                 handspreadpages = looppages - 1;
  472
  473         /*
  474          * If we have been called to recalculate the parameters,
  475          * set a flag to re-evaluate the clock hand pointers.
  476          */
  477         if (recalc)
  478                 reset_hands = 1;
  479 }
 480
 481 /*
 482  * Pageout scheduling.
 483  *
 484  * Schedpaging controls the rate at which the page out daemon runs by
 485  * setting the global variables nscan and desscan RATETOSCHEDPAGING
 486  * times a second.  Nscan records the number of pages pageout has examined
 487  * in its current pass; schedpaging resets this value to zero each time
 488  * it runs.  Desscan records the number of pages pageout should examine
 489  * in its next pass; schedpaging sets this value based on the amount of
 490  * currently available memory.
 491  */
 492
  493 #define RATETOSCHEDPAGING       4       /* schedpaging() runs per second */
  494
  495 static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */
  496
  497 /*
  498  * Pool of available async pageout putpage requests.
       * pageout() allocates the pool as one array of async_list_size entries
       * and links them all onto req_freelist before it starts servicing
       * push_list.
  499  */
  500 static struct async_reqs *push_req;     /* base of the allocated pool */
  501 static struct async_reqs *req_freelist; /* available req structs */
  502 static struct async_reqs *push_list;    /* pending reqs */
  503 static kmutex_t push_lock;              /* protects req pool */
  504 static kcondvar_t push_cv;              /* pageout() waits here for work */
  505
  506 static int async_list_size = 256;       /* number of async request structs */
  507
  508 static void pageout_scanner(void);
  509
  510 /*
  511  * If a page is being shared more than "po_share" times
  512  * then leave it alone- don't page it out.
       *
       * schedpaging() halves po_share, but never below MIN_PO_SHARE, each
       * time it finds enough free memory that the scanner need not run.
  513  */
  514 #define MIN_PO_SHARE    (8)
  515 #define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
  516 ulong_t po_share = MIN_PO_SHARE;
 517
  518 /*
  519  * Schedule rate for paging.
  520  * Rate is linear interpolation between
  521  * slowscan with lotsfree and fastscan when out of memory.
       *
       * arg is not examined here; it is handed back to timeout() so that the
       * next invocation receives the same value (pageout() starts the cycle
       * with NULL).  Every call re-arms itself to run again after
       * hz / RATETOSCHEDPAGING ticks.
       *
       * The scan parameters are only updated when pageout_mutex can be taken
       * without blocking; otherwise this pass leaves nscan, desscan and
       * pageout_ticks untouched.
  522  */
  523 static void
  524 schedpaging(void *arg)
  525 {
  526         spgcnt_t vavail;
  527
  528         if (freemem < lotsfree + needfree + kmem_reapahead)
  529                 kmem_reap();
  530
  531         if (freemem < lotsfree + needfree)
  532                 seg_preap();
  533
  534         if (mutex_tryenter(&pageout_mutex)) {
  535                 /* pageout() not running */
  536                 nscan = 0;
  537                 vavail = freemem - deficit;
  538                 if (pageout_new_spread != 0)
  539                         vavail -= needfree;
                      /*
                       * Clamp vavail to [0, lotsfree] so that both
                       * interpolations below stay within their end points.
                       */
  540                 if (vavail < 0)
  541                         vavail = 0;
  542                 if (vavail > lotsfree)
  543                         vavail = lotsfree;
  544
  545                 /*
  546                  * Fix for 1161438 (CRS SPR# 73922).  All variables
  547                  * in the original calculation for desscan were 32 bit signed
  548                  * ints.  As freemem approaches 0x0 on a system with 1 Gig or
  549                  * more of memory, the calculation can overflow.  When this
  550                  * happens, desscan becomes negative and pageout_scanner()
  551                  * stops paging out.
  552                  */
  553                 if ((needfree) && (pageout_new_spread == 0)) {
  554                         /*
  555                          * If we've not yet collected enough samples to
  556                          * calculate a spread, use the old logic of kicking
  557                          * into high gear anytime needfree is non-zero.
  558                          */
  559                         desscan = fastscan / RATETOSCHEDPAGING;
  560                 } else {
  561                         /*
  562                          * Once we've calculated a spread based on system
  563                          * memory and usage, just treat needfree as another
  564                          * form of deficit.
  565                          */
  566                         spgcnt_t faststmp, slowstmp, result;
  567
  568                         slowstmp = slowscan * vavail;
  569                         faststmp = fastscan * (lotsfree - vavail);
  570                         result = (slowstmp + faststmp) /
  571                             nz(lotsfree) / RATETOSCHEDPAGING;
  572                         desscan = (pgcnt_t)result;
  573                 }
  574
                      /*
                       * Scale the tick budget the same way: min_pageout_ticks
                       * when vavail == lotsfree, rising linearly to
                       * max_pageout_ticks when vavail == 0.
                       */
  575                 pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
  576                     (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
  577
  578                 if (freemem < lotsfree + needfree ||
  579                     pageout_sample_cnt < pageout_sample_lim) {
  580                         TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
  581                             "pageout_cv_signal:freemem %ld", freemem);
  582                         cv_signal(&proc_pageout->p_cv);
  583                 } else {
  584                         /*
  585                          * There are enough free pages, no need to
  586                          * kick the scanner thread.  And next time
  587                          * around, keep more of the `highly shared'
  588                          * pages.
  589                          */
  590                         cv_signal_pageout();
  591                         if (po_share > MIN_PO_SHARE) {
  592                                 po_share >>= 1;
  593                         }
  594                 }
  595                 mutex_exit(&pageout_mutex);
  596         }
  597
  598         /*
  599          * Signal threads waiting for available memory.
  600          * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
  601          * in this case it is not needed - the waiters will be woken up during
  602          * the next invocation of this function.
  603          */
  604         if (kmem_avail() > 0)
  605                 cv_broadcast(&memavail_cv);
  606
  607         (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
  608 }
 609
      /*
       * pushes counts the fop_putpage() calls made by pageout() that
       * returned 0; pageout() resets it to zero each time it wakes up from
       * waiting on push_cv.
       */
  610 pgcnt_t         pushes;
  611 ulong_t         push_list_size;         /* # of requests on pageout queue */
  612
      /*
       * Clock hand identifiers.
       * NOTE(review): presumably the int argument of checkpage(); the uses
       * are not visible in this part of the file.
       */
  613 #define FRONT   1
  614 #define BACK    2
  615
  616 int dopageout = 1;      /* must be non-zero to turn page stealing on */
 617
 618 /*
 619  * The page out daemon, which runs as process 2.
 620  *
 621  * As long as there are at least lotsfree pages,
 622  * this process is not run.  When the number of free
 623  * pages stays in the range desfree to lotsfree,
 624  * this daemon runs through the pages in the loop
 625  * at a rate determined in schedpaging().  Pageout manages
 626  * two hands on the clock.  The front hand moves through
 627  * memory, clearing the reference bit,
 628  * and stealing pages from procs that are over maxrss.
 629  * The back hand travels a distance behind the front hand,
 630  * freeing the pages that have not been referenced in the time
 631  * since the front hand passed.  If modified, they are pushed to
 632  * swap before being freed.
 633  *
 634  * There are 2 threads that act on behalf of the pageout process.
 635  * One thread scans pages (pageout_scanner) and frees them up if
 636  * they don't require any fop_putpage operation. If a page must be
 637  * written back to its backing store, the request is put on a list
 638  * and the other (pageout) thread is signaled. The pageout thread
 639  * grabs fop_putpage requests from the list, and processes them.
 640  * Some filesystems may require resources for the fop_putpage
 641  * operations (like memory) and hence can block the pageout
 642  * thread, but the scanner thread can still operate. There is still
 643  * no guarantee that memory deadlocks cannot occur.
 644  *
 645  * For now, this thing is in very rough form.
 646  */
  647 void
  648 pageout()
  649 {
  650         struct async_reqs *arg;
  651         pri_t pageout_pri;
  652         int i;
  653         pgcnt_t max_pushes;
  654         callb_cpr_t cprinfo;
  655
              /*
               * Record this process as proc_pageout, zero its accumulated
               * times, and label it "pageout".
               * NOTE(review): the u_comm copy is 7 bytes, i.e. without the
               * terminating NUL that the 8-byte u_psargs copy includes;
               * presumably u_comm is already zero-filled - confirm.
               */
  656         proc_pageout = ttoproc(curthread);
  657         proc_pageout->p_cstime = 0;
  658         proc_pageout->p_stime =  0;
  659         proc_pageout->p_cutime =  0;
  660         proc_pageout->p_utime = 0;
  661         bcopy("pageout", PTOU(curproc)->u_psargs, 8);
  662         bcopy("pageout", PTOU(curproc)->u_comm, 7);
  663
  664         /*
  665          * Initialize the scanner mutex and the request pool lock.
  666          */
  667         mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
  668         mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
  669
  670         /*
  671          * Allocate and initialize the async request structures
  672          * for pageout.
  673          */
  674         push_req = (struct async_reqs *)
  675             kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
  676
              /*
               * Chain every entry to its successor to form the freelist.  The
               * last entry's a_next is left as kmem_zalloc() returned it
               * (assumed zero-filled, i.e. NULL).
               */
  677         req_freelist = push_req;
  678         for (i = 0; i < async_list_size - 1; i++)
  679                 push_req[i].a_next = &push_req[i + 1];
  680
  681         pageout_pri = curthread->t_pri;
  682
  683         /* Create the pageout scanner thread. */
  684         (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
  685             pageout_pri - 1);
  686
  687         /*
  688          * kick off pageout scheduler.
  689          */
  690         schedpaging(NULL);
  691
  692         /*
  693          * Limit pushes to avoid saturating pageout devices.
               * max_pushes is computed once here; a later change to maxpgio
               * is not picked up by the loop below.
  694          */
  695         max_pushes = maxpgio / RATETOSCHEDPAGING;
  696         CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
  697
              /* Service putpage requests forever; this function never returns. */
  698         for (;;) {
  699                 mutex_enter(&push_lock);
  700
                      /*
                       * Sleep while there is nothing pending or the push
                       * limit has been exceeded.  The push count starts over
                       * after every wakeup.
                       */
  701                 while ((arg = push_list) == NULL || pushes > max_pushes) {
  702                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
  703                         cv_wait(&push_cv, &push_lock);
  704                         pushes = 0;
  705                         CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
  706                 }
                      /* Unlink the head request; do the I/O without push_lock. */
  707                 push_list = arg->a_next;
  708                 arg->a_next = NULL;
  709                 mutex_exit(&push_lock);
  710
                      /* Only requests for which fop_putpage() returns 0 count. */
  711                 if (fop_putpage(arg->a_vp, (offset_t)arg->a_off,
  712                     arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
  713                         pushes++;
  714                 }
  715
  716                 /* vp held by checkpage() */
  717                 VN_RELE(arg->a_vp);
  718
  719                 mutex_enter(&push_lock);
  720                 arg->a_next = req_freelist;     /* back on freelist */
  721                 req_freelist = arg;
  722                 push_list_size--;
  723                 mutex_exit(&push_lock);
  724         }
  725 }
 726
 727 /*
 728  * Kernel thread that scans pages looking for ones to free
 729  */
 730 static void
 731 pageout_scanner(void)
 732 {
 733         struct page *fronthand, *backhand;
 734         uint_t count;
 735         callb_cpr_t cprinfo;
 736         pgcnt_t nscan_limit;
 737         pgcnt_t pcount;
 738
 739         CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
 740         mutex_enter(&pageout_mutex);
 741
 742         /*
 743          * The restart case does not attempt to point the hands at roughly
 744          * the right point on the assumption that after one circuit things
 745          * will have settled down - and restarts shouldn't be that often.
 746          */
 747
 748         /*
 749          * Set the two clock hands to be separated by a reasonable amount,
 750          * but no more than 360 degrees apart.
 751          */
 752         backhand = page_first();
 753         if (handspreadpages >= total_pages)
 754                 fronthand = page_nextn(backhand, total_pages - 1);
 755         else
 756                 fronthand = page_nextn(backhand, handspreadpages);
 757
 758         min_pageout_ticks = MAX(1,
 759             ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
 760         max_pageout_ticks = MAX(min_pageout_ticks,
 761             ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
 762
 763 loop:
 764         cv_signal_pageout();
 765
 766         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 767         cv_wait(&proc_pageout->p_cv, &pageout_mutex);
 768         CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
 769
 770         if (!dopageout)
 771                 goto loop;
 772
 773         if (reset_hands) {
 774                 reset_hands = 0;
 775
 776                 backhand = page_first();
 777                 if (handspreadpages >= total_pages)
 778                         fronthand = page_nextn(backhand, total_pages - 1);
 779                 else
 780                         fronthand = page_nextn(backhand, handspreadpages);
 781         }
 782
 783         CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
 784         count = 0;
 785
 786         TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
 787             "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
 788             freemem, lotsfree, nscan, desscan);
 789
 790         pcount = 0;
 791         if (pageout_sample_cnt < pageout_sample_lim) {
 792                 nscan_limit = total_pages;
 793         } else {
 794                 nscan_limit = desscan;
 795         }
 796         pageout_lbolt = ddi_get_lbolt();
 797         sample_start = gethrtime();
 798
 799         /*
 800          * Scan the appropriate number of pages for a single duty cycle.
 801          * However, stop scanning as soon as there is enough free memory.
 802          * For a short while, we will be sampling the performance of the
 803          * scanner and need to keep running just to get sample data, in
 804          * which case we keep going and don't pay attention to whether
 805          * or not there is enough free memory.
 806          */
 807
 808         while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
 809             pageout_sample_cnt < pageout_sample_lim)) {
 810                 int rvfront, rvback;
 811
 812                 /*
 813                  * Check to see if we have exceeded our %CPU budget
 814                  * for this wakeup, but not on every single page visited,
 815                  * just every once in a while.
 816                  */
 817                 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
 818                         pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
 819                         if (pageout_cycle_ticks >= pageout_ticks) {
 820                                 ++pageout_timeouts;
 821                                 break;
 822                         }
 823                 }
 824
 825                 /*
 826                  * If checkpage manages to add a page to the free list,
 827                  * we give ourselves another couple of trips around the loop.
 828                  */
 829                 if ((rvfront = checkpage(fronthand, FRONT)) == 1)
 830                         count = 0;
 831                 if ((rvback = checkpage(backhand, BACK)) == 1)
 832                         count = 0;
 833
 834                 ++pcount;
 835
 836                 /*
 837                  * protected by pageout_mutex instead of cpu_stat_lock
 838                  */
 839                 CPU_STATS_ADDQ(CPU, vm, scan, 1);
 840
 841                 /*
 842                  * Don't include ineligible pages in the number scanned.
 843                  */
 844                 if (rvfront != -1 || rvback != -1)
 845                         nscan++;
 846
 847                 backhand = page_next(backhand);
 848
 849                 /*
 850                  * backhand update and wraparound check are done separately
 851                  * because lint barks when it finds an empty "if" body
 852                  */
 853
 854                 if ((fronthand = page_next(fronthand)) == page_first()) {
 855                         TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
 856                             "pageout_hand_wrap:freemem %ld whichhand %d",
 857                             freemem, FRONT);
 858
 859                         /*
 860                          * protected by pageout_mutex instead of cpu_stat_lock
 861                          */
 862                         CPU_STATS_ADDQ(CPU, vm, rev, 1);
 863                         if (++count > 1) {
 864                                 /*
 865                                  * Extremely unlikely, but it happens.
 866                                  * We went around the loop at least once
 867                                  * and didn't get far enough.
 868                                  * If we are still skipping `highly shared'
 869                                  * pages, skip fewer of them.  Otherwise,
 870                                  * give up till the next clock tick.
 871                                  */
 872                                 if (po_share < MAX_PO_SHARE) {
 873                                         po_share <<= 1;
 874                                 } else {
 875                                         /*
 876                                          * Really a "goto loop", but
 877                                          * if someone is TRACing,  at least
 878                                          * make records to show where we
 879                                          * are.
 880                                          */
 881                                         break;
 882                                 }
 883                         }
 884                 }
 885         }
 886
 887         sample_end = gethrtime();
 888
 889         TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
 890             "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
 891             freemem, lotsfree, nscan, desscan, count);
 892
 893         if (pageout_sample_cnt < pageout_sample_lim) {
 894                 pageout_sample_pages += pcount;
 895                 pageout_sample_etime += sample_end - sample_start;
 896                 ++pageout_sample_cnt;
 897         }
 898         if (pageout_sample_cnt >= pageout_sample_lim &&
 899             pageout_new_spread == 0) {
 900                 pageout_rate = (hrrate_t)pageout_sample_pages *
 901                     (hrrate_t)(NANOSEC) / pageout_sample_etime;
 902                 pageout_new_spread = pageout_rate / 10;
 903                 setupclock(1);
 904         }
 905
 906         goto loop;
 907 }
 908
 909 /*
 910  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 911  * system (u., page table) or free, then leave it alone.  Otherwise,
 912  * if we are running the front hand, turn off the page's reference bit.
 913  * If the proc is over maxrss, we take it.  If running the back hand,
 914  * check whether the page has been reclaimed.  If not, free the page,
 915  * pushing it to disk first if necessary.
 916  *
 917  * Return values:
 918  *      -1 if the page is not a candidate at all,
 919  *       0 if not freed, or
 920  *       1 if we freed it.
 921  */
 922 static int
 923 checkpage(struct page *pp, int whichhand)
 924 {
 925         int ppattr;
 926         int isfs = 0;
 927         int isexec = 0;
 928         int pagesync_flag;
 929
 930         /*
 931          * Skip pages:
 932          *      - associated with the kernel vnode since
 933          *          they are always "exclusively" locked.
 934          *      - that are free
 935          *      - that are shared more than po_share'd times
 936          *      - its already locked
 937          *
 938          * NOTE:  These optimizations assume that reads are atomic.
 939          */
 940
 941         if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
 942             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
 943             hat_page_checkshare(pp, po_share)) {
 944                 return (-1);
 945         }
 946
 947         if (!page_trylock(pp, SE_EXCL)) {
 948                 /*
 949                  * Skip the page if we can't acquire the "exclusive" lock.
 950                  */
 951                 return (-1);
 952         } else if (PP_ISFREE(pp)) {
 953                 /*
 954                  * It became free between the above check and our actually
 955                  * locking the page.  Oh, well there will be other pages.
 956                  */
 957                 page_unlock(pp);
 958                 return (-1);
 959         }
 960
 961         /*
 962          * Reject pages that cannot be freed. The page_struct_lock
 963          * need not be acquired to examine these
 964          * fields since the page has an "exclusive" lock.
 965          */
 966         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
 967                 page_unlock(pp);
 968                 return (-1);
 969         }
 970
 971         /*
 972          * Maintain statistics for what we are freeing
 973          */
 974
 975         if (pp->p_vnode != NULL) {
 976                 if (pp->p_vnode->v_flag & VVMEXEC)
 977                         isexec = 1;
 978
 979                 if (!IS_SWAPFSVP(pp->p_vnode))
 980                         isfs = 1;
 981         }
 982
 983         /*
 984          * Turn off REF and MOD bits with the front hand.
 985          * The back hand examines the REF bit and always considers
 986          * SHARED pages as referenced.
 987          */
 988         if (whichhand == FRONT)
 989                 pagesync_flag = HAT_SYNC_ZERORM;
 990         else
 991                 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
 992                     HAT_SYNC_STOPON_SHARED;
 993
 994         ppattr = hat_pagesync(pp, pagesync_flag);
 995
 996 recheck:
 997         /*
 998          * If page is referenced; make unreferenced but reclaimable.
 999          * If this page is not referenced, then it must be reclaimable
1000          * and we can add it to the free list.
1001          */
1002         if (ppattr & P_REF) {
1003                 TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
1004                     "pageout_isref:pp %p whichhand %d", pp, whichhand);
1005                 if (whichhand == FRONT) {
1006                         /*
1007                          * Checking of rss or madvise flags needed here...
1008                          *
1009                          * If not "well-behaved", fall through into the code
1010                          * for not referenced.
1011                          */
1012                         hat_clrref(pp);
1013                 }
1014                 /*
1015                  * Somebody referenced the page since the front
1016                  * hand went by, so it's not a candidate for
1017                  * freeing up.
1018                  */
1019                 page_unlock(pp);
1020                 return (0);
1021         }
1022
1023         VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1024
1025         /*
1026          * If large page, attempt to demote it. If successfully demoted,
1027          * retry the checkpage.
1028          */
1029         if (pp->p_szc != 0) {
1030                 if (!page_try_demote_pages(pp)) {
1031                         VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1032                         page_unlock(pp);
1033                         return (-1);
1034                 }
1035                 ASSERT(pp->p_szc == 0);
1036                 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1037                 /*
1038                  * since page_try_demote_pages() could have unloaded some
1039                  * mappings it makes sense to reload ppattr.
1040                  */
1041                 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1042         }
1043
1044         /*
1045          * If the page is currently dirty, we have to arrange
1046          * to have it cleaned before it can be freed.
1047          *
1048          * XXX - ASSERT(pp->p_vnode != NULL);
1049          */
1050         if ((ppattr & P_MOD) && pp->p_vnode) {
1051                 struct vnode *vp = pp->p_vnode;
1052                 uoff_t offset = pp->p_offset;
1053
1054                 /*
1055                  * XXX - Test for process being swapped out or about to exit?
1056                  * [Can't get back to process(es) using the page.]
1057                  */
1058
1059                 /*
1060                  * Hold the vnode before releasing the page lock to
1061                  * prevent it from being freed and re-used by some
1062                  * other thread.
1063                  */
1064                 VN_HOLD(vp);
1065                 page_unlock(pp);
1066
1067                 /*
1068                  * Queue i/o request for the pageout thread.
1069                  */
1070                 if (!queue_io_request(vp, offset)) {
1071                         VN_RELE(vp);
1072                         return (0);
1073                 }
1074                 return (1);
1075         }
1076
1077         /*
1078          * Now we unload all the translations,
1079          * and put the page back on to the free list.
1080          * If the page was used (referenced or modified) after
1081          * the pagesync but before it was unloaded we catch it
1082          * and handle the page properly.
1083          */
1084         TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
1085             "pageout_free:pp %p whichhand %d", pp, whichhand);
1086         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1087         ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1088         if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
1089                 goto recheck;
1090
1091         VN_DISPOSE(pp, B_FREE, 0, kcred);
1092
1093         CPU_STATS_ADD_K(vm, dfree, 1);
1094
1095         if (isfs) {
1096                 if (isexec) {
1097                         CPU_STATS_ADD_K(vm, execfree, 1);
1098                 } else {
1099                         CPU_STATS_ADD_K(vm, fsfree, 1);
1100                 }
1101         } else {
1102                 CPU_STATS_ADD_K(vm, anonfree, 1);
1103         }
1104
1105         return (1);             /* freed a page! */
1106 }
1107
1108 /*
1109  * Queue async i/o request from pageout_scanner and segment swapout
1110  * routines on one common list.  This ensures that pageout devices (swap)
1111  * are not saturated by pageout_scanner or swapout requests.
1112  * The pageout thread empties this list by initiating i/o operations.
1113  */
1114 int
1115 queue_io_request(vnode_t *vp, uoff_t off)
1116 {
1117         struct async_reqs *arg;
1118
1119         /*
1120          * If we cannot allocate an async request struct,
1121          * skip this page.
1122          */
1123         mutex_enter(&push_lock);
1124         if ((arg = req_freelist) == NULL) {
1125                 mutex_exit(&push_lock);
1126                 return (0);
1127         }
1128         req_freelist = arg->a_next;             /* adjust freelist */
1129         push_list_size++;
1130
1131         arg->a_vp = vp;
1132         arg->a_off = off;
1133         arg->a_len = PAGESIZE;
1134         arg->a_flags = B_ASYNC | B_FREE;
1135         arg->a_cred = kcred;            /* always held */
1136
1137         /*
1138          * Add to list of pending write requests.
1139          */
1140         arg->a_next = push_list;
1141         push_list = arg;
1142
1143         if (req_freelist == NULL) {
1144                 /*
1145                  * No free async requests left. The lock is held so we
1146                  * might as well signal the pusher thread now.
1147                  */
1148                 cv_signal(&push_cv);
1149         }
1150         mutex_exit(&push_lock);
1151         return (1);
1152 }
1153
1154 /*
1155  * Wakeup pageout to initiate i/o if push_list is not empty.
1156  */
1157 void
1158 cv_signal_pageout()
1159 {
1160         if (push_list != NULL) {
1161                 mutex_enter(&push_lock);
1162                 cv_signal(&push_cv);
1163                 mutex_exit(&push_lock);
1164         }
1165 }