/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2018 Joyent, Inc.
 * Copyright 2023 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/time.h>
#include <sys/stdbool.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

/*
 * FREE MEMORY MANAGEMENT
 *
 * Management of the pool of free pages is a tricky business.  There are
 * several critical threshold values which constrain our allocation of new
 * pages and inform the rate of paging out of memory to swap.  These threshold
 * values, and the behaviour they induce, are described below in descending
 * order of size -- and thus increasing order of severity!
 *
 *   +---------------------------------------------------- physmem (all memory)
 *   |
 *   | Ordinarily there are no particular constraints placed on page
 *   v allocation.  The page scanner is not running and page_create_va()
 *   | will effectively grant all page requests (whether from the kernel
 *   | or from user processes) without artificial delay.
 *   |
 *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
 *   |
 *   | When we have less than "lotsfree" pages, pageout_scanner() is
 *   v signalled by schedpaging() to begin looking for pages that can
 *   | be evicted to disk to bring us back above lotsfree.  At this
 *   | stage there is still no constraint on allocation of free pages.
 *   |
 *   | For small systems, we set a lower bound of 16MB for lotsfree;
 *   v this is the natural value for a system with 1GB memory.  This is
 *   | to ensure that the pageout reserve pool contains at least 4MB
 *   | for use by ZFS.
 *   |
 *   | For systems with a large amount of memory, we constrain lotsfree
 *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
 *   v at some point the required slack relates more closely to the
 *   | rate at which paging can occur than to the total amount of memory.
 *   |
 *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 *   |
 *   | When we drop below desfree, a number of kernel facilities will
 *   v wait before allocating more memory, under the assumption that
 *   | pageout or reaping will make progress and free up some memory.
 *   | This behaviour is not especially coordinated; look for comparisons
 *   | of desfree and freemem.
 *   |
 *   | In addition to various attempts at advisory caution, clock()
 *   | will wake up the thread that is ordinarily parked in sched().
 *   | This routine is responsible for the heavy-handed swapping out
 *   v of entire processes in an attempt to arrest the slide of free
 *   | memory.  See comments in sched.c for more details.
 *   |
 *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 *   |
 *   | These two separate tunables have, by default, the same value.
 *   v Various parts of the kernel use minfree to signal the need for
 *   | more aggressive reclamation of memory, and sched() is more
 *   | aggressive at swapping processes out.
 *   |
 *   | If free memory falls below throttlefree, page_create_va() will
 *   | use page_create_throttle() to begin holding most requests for
 *   | new pages while pageout and reaping free up memory.  Sleeping
 *   v allocations (e.g., KM_SLEEP) are held here while we wait for
 *   | more memory.  Non-sleeping allocations are generally allowed to
 *   | proceed, unless their priority is explicitly lowered with
 *   | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).).
 *   |
 *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 *   |
 *   | When we hit throttlefree, the situation is already dire.  The
 *   v system is generally paging out memory and swapping out entire
 *   | processes in order to free up memory for continued operation.
 *   |
 *   | Unfortunately, evicting memory to disk generally requires short
 *   | term use of additional memory; e.g., allocation of buffers for
 *   | storage drivers, updating maps of free and used blocks, etc.
 *   | As such, pageout_reserve is the number of pages that we keep in
 *   | special reserve for use by pageout() and sched() and by any
 *   v other parts of the kernel that need to be working for those to
 *   | make forward progress such as the ZFS I/O pipeline.
 *   |
 *   | When we are below pageout_reserve, we fail or hold any allocation
 *   | that has not explicitly requested access to the reserve pool.
 *   | Access to the reserve is generally granted via the KM_PUSHPAGE
 *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
 *   | can implicitly tap the reserve.  For more details, see the
 *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
 *   |
 *   +---------------------------------------------------------- no free memory
 *   |
 *   | If we have arrived here, things are very bad indeed.  It is
 *   v surprisingly difficult to tell if this condition is even fatal,
 *   | as enough memory may have been granted to pageout() and to the
 *   | ZFS I/O pipeline that requests for eviction that have already been
 *   | made will complete and free up memory some time soon.
 *   |
 *   | If free memory does not materialise, the system generally remains
 *   | deadlocked.  The pageout_deadman() below is run once per second
 *   | from clock(), seeking to limit the amount of time a single request
 *   v to page out can be blocked before the system panics to get a crash
 *   | dump and return to service.
 *   |
 *   +-------------------------------------------------------------------------
 */

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time based
 * on the size of the system; see setupclock().  If they are patched non-zero
 * in a loaded vmunix they are left alone and may thus be changed per system
 * using "mdb -kw" on the loaded system.
 */

pgcnt_t slowscan = 0;
pgcnt_t fastscan = 0;

static pgcnt_t handspreadpages = 0;

/*
 * looppages:
 *	Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *	Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t loopfraction = 2;
static pgcnt_t looppages;

static uint_t min_percent_cpu = 4;
static uint_t max_percent_cpu = 80;
static pgcnt_t maxfastscan = 0;
static pgcnt_t maxslowscan = 100;

#define	MEGABYTES	(1024ULL * 1024ULL)

/*
 * pageout_threshold_style:
 *	set to 1 to use the previous default threshold size calculation;
 *	i.e., each threshold is half of the next largest value.
 */
uint_t pageout_threshold_style = 0;

/*
 * The operator may override these tunables to request a different minimum or
 * maximum lotsfree value, or to change the divisor we use for automatic
 * sizing.
 *
 * By default, we make lotsfree 1/64th of the total memory in the machine.  The
 * minimum and maximum are specified in bytes, rather than pages; a zero value
 * means the default values (below) are used.
 */
uint_t lotsfree_fraction = 64;
pgcnt_t lotsfree_min = 0;
pgcnt_t lotsfree_max = 0;

#define	LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
#define	LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)

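/*
 * For example, an operator wanting a larger free-memory cushion could halve
 * the automatic sizing divisor (and so double lotsfree) with an /etc/system
 * entry such as the following; the value here is illustrative, not a
 * recommendation:
 *
 *	set lotsfree_fraction = 32
 */
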
/*
 * If these tunables are set to non-zero values in /etc/system, and provided
 * the value is not larger than the threshold above, the specified value will
 * be used directly without any additional calculation or adjustment.  The boot
 * time value of these overrides is preserved in the "clockinit" struct.  More
 * detail is available in the comment at the top of the file.
 */
pgcnt_t maxpgio = 0;
pgcnt_t minfree = 0;
pgcnt_t desfree = 0;
pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
pgcnt_t throttlefree = 0;
pgcnt_t pageout_reserve = 0;

pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;

/* kstats */
uint64_t low_mem_scan;

/* The maximum supported number of pageout_scanner() threads */
#define	MAX_PSCAN_THREADS	16

/*
 * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
 * number of nanoseconds in each wakeup cycle that gives the equivalent of some
 * underlying %CPU duty cycle.
 *
 * min_pageout_nsec:
 *	nanoseconds/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_nsec:
 *	nanoseconds/wakeup equivalent of max_percent_cpu.
 *
 * pageout_nsec:
 *	Number of nanoseconds budgeted for each wakeup cycle.
 *	Computed each time around by schedpaging().
 *	Varies between min_pageout_nsec and max_pageout_nsec,
 *	depending on memory pressure.
 */
static hrtime_t	min_pageout_nsec;
static hrtime_t	max_pageout_nsec;
static hrtime_t	pageout_nsec;

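/*
 * With the default values above (min_percent_cpu = 4, max_percent_cpu = 80,
 * SCHEDPAGING_HZ = 4, i.e. a 250ms wakeup cycle), this works out to:
 *
 *	min_pageout_nsec = 10^9 *  4 / 100 / 4 =  10,000,000ns (10ms/cycle)
 *	max_pageout_nsec = 10^9 * 80 / 100 / 4 = 200,000,000ns (200ms/cycle)
 *
 * i.e. 10ms of scanning in each 250ms cycle is a 4% duty cycle, and 200ms
 * is 80%.  (Illustrative arithmetic only; the authoritative computation is
 * in setupclock().)
 */
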
static bool reset_hands[MAX_PSCAN_THREADS];

#define	PAGES_POLL_MASK	1023

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan SCHEDPAGING_HZ
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging() resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging() sets this value based on the amount of
 * currently available memory.
 */
#define	SCHEDPAGING_HZ	4

/*
 * despagescanners:
 *	The desired number of page scanner threads.  For testing purposes, this
 *	value can be set in /etc/system or tuned directly with mdb(1).  The
 *	system will bring the actual number of threads into line with the
 *	desired number.  If set to an invalid value, the system will correct
 *	the setting.
 */
uint_t despagescanners = 0;

/*
 * pageout_sample_lim:
 *	The limit on the number of samples needed to establish a value for new
 *	pageout parameters: fastscan, slowscan, pageout_new_spread, and
 *	handspreadpages.
 *
 * pageout_sample_cnt:
 *	Current sample number.  Once the sample gets large enough, set new
 *	values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *	The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *	The accumulated nanoseconds for the sample.
 *
 * pageout_sampling:
 *	True while sampling is still in progress.
 *
 * pageout_rate:
 *	Rate in pages/second, computed at the end of sampling.
 *
 * pageout_new_spread:
 *	Initially zero while the system scan rate is measured by
 *	pageout_scanner(), which then sets this value once per system boot
 *	after enough samples have been recorded (pageout_sample_cnt).  Once
 *	set, this new value is used for fastscan and handspreadpages.
 */
typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrtime_t	pageout_sample_etime = 0;
static bool	pageout_sampling = true;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

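/*
 * For example (illustrative figures only): if the initial samples cover
 * 8,000,000 pages over a total of 2 seconds (pageout_sample_etime of
 * 2,000,000,000ns), then, per pageout_sample_add() below,
 *
 *	pageout_rate = 8000000 * NANOSEC / 2000000000 = 4000000 pages/sec
 *
 * and pageout_new_spread = pageout_rate / 10 = 400000 pages, which then
 * seeds maxfastscan (and hence fastscan and handspreadpages) in
 * setupclock().
 */
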
/* The current number of page scanner threads */
static uint_t n_page_scanners = 1;
/* The number of page scanner threads that are actively scanning. */
static uint_t pageouts_running;

/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif	/* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t memavail_lock;
kcondvar_t memavail_cv;

typedef enum pageout_hand {
	POH_FRONT = 1,
	POH_BACK,
} pageout_hand_t;

typedef enum {
	CKP_INELIGIBLE,
	CKP_NOT_FREED,
	CKP_FREED,
} checkpage_result_t;

static checkpage_result_t checkpage(page_t *, pageout_hand_t);

static struct clockinit {
	bool ci_init;
	pgcnt_t ci_lotsfree_min;
	pgcnt_t ci_lotsfree_max;
	pgcnt_t ci_lotsfree;
	pgcnt_t ci_desfree;
	pgcnt_t ci_minfree;
	pgcnt_t ci_throttlefree;
	pgcnt_t ci_pageout_reserve;
	pgcnt_t ci_maxpgio;
	pgcnt_t ci_maxfastscan;
	pgcnt_t ci_fastscan;
	pgcnt_t ci_slowscan;
	pgcnt_t ci_handspreadpages;
	uint_t ci_despagescanners;
} clockinit = { .ci_init = false };

static inline pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
	if (value < minimum)
		return (minimum);
	else if (value > maximum)
		return (maximum);
	else
		return (value);
}

static pgcnt_t
tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
{
	if (initval == 0 || initval >= initval_ceiling)
		return (defval);
	else
		return (initval);
}

/*
 * On large memory systems, multiple instances of the page scanner are run,
 * each responsible for a separate region of memory.  This speeds up page
 * invalidation under low memory conditions.
 *
 * For testing purposes, despagescanners can be set in /etc/system or via
 * mdb(1) and it will be used as a guide for how many page scanners to create;
 * the value will be adjusted if it is not sensible.  Otherwise, the number of
 * page scanners is determined dynamically based on handspreadpages.
 */
static void
recalc_pagescanners(void)
{
	uint_t des;

	/* If the initial calibration has not been done, take no action. */
	if (pageout_new_spread == 0)
		return;

	/*
	 * If `clockinit.ci_despagescanners` is non-zero, then a value for
	 * `despagescanners` was set during initial boot.  In this case, if
	 * `despagescanners` has been reset to 0 then we want to revert to
	 * that initial boot value.
	 */
	if (despagescanners == 0)
		despagescanners = clockinit.ci_despagescanners;

	if (despagescanners != 0) {
		/*
		 * We have a desired number of page scanners, either from
		 * /etc/system or set via mdb.  Try and use it (it will be
		 * adjusted below if necessary).
		 */
		des = despagescanners;
	} else {
		/*
		 * Calculate the number of desired scanners based on the
		 * system's memory size.
		 *
		 * A 64GiB region size is used as the basis for calculating how
		 * many scanner threads should be created.  For systems with up
		 * to 64GiB of RAM, a single thread is used; for very large
		 * memory systems the threads are limited to MAX_PSCAN_THREADS.
		 */
		des = (looppages - 1) / btop(64ULL << 30) + 1;
	}

	/*
	 * Clamp the number of scanners so that we have no more than
	 * MAX_PSCAN_THREADS and so that each scanner covers at least 10% more
	 * than handspreadpages.
	 */
	pgcnt_t min_scanner_pages = handspreadpages + handspreadpages / 10;
	pgcnt_t max_scanners = looppages / min_scanner_pages;
	despagescanners = clamp(des, 1,
	    clamp(max_scanners, 1, MAX_PSCAN_THREADS));
}

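/*
 * As an illustrative example of the sizing above (figures hypothetical):
 * with 4 KiB pages, a 256 GiB system has looppages = 67108864 and
 * btop(64ULL << 30) = 16777216, giving
 *
 *	des = (67108864 - 1) / 16777216 + 1 = 4
 *
 * i.e. four scanner threads, each covering roughly a 64 GiB region of the
 * memory clock face, subject to the clamping above.
 */
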
/*
 * Set up the paging constants for the clock algorithm used by
 * pageout_scanner(), and by the virtual memory system overall.  See the
 * comments at the top of this file for more information about the threshold
 * values and system responses to memory pressure.
 *
 * This routine is called once by main() at startup, after the initial size of
 * physical memory is determined.  It may be called again later if memory is
 * added to or removed from the system, or if new measurements of the page scan
 * rate become available.
 */
void
setupclock(void)
{
	bool half = (pageout_threshold_style == 1);
	bool recalc = true;

	looppages = total_pages;

	/*
	 * The operator may have provided specific values for some of the
	 * tunables via /etc/system.  On our first call, we preserve those
	 * values so that they can be used for subsequent recalculations.
	 *
	 * A value of zero for any tunable means we will use the default
	 * sizing.
	 */
	if (!clockinit.ci_init) {
		clockinit.ci_init = true;

		clockinit.ci_lotsfree_min = lotsfree_min;
		clockinit.ci_lotsfree_max = lotsfree_max;
		clockinit.ci_lotsfree = lotsfree;
		clockinit.ci_desfree = desfree;
		clockinit.ci_minfree = minfree;
		clockinit.ci_throttlefree = throttlefree;
		clockinit.ci_pageout_reserve = pageout_reserve;
		clockinit.ci_maxpgio = maxpgio;
		clockinit.ci_maxfastscan = maxfastscan;
		clockinit.ci_fastscan = fastscan;
		clockinit.ci_slowscan = slowscan;
		clockinit.ci_handspreadpages = handspreadpages;
		clockinit.ci_despagescanners = despagescanners;

		/*
		 * The first call does not trigger a recalculation, only
		 * subsequent calls.
		 */
		recalc = false;
	}

	/*
	 * Configure paging threshold values.  For more details on what each
	 * threshold signifies, see the comments at the top of this file.
	 */
	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
	    btop(LOTSFREE_MAX_DEFAULT));
	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
	    btop(LOTSFREE_MIN_DEFAULT));

	lotsfree = tune(clockinit.ci_lotsfree, looppages,
	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

	desfree = tune(clockinit.ci_desfree, lotsfree,
	    lotsfree / 2);

	minfree = tune(clockinit.ci_minfree, desfree,
	    half ? desfree / 2 : 3 * desfree / 4);

	throttlefree = tune(clockinit.ci_throttlefree, desfree,
	    minfree);

	pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
	    half ? throttlefree / 2 : 3 * throttlefree / 4);

	/*
	 * Maxpgio thresholds how much paging is acceptable.
	 * This figures that 2/3 busy on an arm is all that is
	 * tolerable for paging.  We assume one operation per disk rev.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (clockinit.ci_maxpgio == 0) {
		maxpgio = (DISKRPM * 2) / 3;
	} else {
		maxpgio = clockinit.ci_maxpgio;
	}

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  Fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, MHz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPUs and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPUs.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 MHz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 MHz SC2000:		68 Meg
	 *
	 *	40 MHz 486:		26 Meg
	 *	66 MHz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 MHz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger pagesizes etc) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPUs can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (clockinit.ci_maxfastscan == 0) {
		if (pageout_new_spread != 0) {
			maxfastscan = pageout_new_spread;
		} else {
			maxfastscan = MAXHANDSPREADPAGES;
		}
	} else {
		maxfastscan = clockinit.ci_maxfastscan;
	}

	if (clockinit.ci_fastscan == 0) {
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	} else {
		fastscan = clockinit.ci_fastscan;
	}

	if (fastscan > looppages / loopfraction) {
		fastscan = looppages / loopfraction;
	}

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (clockinit.ci_slowscan == 0) {
		slowscan = MIN(fastscan / 10, maxslowscan);
	} else {
		slowscan = clockinit.ci_slowscan;
	}

	if (slowscan > fastscan / 2) {
		slowscan = fastscan / 2;
	}

	/*
	 * Handspreadpages is the distance (in pages) between front and back
	 * pageout daemon hands.  The amount of time to reclaim a page
	 * once pageout examines it increases with this distance and
	 * decreases as the scan rate rises.  It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (clockinit.ci_handspreadpages == 0) {
		handspreadpages = fastscan;
	} else {
		handspreadpages = clockinit.ci_handspreadpages;
	}

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
	 * back hand to look at a page during the same wakeup of the pageout
	 * daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages) {
		handspreadpages = looppages - 1;
	}

	/*
	 * Establish the minimum and maximum length of time to be spent
	 * scanning pages per wakeup, limiting the scanner duty cycle.  The
	 * input percentage values (0-100) must be converted to a fraction of
	 * the number of nanoseconds in a second of wall time, then further
	 * scaled down by the number of scanner wakeups in a second.
	 */
	min_pageout_nsec = MAX(1,
	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
	max_pageout_nsec = MAX(min_pageout_nsec,
	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);

	/*
	 * If not called for recalculation, return and skip the remaining
	 * steps.
	 */
	if (!recalc)
		return;

	/*
	 * Set a flag to re-evaluate the clock hand positions.
	 */
	for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
		reset_hands[i] = true;

	recalc_pagescanners();
}

static kmutex_t	pageout_mutex;

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

/*
 * If pageout() is stuck on a single push for this many seconds,
 * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 * to 0, the deadman will have no effect.
 *
 * Note that we are only looking for stalls in the calls that pageout() makes
 * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 * I/O, which should not take long unless the underlying strategy call blocks
 * indefinitely for memory.  The actual I/O request happens (or fails) later.
 */
uint_t pageout_deadman_seconds = 90;

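/*
 * For example (illustrative /etc/system entries, not a recommendation), the
 * deadman could be disabled, or its patience extended, at boot:
 *
 *	set pageout_deadman_seconds = 0		(disable the deadman)
 *	set pageout_deadman_seconds = 300	(allow five minutes per push)
 */
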
static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;

int async_list_size = 8192;

static void pageout_scanner(void *);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone - don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;

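/*
 * To sketch the feedback loop (see pageout_scanner() and schedpaging()
 * below): po_share starts at MIN_PO_SHARE (8); each time a scanner laps its
 * region without making enough progress, po_share doubles (8, 16, 32, ...,
 * up to MAX_PO_SHARE) so that fewer "highly shared" pages are skipped, and
 * when memory pressure abates schedpaging() halves it back towards
 * MIN_PO_SHARE.
 */
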
/*
 * Schedule rate for paging.
 * Rate is linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		if (pageouts_running != 0)
			goto out;

		/* No pageout scanner threads running. */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		/* Note that vavail is signed so don't use clamp() here */
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		if (needfree > 0 && pageout_new_spread == 0) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / SCHEDPAGING_HZ;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / SCHEDPAGING_HZ;
			desscan = (pgcnt_t)result;
		}
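		/*
		 * That is, desscan interpolates linearly between slowscan
		 * and fastscan according to free memory.  As a worked
		 * example (illustrative figures only): with slowscan = 100,
		 * fastscan = 65536, and vavail at half of lotsfree, each
		 * wakeup scans roughly
		 * (100 + 65536) / 2 / SCHEDPAGING_HZ ~= 8204 pages.
		 */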
		pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
		    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);

		DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
		    pageout_nsec);

		if (pageout_new_spread != 0 && despagescanners != 0 &&
		    despagescanners != n_page_scanners) {
			/*
			 * We have finished the pagescan initialisation and the
			 * desired number of page scanners has changed, either
			 * because sampling just finished, because of a memory
			 * DR, or because despagescanners has been modified on
			 * the fly (e.g. via mdb(1)).
			 */
			uint_t curr_nscan = n_page_scanners;
			uint_t i;

			/* Re-validate despagescanners */
			recalc_pagescanners();

			n_page_scanners = despagescanners;

			for (i = 0; i < MAX_PSCAN_THREADS; i++)
				reset_hands[i] = true;

			/* If we need more scanners, start them now. */
			for (i = curr_nscan; i < n_page_scanners; i++) {
				(void) lwp_kernel_create(proc_pageout,
				    pageout_scanner, (void *)(uintptr_t)i,
				    TS_RUN, curthread->t_pri);
			}

			/*
			 * If the number of scanners has decreased, trigger a
			 * wakeup so that the excess threads will terminate.
			 */
			if (n_page_scanners < curr_nscan) {
				WAKE_PAGEOUT_SCANNER(reducing);
			}
		}

		if (pageout_sampling) {
			/*
			 * We still need to measure the rate at which the
			 * system is able to scan pages of memory.  Each of
			 * these initial samples is a scan of as much system
			 * memory as practical, regardless of whether or not we
			 * are experiencing memory pressure.
			 */
			desscan = total_pages;
			pageout_nsec = max_pageout_nsec;

			WAKE_PAGEOUT_SCANNER(sampling);
		} else if (freemem < lotsfree + needfree) {
			/*
			 * We need more memory.
			 */
			low_mem_scan++;
			WAKE_PAGEOUT_SCANNER(lowmem);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner threads.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE)
				po_share >>= 1;
		}
out:
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
	 * in this case it is not needed - the waiters will be woken up during
	 * the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}

pgcnt_t	pushes;
ulong_t	push_list_size;		/* # of requests on pageout queue */

/*
 * Paging out should always be enabled.  This tunable exists to hold pageout
 * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 * sleep each time it is woken by schedpaging().
 */
uint_t dopageout = 1;

/*
 * The page out daemon, which runs as process 2.
 *
 * The daemon treats physical memory as a circular array of pages and scans
 * the pages using a 'two-handed clock' algorithm.  The front hand moves
 * through the pages, clearing the reference bit.  The back hand travels a
 * distance (handspreadpages) behind the front hand, freeing the pages that
 * have not been referenced in the time since the front hand passed.  If
 * modified, they are first written to their backing store before being
 * freed.
 *
 * In order to make page invalidation more responsive on machines with
 * larger memory, multiple pageout_scanner threads may be created.  In this
 * case, each thread is given a segment of the memory "clock face" so that
 * memory can be reclaimed more quickly.  As long as there are at least
 * lotsfree pages, then pageout_scanner threads are not run.
 *
 * There are multiple threads that act on behalf of the pageout process.  A
 * set of threads scans pages (pageout_scanner) and frees them up if they
 * don't require any VOP_PUTPAGE operation.  If a page must be written back
 * to its backing store, the request is put on a list and the other
 * (pageout) thread is signaled.  The pageout thread grabs VOP_PUTPAGE
 * requests from the list, and processes them.  Some filesystems may require
 * resources for the VOP_PUTPAGE operations (like memory) and hence can
 * block the pageout thread, but the scanner thread can still operate.
 * There is still no guarantee that memory deadlocks cannot occur.
 */

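/*
 * As an illustrative example of the clock face segmentation (hypothetical
 * figures): on a system with two scanner threads and looppages = 33554432
 * (128 GiB of 4 KiB pages), each thread is given a span of 16777216 pages,
 * with instance 0 covering pages [0, 16777215] of the clock face and
 * instance 1 the remainder.  Within each region the back hand trails the
 * front hand by handspreadpages, wrapping at the region boundary; see the
 * hand setup in pageout_scanner() below.
 */
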
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++) {
		push_req[i].a_next = &push_req[i + 1];
	}

	pageout_pri = curthread->t_pri;

	/* Create the first pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner,
	    (void *)0,	/* this is instance 0, not NULL */
	    TS_RUN, pageout_pri - 1);

	/*
	 * kick off the pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / SCHEDPAGING_HZ;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		pageout_pushing = true;
		mutex_exit(&push_lock);

		DTRACE_PROBE(pageout__push);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		pageout_pushing = false;
		pageout_pushcount++;
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}

static void
pageout_sample_add(pgcnt_t count, hrtime_t elapsed)
{
	VERIFY(pageout_sampling);

	/*
	 * The global variables used below are only modified during initial
	 * scanning when there is a single page scanner thread running.
	 */
	pageout_sample_pages += count;
	pageout_sample_etime += elapsed;
	pageout_sample_cnt++;

	if (pageout_sample_cnt >= pageout_sample_lim) {
		/*
		 * We have enough samples, set the spread.
		 */
		pageout_sampling = false;
		pageout_rate = (hrrate_t)pageout_sample_pages *
		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
		pageout_new_spread = pageout_rate / 10;
	}
}

static inline page_t *
wrapping_page_next(page_t *cur, page_t *start, page_t *end)
{
	if (cur == end)
		return (start);
	return (page_nextn(cur, 1));
}

/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void *a)
{
	page_t *fhand, *bhand, *fhandstart;
	page_t *regionstart, *regionend;
	uint_t laps;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_cnt;
	pgcnt_t	pcount;
	hrtime_t sample_start, sample_end;
	uint_t inst = (uint_t)(uintptr_t)a;

	VERIFY3U(inst, <, MAX_PSCAN_THREADS);

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down, and restarts shouldn't be that often.
	 */
	reset_hands[inst] = true;

	pageouts_running++;
	mutex_exit(&pageout_mutex);

loop:
	cv_signal_pageout();

	mutex_enter(&pageout_mutex);
	pageouts_running--;
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
	pageouts_running++;
	mutex_exit(&pageout_mutex);

	/*
	 * Check if pageout has been disabled for debugging purposes.
	 */
	if (dopageout == 0)
		goto loop;

	/*
	 * One may reset the clock hands and scanned region for debugging
	 * purposes.  Hands will also be reset on first thread startup, if
	 * the number of scanning threads (n_page_scanners) changes, or if
	 * memory is added to, or removed from, the system.
	 */
	if (reset_hands[inst]) {
		page_t *first;

		reset_hands[inst] = false;

		if (inst >= n_page_scanners) {
			/*
			 * The desired number of page scanners has been
			 * reduced and this instance is no longer wanted.
			 * Exit the lwp.
			 */
			VERIFY3U(inst, !=, 0);
			DTRACE_PROBE1(pageout__exit, uint_t, inst);
			mutex_enter(&pageout_mutex);
			pageouts_running--;
			mutex_exit(&pageout_mutex);
			mutex_enter(&curproc->p_lock);
			lwp_exit();
			/* NOTREACHED */
		}

		first = page_first();

		/*
		 * Each scanner thread gets its own sector of the memory
		 * clock face.
		 */
		pgcnt_t span, offset;

		span = looppages / n_page_scanners;
		VERIFY3U(span, >, handspreadpages);

		offset = inst * span;
		regionstart = page_nextn(first, offset);
		if (inst == n_page_scanners - 1) {
			/* The last instance goes up to the last page */
			regionend = page_nextn(first, looppages - 1);
		} else {
			regionend = page_nextn(regionstart, span - 1);
		}

		bhand = regionstart;
		fhand = page_nextn(bhand, handspreadpages);

		DTRACE_PROBE4(pageout__reset, uint_t, inst,
		    pgcnt_t, regionstart, pgcnt_t, regionend,
		    pgcnt_t, fhand);
	}

	/*
	 * This CPU kstat is only incremented here and we're on this CPU, so no
	 * lock.
	 */
	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);

	/*
	 * Keep track of the number of times we have scanned all the way around
	 * the loop on this wakeup.
	 */
	laps = 0;

	/*
	 * Track the number of pages visited during this scan so that we can
	 * periodically measure our duty cycle.
	 */
	nscan_cnt = 0;
	pcount = 0;

	DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
	    hrtime_t, pageout_nsec, page_t *, bhand, page_t *, fhand);

	/*
	 * Record the initial position of the front hand for this cycle so
	 * that we can detect when the hand wraps around.
	 */
	fhandstart = fhand;

	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 */
	while (nscan_cnt < desscan) {
		checkpage_result_t rvfront, rvback;

		if (!pageout_sampling && freemem >= lotsfree + needfree) {
			/*
			 * We are not sampling and enough memory has become
			 * available that scanning is no longer required.
			 */
			DTRACE_PROBE1(pageout__memfree, uint_t, inst);
			break;
		}

		DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);

		/*
		 * Periodically check to see if we have exceeded the CPU duty
		 * cycle for a single wakeup.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			hrtime_t pageout_cycle_nsec;

			pageout_cycle_nsec = gethrtime() - sample_start;
			if (pageout_cycle_nsec >= pageout_nsec) {
				atomic_inc_64(&pageout_timeouts);
				DTRACE_PROBE1(pageout__timeout, uint_t, inst);
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fhand, POH_FRONT)) == CKP_FREED) {
			laps = 0;
		}
		if ((rvback = checkpage(bhand, POH_BACK)) == CKP_FREED) {
			laps = 0;
		}

		++pcount;

		/*
		 * This CPU kstat is only incremented here and we're on this
		 * CPU, so no lock.
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE)
			nscan_cnt++;

		/*
		 * Tick
		 */
		bhand = wrapping_page_next(bhand, regionstart, regionend);
		fhand = wrapping_page_next(fhand, regionstart, regionend);

		/*
		 * The front hand has wrapped around during this wakeup.
		 */
		if (fhand == fhandstart) {
			laps++;
			DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
			    uint_t, laps);

			/*
			 * This CPU kstat is only incremented here and we're
			 * on this CPU, so no lock.
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);

			if (laps > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					break;
				}
			}
		}
	}

	sample_end = gethrtime();
	atomic_add_long(&nscan, nscan_cnt);

	DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
	    pgcnt_t, nscan_cnt, pgcnt_t, pcount);

	/*
	 * Continue accumulating samples until we have enough to get a
	 * reasonable value for average scan rate.
	 */
	if (pageout_sampling) {
		VERIFY3U(inst, ==, 0);
		pageout_sample_add(pcount, sample_end - sample_start);

		/*
		 * If, after the sample just added, we have finished sampling,
		 * set up the paging constants.
		 */
		if (!pageout_sampling)
			setupclock();
	}

	goto loop;
}

/*
 * The pageout deadman is run once per second by clock().
 */
void
pageout_deadman(void)
{
	if (panicstr != NULL) {
		/*
		 * There is no pageout after panic.
		 */
		return;
	}

	if (pageout_deadman_seconds == 0) {
		/*
		 * The deadman is not enabled.
		 */
		return;
	}

	if (!pageout_pushing) {
		goto reset;
	}

	/*
	 * We are pushing a page.  Check to see if it is the same call we saw
	 * last time we looked:
	 */
	if (pageout_pushcount != pageout_pushcount_seen) {
		/*
		 * It is a different call from the last check, so we are not
		 * stuck.
		 */
		goto reset;
	}

	if (++pageout_stucktime >= pageout_deadman_seconds) {
		panic("pageout_deadman: stuck pushing the same page for %d "
		    "seconds (freemem is %lu)", pageout_deadman_seconds,
		    freemem);
	}

	return;

reset:
	/*
	 * Reset our tracking state to reflect that we are not stuck:
	 */
	pageout_stucktime = 0;
	pageout_pushcount_seen = pageout_pushcount;
}

/*
 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 * system (u., page table) or free, then leave it alone.  Otherwise,
 * if we are running the front hand, turn off the page's reference bit.
 * If the proc is over maxrss, we take it.  If running the back hand,
 * check whether the page has been reclaimed.  If not, free the page,
 * pushing it to disk first if necessary.
 *
 * Return values:
 *	CKP_INELIGIBLE if the page is not a candidate at all,
 *	CKP_NOT_FREED  if the page was not freed, or
 *	CKP_FREED      if we freed it.
 */

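/*
 * A condensed summary of the hand behaviour described above (the code below
 * is authoritative; this table elides the ineligibility checks and the case
 * where a dirty page cannot be queued, which yields CKP_NOT_FREED):
 *
 *	hand	page state		action
 *	------	--------------------	------------------------------------
 *	front	referenced		REF bit cleared; CKP_NOT_FREED
 *	back	referenced		left alone; CKP_NOT_FREED
 *	either	unreferenced, dirty	async push queued; CKP_FREED
 *	either	unreferenced, clean	released to free list; CKP_FREED
 */
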
static checkpage_result_t
checkpage(page_t *pp, pageout_hand_t whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode since
	 *	    they are always "exclusively" locked.
	 *	- that are free
	 *	- that are shared more than po_share times
	 *	- that are already locked
	 *
	 * NOTE:  These optimizations assume that reads are atomic.
	 */
	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
	    hat_page_checkshare(pp, po_share)) {
		return (CKP_INELIGIBLE);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (CKP_INELIGIBLE);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page.  Oh well, there will be other pages.
		 */
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Reject pages that cannot be freed.  The page_struct_lock
	 * need not be acquired to examine these
	 * fields since the page has an "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (CKP_INELIGIBLE);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */
	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == POH_FRONT) {
		pagesync_flag = HAT_SYNC_ZERORM;
	} else {
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;
	}

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If page is referenced; make unreferenced but reclaimable.
	 * If this page is not referenced, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (ppattr & P_REF) {
		DTRACE_PROBE2(pageout__isref, page_t *, pp,
		    pageout_hand_t, whichhand);

		if (whichhand == POH_FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}

		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (CKP_NOT_FREED);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If large page, attempt to demote it.  If successfully demoted,
	 * retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (CKP_INELIGIBLE);
		}

		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);

		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}

	/*
	 * If the page is currently dirty, we have to arrange to have it
	 * cleaned before it can be freed.
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue I/O request for the pageout thread.
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (CKP_NOT_FREED);
		}
		return (CKP_FREED);
	}

	/*
	 * Now we unload all the translations and put the page back on to the
	 * free list.  If the page was used (referenced or modified) after the
	 * pagesync but before it was unloaded we catch it and handle the page
	 * properly.
	 */
	DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
		goto recheck;
	}

	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (CKP_FREED);
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
	struct async_reqs *arg;

	/*
	 * If we cannot allocate an async request struct,
	 * skip this page.
	 */
	mutex_enter(&push_lock);
	if ((arg = req_freelist) == NULL) {
		mutex_exit(&push_lock);
		return (0);
	}
	req_freelist = arg->a_next;	/* adjust freelist */
	push_list_size++;

	arg->a_vp = vp;
	arg->a_off = off;
	arg->a_len = PAGESIZE;
	arg->a_flags = B_ASYNC | B_FREE;
	arg->a_cred = kcred;		/* always held */

	/*
	 * Add to list of pending write requests.
	 */
	arg->a_next = push_list;
	push_list = arg;

	if (req_freelist == NULL) {
		/*
		 * No free async requests left.  The lock is held so we
		 * might as well signal the pusher thread now.
		 */
		cv_signal(&push_cv);
	}
	mutex_exit(&push_lock);
	return (1);
}

/*
 * Wake up pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
	if (push_list != NULL) {
		mutex_enter(&push_lock);
		cv_signal(&push_cv);
		mutex_exit(&push_lock);
	}
}