/*
 * Copyright (c) 1982, 1986, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vmmeter.h	8.2 (Berkeley) 7/10/94
 * $FreeBSD: src/sys/sys/vmmeter.h,v 1.21.2.2 2002/10/10 19:28:21 dillon Exp $
 */

#ifndef _VM_VM_PAGE2_H_
#define _VM_VM_PAGE2_H_

#ifdef _KERNEL

#ifndef _SYS_VMMETER_H_
#include <sys/vmmeter.h>
#endif
#ifndef _SYS_QUEUE_H_
#include <sys/queue.h>
#endif
#ifndef _VM_VM_PAGE_H_
#include <vm/vm_page.h>
#endif
#ifndef _SYS_SPINLOCK_H_
#include <sys/spinlock.h>
#endif
#ifndef _SYS_SPINLOCK2_H_
#include <sys/spinlock2.h>
#endif

/*
 * SMP NOTE
 *
 * VM fault rates are highly dependent on SMP locking conflicts and, on
 * multi-socket systems, cache mastership changes for globals due to atomic
 * ops (even simple atomic_add_*() calls).  Cache mastership changes can
 * limit the aggregate fault rate.
 *
 * For this reason we go through some hoops to access VM statistics for
 * low-memory handling, pageout, and other triggers.  Each cpu collects
 * adjustments in gd->gd_vmstats_adj.  These get rolled up into the global
 * vmstats structure.  The global vmstats structure is then pulled into
 * gd->gd_vmstats by each cpu when it needs it.  Critical path checks always
 * use the pcpu gd->gd_vmstats structure.
 */
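
/*
 * Illustrative sketch (not part of the original header): the scheme above
 * keeps the hot path free of atomics on shared cache lines.  A cpu adjusts
 * only its own gd_vmstats_adj, e.g.
 *
 *	gd->gd_vmstats_adj.v_free_count -= n;	// pcpu delta, no atomic op
 *
 * and a periodic rollup (vmstats_rollup() is an assumed helper name here)
 * folds those deltas into the global structure before each cpu
 * re-snapshots it:
 *
 *	vmstats_rollup(gd);
 *	gd->gd_vmstats = vmstats;
 */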
/*
 * Return TRUE if we are under our severe low-free-pages threshold.
 *
 * This causes user processes to stall to avoid exhausting memory that
 * the kernel might need.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_severe(void)
{
        globaldata_t gd = mycpu;

        if (__predict_false(gd->gd_vmstats.v_free_severe >
                            gd->gd_vmstats.v_free_count +
                            gd->gd_vmstats.v_cache_count))
        {
                return 1;
        }
        if (__predict_false(gd->gd_vmstats.v_free_reserved >
                            gd->gd_vmstats.v_free_count))
        {
                return 1;
        }
        return 0;
}

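/*
 * Usage sketch (illustrative, not from the original header): a fault
 * path might stall the faulting user process while conditions are
 * severe, e.g.
 *
 *	while (vm_paging_severe())
 *		vm_wait_pfault();	// see vm_paging_min_nice() below
 */
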
/*
 * Return TRUE if we are under our minimum low-free-pages threshold.  We
 * will not count (donotcount) free pages as being free (used mainly for
 * hysteresis tests).
 *
 * This will cause most normal page faults to block and activate the
 * pageout daemon.
 *
 * The pageout daemon should already be active due to vm_paging_start(n)
 * and will typically continue running until it hits target2.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_min_dnc(long donotcount)
{
        globaldata_t gd = mycpu;

        if (__predict_false(gd->gd_vmstats.v_free_min + donotcount >
                            (gd->gd_vmstats.v_free_count +
                             gd->gd_vmstats.v_cache_count)))
        {
                return 1;
        }
        if (__predict_false(gd->gd_vmstats.v_free_reserved >
                            gd->gd_vmstats.v_free_count))
        {
                return 1;
        }
        return 0;
}

/*
 * Returns TRUE if the number of FREE+CACHE pages falls below vm_paging_wait.
 * Based on the nice value, the trip point can be anywhere between
 * vm_paging_min and vm_paging_wait.
 *
 * Used by vm_fault (see vm_wait_pfault()) to block a process on low-memory
 * based on the process 'nice' value (-20 to +20).
 */
static __inline
int
vm_paging_min_nice(int nice)
{
        long count;
        long delta;

        count = 0;
        if (nice) {
                delta = vmstats.v_paging_wait - vmstats.v_free_min - 1;
                delta = delta >> 1;
                if (delta > 0) {
                        /* range 0-40, 0 is high priority, 40 is low */
                        count = (nice + 20) * delta / 40;
                }
        }
        return vm_paging_min_dnc(count);
}

static __inline
int
vm_paging_min(void)
{
        return vm_paging_min_dnc(0);
}

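/*
 * Worked example (illustrative): if the computed delta above ends up at
 * 500, nice = -20 (high priority) gives count = 0 * 500 / 40 = 0 and the
 * process only blocks at the v_free_min floor, while nice = +20 (low
 * priority) gives count = 40 * 500 / 40 = 500 and blocks while FREE+CACHE
 * is still 500 pages above that floor.
 */
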
/*
 * Return TRUE if nominal userland / VM-system allocations should slow
 * down (but not stop) due to low free pages in the system.  This is
 * typically 1/2 way between min and start.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_wait(void)
{
        globaldata_t gd = mycpu;

        if (__predict_false(gd->gd_vmstats.v_paging_wait >
                            (gd->gd_vmstats.v_free_count +
                             gd->gd_vmstats.v_cache_count)))
        {
                return 1;
        }
        if (__predict_false(gd->gd_vmstats.v_free_reserved >
                            gd->gd_vmstats.v_free_count))
        {
                return 1;
        }
        return 0;
}

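/*
 * Usage sketch (illustrative; the backoff helper is hypothetical): an
 * allocation path can throttle rather than fail outright, e.g.
 *
 *	while (vm_paging_wait() && !vm_paging_severe())
 *		allocation_backoff();	// brief sleep, then re-test
 */
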
/*
 * Return TRUE if the pageout daemon should be started up or continue
 * running.  Available pages have dropped to a level where we need to
 * think about freeing some up.
 *
 * Also handles edge cases for required 'actually-free' pages.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_start(int adj)
{
        globaldata_t gd = mycpu;

        if (__predict_false(gd->gd_vmstats.v_paging_start >
                            (gd->gd_vmstats.v_free_count +
                             gd->gd_vmstats.v_cache_count + adj)))
        {
                return 1;
        }
        if (__predict_false(gd->gd_vmstats.v_free_min >
                            gd->gd_vmstats.v_free_count + adj))
        {
                return 1;
        }
        if (__predict_false(gd->gd_vmstats.v_free_reserved >
                            gd->gd_vmstats.v_free_count))
        {
                return 1;
        }
        return 0;
}

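/*
 * Usage sketch (illustrative): allocation paths typically poke the
 * pageout daemon when this trips, along the lines of
 *
 *	if (vm_paging_start(0))
 *		pagedaemon_wakeup();	// assumed wakeup hook
 */
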
/*
 * Return TRUE if the pageout daemon has not yet reached its initial target.
 * The pageout daemon works hard to reach target1.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_target1(void)
{
        globaldata_t gd = mycpu;

        if (__predict_false(gd->gd_vmstats.v_paging_target1 >
                            (gd->gd_vmstats.v_free_count +
                             gd->gd_vmstats.v_cache_count)))
        {
                return 1;
        }
        if (__predict_false(gd->gd_vmstats.v_free_reserved >
                            gd->gd_vmstats.v_free_count))
        {
                return 1;
        }
        return 0;
}

static __inline
long
vm_paging_target1_count(void)
{
        globaldata_t gd = mycpu;
        long delta;

        delta = gd->gd_vmstats.v_paging_target1 -
                (gd->gd_vmstats.v_free_count +
                 gd->gd_vmstats.v_cache_count);
        return delta;
}

/*
 * Return TRUE if the pageout daemon has not yet reached its final target.
 * The pageout daemon takes it easy on its way between target1 and target2.
 *
 * reserved < severe < minimum < wait < start < target1 < target2
 */
static __inline
int
vm_paging_target2(void)
{
        globaldata_t gd = mycpu;

        if (__predict_false(gd->gd_vmstats.v_paging_target2 >
                            (gd->gd_vmstats.v_free_count +
                             gd->gd_vmstats.v_cache_count)))
        {
                return 1;
        }
        if (__predict_false(gd->gd_vmstats.v_free_reserved >
                            gd->gd_vmstats.v_free_count))
        {
                return 1;
        }
        return 0;
}

static __inline
long
vm_paging_target2_count(void)
{
        globaldata_t gd = mycpu;
        long delta;

        delta = gd->gd_vmstats.v_paging_target2 -
                (gd->gd_vmstats.v_free_count +
                 gd->gd_vmstats.v_cache_count);
        return delta;
}

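/*
 * Illustrative note: the two _count() helpers return how many more pages
 * must reach the free+cache queues, so the pageout daemon can use them
 * directly as a scan budget (sketch):
 *
 *	long shortage = vm_paging_target1_count();
 *	if (shortage > 0)
 *		...scan and free up to 'shortage' pages aggressively...
 */
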
/*
 * Returns TRUE if additional pages must be deactivated, either during a
 * pageout operation or during the page stats scan.
 *
 * Inactive tests are used in two places.  During heavy paging the
 * inactive_target is used to refill the inactive queue in stages.
 * Those pages are then ultimately flushed and moved to the cache or free
 * queues.
 *
 * The inactive queue is also used to manage scans to update page stats
 * (m->act_count).  The page stats scan occurs lazily in small batches to
 * update m->act_count for pages in the active queue and to move pages
 * (limited by inactive_target) to the inactive queue.  Page stats scanning
 * and active deactivations only run while the inactive queue is below target.
 * After this, additional page stats scanning just to update m->act_count
 * (but not do further deactivations) continues to run for a limited period
 * of time after any pageout daemon activity.
 */
static __inline
int
vm_paging_inactive(void)
{
        globaldata_t gd = mycpu;

        if (__predict_false((gd->gd_vmstats.v_free_count +
                             gd->gd_vmstats.v_cache_count +
                             gd->gd_vmstats.v_inactive_count) <
                            (gd->gd_vmstats.v_free_min +
                             gd->gd_vmstats.v_inactive_target)))
        {
                return 1;
        }
        return 0;
}

/*
 * Return number of pages that need to be deactivated to achieve the inactive
 * target as a positive number.  A negative number indicates that there are
 * already a sufficient number of inactive pages.
 */
static __inline
long
vm_paging_inactive_count(void)
{
        globaldata_t gd = mycpu;
        long delta;

        delta = (gd->gd_vmstats.v_free_min +
                 gd->gd_vmstats.v_inactive_target) -
                (gd->gd_vmstats.v_free_count +
                 gd->gd_vmstats.v_cache_count +
                 gd->gd_vmstats.v_inactive_count);
        return delta;
}

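/*
 * Illustrative equivalence: vm_paging_inactive() is TRUE exactly when
 * vm_paging_inactive_count() is positive, so a deactivation loop can be
 * budgeted as (sketch, deactivate_one() is hypothetical):
 *
 *	long todo = vm_paging_inactive_count();
 *	while (todo > 0 && deactivate_one())
 *		--todo;
 */
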
/*
 * Clear dirty bits in the VM page but truncate the
 * end to a DEV_BSIZE'd boundary.
 *
 * Used when reading data in, typically via getpages.
 * The partial device block at the end of the truncation
 * range should not lose its dirty bit.
 *
 * NOTE: This function does not clear the pmap modified bit.
 */
static __inline
void
vm_page_clear_dirty_end_nonincl(vm_page_t m, int base, int size)
{
        size = (base + size) & ~DEV_BMASK;      /* round the end down */
        if (size > base)
                vm_page_clear_dirty(m, base, size - base);
}

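/*
 * Worked example (illustrative, assuming DEV_BSIZE = 512 so DEV_BMASK =
 * 511): base = 0, size = 1000 rounds the end down to 512, clearing dirty
 * bits for [0, 512) and leaving the partial block [512, 1000) dirty.
 */
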
/*
 * Clear dirty bits in the VM page but truncate the
 * beginning to a DEV_BSIZE'd boundary.
 *
 * Used when truncating a buffer.  The partial device
 * block at the beginning of the truncation range
 * should not lose its dirty bit.
 *
 * NOTE: This function does not clear the pmap modified bit.
 */
static __inline
void
vm_page_clear_dirty_beg_nonincl(vm_page_t m, int base, int size)
{
        size += base;                           /* size is now the end offset */
        base = (base + DEV_BMASK) & ~DEV_BMASK; /* round the base up */
        if (size > base)
                vm_page_clear_dirty(m, base, size - base);
}

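/*
 * Worked example (illustrative, assuming DEV_BSIZE = 512): base = 100,
 * size = 1948 yields an end offset of 2048; base rounds up to 512, so
 * dirty bits clear for [512, 2048) and the partial block [100, 512)
 * stays dirty.
 */
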
static __inline
void
vm_page_spin_lock(vm_page_t m)
{
        spin_lock(&m->spin);
}

static __inline
void
vm_page_spin_unlock(vm_page_t m)
{
        spin_unlock(&m->spin);
}

/*
 * Wire a vm_page that is already wired.  Does not require a busied
 * page.
 */
static __inline
void
vm_page_wire_quick(vm_page_t m)
{
        if (atomic_fetchadd_int(&m->wire_count, 1) == 0)
                panic("vm_page_wire_quick: wire_count was 0");
}

/*
 * Unwire a vm_page quickly; does not require a busied page.
 *
 * This routine refuses to drop the wire_count to 0 and will return
 * TRUE if it would have had to (instead of decrementing it to 0).
 * The caller can then busy the page and deal with it.
 */
static __inline
int
vm_page_unwire_quick(vm_page_t m)
{
        KKASSERT(m->wire_count > 0);
        for (;;) {
                u_int wire_count = m->wire_count;

                cpu_ccfence();
                if (wire_count == 1)
                        return TRUE;
                if (atomic_cmpset_int(&m->wire_count,
                                      wire_count, wire_count - 1))
                        return FALSE;
        }
}

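/*
 * Usage sketch (illustrative; the busy calls are assumed from the
 * vm_page API): a caller that gets TRUE back busies the page and
 * retires the final wiring under the busy lock, e.g.
 *
 *	if (vm_page_unwire_quick(m)) {
 *		vm_page_busy_wait(m, FALSE, "unwire");
 *		vm_page_unwire(m, 0);
 *		vm_page_wakeup(m);
 *	}
 */
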
/*
 * Functions implemented as macros
 */
static __inline void
vm_page_flag_set(vm_page_t m, unsigned int bits)
{
        atomic_set_int(&(m)->flags, bits);
}

static __inline void
vm_page_flag_clear(vm_page_t m, unsigned int bits)
{
        atomic_clear_int(&(m)->flags, bits);
}

/*
 * Wakeup anyone waiting for the page after potentially unbusying
 * (hard or soft) or doing other work on a page that might make a
 * waiter ready.  The setting of PBUSY_WANTED is integrated into the
 * related flags and it can't be set once the flags are already
 * clear, so there should be no races here.
 */
static __inline void
vm_page_flash(vm_page_t m)
{
        if (m->busy_count & PBUSY_WANTED) {
                atomic_clear_int(&m->busy_count, PBUSY_WANTED);
                wakeup(m);
        }
}

/*
 * Adjust the soft-busy count on a page.  The drop code will issue an
 * integrated wakeup if busy_count becomes 0.
 */
static __inline void
vm_page_sbusy_hold(vm_page_t m)
{
        atomic_add_int(&m->busy_count, 1);
}

static __inline void
vm_page_sbusy_drop(vm_page_t m)
{
        uint32_t ocount;

        ocount = atomic_fetchadd_int(&m->busy_count, -1);
        if (ocount - 1 == PBUSY_WANTED) {
                /* WANTED and no longer BUSY or SBUSY */
                atomic_clear_int(&m->busy_count, PBUSY_WANTED);
                wakeup(m);
        }
}

/*
 * Reduce the protection of a page.  This routine never raises the
 * protection and therefore can be safely called if the page is already
 * at VM_PROT_NONE (it is effectively a NOP).
 *
 * VM_PROT_NONE will remove all user mappings of a page.  This is often
 * necessary when a page changes state (for example, turns into a copy-on-write
 * page or needs to be frozen for write I/O) in order to force a fault, or
 * to force a page's dirty bits to be synchronized and avoid hardware
 * (modified/accessed) bit update races with pmap changes.
 *
 * Since 'prot' is usually a constant, this inline usually winds up optimizing
 * out the primary conditional.
 *
 * Must be called with (m) hard-busied.
 *
 * WARNING: VM_PROT_NONE can block, but will loop until all mappings have
 *	    been cleared.  Callers should be aware that other page related
 *	    elements might have changed, however.
 */
static __inline void
vm_page_protect(vm_page_t m, int prot)
{
        KKASSERT(m->busy_count & PBUSY_LOCKED);
        if (prot == VM_PROT_NONE) {
                if (pmap_mapped_sync(m) & (PG_MAPPED | PG_WRITEABLE)) {
                        pmap_page_protect(m, VM_PROT_NONE);
                        /* PG_WRITEABLE & PG_MAPPED cleared by call */
                }
        } else if ((prot == VM_PROT_READ) &&
                   (m->flags & PG_WRITEABLE) &&
                   (pmap_mapped_sync(m) & PG_WRITEABLE)) {
                pmap_page_protect(m, VM_PROT_READ);
                /* PG_WRITEABLE cleared by call */
        }
}

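/*
 * Usage sketch (illustrative): a typical downgrade sequence around
 * copy-on-write or write I/O, with m hard-busied per the WARNING above:
 *
 *	vm_page_protect(m, VM_PROT_READ);	// force a fault on next write
 *	...
 *	vm_page_protect(m, VM_PROT_NONE);	// freeze the page for I/O
 */
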
/*
 * Zero-fill the specified page.  The entire contents of the page will
 * be zero'd.
 */
static __inline boolean_t
vm_page_zero_fill(vm_page_t m)
{
        pmap_zero_page(VM_PAGE_TO_PHYS(m));
        return (TRUE);
}

/*
 * Copy the contents of src_m to dest_m.  The pages must be stable but spl
 * and other protections depend on context.
 */
static __inline void
vm_page_copy(vm_page_t src_m, vm_page_t dest_m)
{
        pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m));

        /*
         * The copy bypasses the pmap, so no hardware modified bit records
         * the write; mark the destination fully valid and fully dirty.
         */
        dest_m->valid = VM_PAGE_BITS_ALL;
        dest_m->dirty = VM_PAGE_BITS_ALL;
}

/*
 * Free a page.  The page must be marked BUSY.
 */
static __inline void
vm_page_free(vm_page_t m)
{
        vm_page_free_toq(m);
}

/*
 * Free a page to the zerod-pages queue.  The caller must ensure that the
 * page has been zerod.
 */
static __inline void
vm_page_free_zero(vm_page_t m)
{
#ifdef PMAP_DEBUG
#ifdef PHYS_TO_DMAP
        char *p = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
        int i;

        for (i = 0; i < PAGE_SIZE; i++) {
                if (p[i] != 0)
                        panic("non-zero page in vm_page_free_zero()");
        }
#endif
#endif
        vm_page_free_toq(m);
}

/*
 * Set the page to not be dirty.  Note: does not clear the pmap modified bit.
 */
static __inline void
vm_page_undirty(vm_page_t m)
{
        m->dirty = 0;
}

#endif	/* _KERNEL */
#endif	/* _VM_VM_PAGE2_H_ */