sys/vm/vm_page.c

   1 /*
   2  * Copyright (c) 1991 Regents of the University of California.
   3  * All rights reserved.
   4  * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
   5  *
   6  * This code is derived from software contributed to Berkeley by
   7  * The Mach Operating System project at Carnegie-Mellon University.
   8  *
   9  * This code is derived from software contributed to The DragonFly Project
  10  * by Matthew Dillon <dillon@backplane.com>
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      from: @(#)vm_page.c     7.4 (Berkeley) 5/7/91
  37  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
  38  */
  39
  40 /*
  41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  42  * All rights reserved.
  43  *
  44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  45  *
  46  * Permission to use, copy, modify and distribute this software and
  47  * its documentation is hereby granted, provided that both the copyright
  48  * notice and this permission notice appear in all copies of the
  49  * software, derivative works or modified versions, and any portions
  50  * thereof, and that both notices appear in supporting documentation.
  51  *
  52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  55  *
  56  * Carnegie Mellon requests users of this software to return to
  57  *
  58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  59  *  School of Computer Science
  60  *  Carnegie Mellon University
  61  *  Pittsburgh PA 15213-3890
  62  *
  63  * any improvements or extensions that they make and grant Carnegie the
  64  * rights to redistribute these changes.
  65  */
  66 /*
  67  * Resident memory management module.  The module manipulates 'VM pages'.
  68  * A VM page is the core building block for memory management.
  69  */
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/malloc.h>
  74 #include <sys/proc.h>
  75 #include <sys/vmmeter.h>
  76 #include <sys/vnode.h>
  77 #include <sys/kernel.h>
  78 #include <sys/alist.h>
  79 #include <sys/sysctl.h>
  80 #include <sys/cpu_topology.h>
  81
  82 #include <vm/vm.h>
  83 #include <vm/vm_param.h>
  84 #include <sys/lock.h>
  85 #include <vm/vm_kern.h>
  86 #include <vm/pmap.h>
  87 #include <vm/vm_map.h>
  88 #include <vm/vm_object.h>
  89 #include <vm/vm_page.h>
  90 #include <vm/vm_pageout.h>
  91 #include <vm/vm_pager.h>
  92 #include <vm/vm_extern.h>
  93 #include <vm/swap_pager.h>
  94
  95 #include <machine/inttypes.h>
  96 #include <machine/md_var.h>
  97 #include <machine/specialreg.h>
  98 #include <machine/bus_dma.h>
  99
 100 #include <vm/vm_page2.h>
 101 #include <sys/spinlock2.h>
 102
 103 /*
 104  * SET - Minimum required set associative size, must be a power of 2.  We
 105  *       want this to match or exceed the set-associativeness of the cpu.
 106  *
 107  * GRP - A larger set that allows bleed-over into the domains of other
 108  *       nearby cpus.  Also must be a power of 2.  Used by the page zeroing
 109  *       code to smooth things out a bit.
 110  */
 111 #define PQ_SET_ASSOC            16
 112 #define PQ_SET_ASSOC_MASK       (PQ_SET_ASSOC - 1)
 113
 114 #define PQ_GRP_ASSOC            (PQ_SET_ASSOC * 2)
 115 #define PQ_GRP_ASSOC_MASK       (PQ_GRP_ASSOC - 1)
 116
 117 static void vm_page_queue_init(void);
 118 static void vm_page_free_wakeup(void);
 119 static vm_page_t vm_page_select_cache(u_short pg_color);
 120 static vm_page_t _vm_page_list_find2(int basequeue, int index);
 121 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
 122 static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);
 123
 124 /*
 125  * Array of tailq lists
 126  */
 127 __cachealign struct vpgqueues vm_page_queues[PQ_COUNT];
 128
 129 static volatile int vm_pages_waiting;
 130 static struct alist vm_contig_alist;
 131 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
 132 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
 133
 134 static u_long vm_dma_reserved = 0;
 135 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
 136 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
 137             "Memory reserved for DMA");
 138 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
 139             &vm_contig_alist.bl_free, 0, "Memory reserved for DMA");
 140
 141 static int vm_contig_verbose = 0;
 142 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
 143
 144 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
 145              vm_pindex_t, pindex);
 146
 147 static void
 148 vm_page_queue_init(void)
 149 {
 150         int i;
 151
 152         for (i = 0; i < PQ_L2_SIZE; i++)
 153                 vm_page_queues[PQ_FREE+i].cnt_offset =
 154                         offsetof(struct vmstats, v_free_count);
 155         for (i = 0; i < PQ_L2_SIZE; i++)
 156                 vm_page_queues[PQ_CACHE+i].cnt_offset =
 157                         offsetof(struct vmstats, v_cache_count);
 158         for (i = 0; i < PQ_L2_SIZE; i++)
 159                 vm_page_queues[PQ_INACTIVE+i].cnt_offset =
 160                         offsetof(struct vmstats, v_inactive_count);
 161         for (i = 0; i < PQ_L2_SIZE; i++)
 162                 vm_page_queues[PQ_ACTIVE+i].cnt_offset =
 163                         offsetof(struct vmstats, v_active_count);
 164         for (i = 0; i < PQ_L2_SIZE; i++)
 165                 vm_page_queues[PQ_HOLD+i].cnt_offset =
 166                         offsetof(struct vmstats, v_active_count);
 167         /* PQ_NONE has no queue */
 168
 169         for (i = 0; i < PQ_COUNT; i++) {
 170                 TAILQ_INIT(&vm_page_queues[i].pl);
 171                 spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
 172         }
 173 }
 174
 175 /*
 176  * note: place in initialized data section?  Is this necessary?
 177  */
 178 vm_pindex_t first_page = 0;
 179 vm_pindex_t vm_page_array_size = 0;
 180 vm_page_t vm_page_array = NULL;
 181 vm_paddr_t vm_low_phys_reserved;
 182
 183 /*
 184  * (low level boot)
 185  *
 186  * Sets the page size, perhaps based upon the memory size.
 187  * Must be called before any use of page-size dependent functions.
 188  */
 189 void
 190 vm_set_page_size(void)
 191 {
 192         if (vmstats.v_page_size == 0)
 193                 vmstats.v_page_size = PAGE_SIZE;
 194         if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
 195                 panic("vm_set_page_size: page size not a power of two");
 196 }
 197
 198 /*
 199  * (low level boot)
 200  *
 201  * Add a new page to the freelist for use by the system.  New pages
 202  * are added to both the head and tail of the associated free page
 203  * queue in a bottom-up fashion, so both zero'd and non-zero'd page
 204  * requests pull 'recent' adds (higher physical addresses) first.
 205  *
 206  * Beware that the page zeroing daemon will also be running soon after
 207  * boot, moving pages from the head to the tail of the PQ_FREE queues.
 208  *
 209  * Must be called in a critical section.
 210  */
 211 static void
 212 vm_add_new_page(vm_paddr_t pa)
 213 {
 214         struct vpgqueues *vpq;
 215         vm_page_t m;
 216
 217         m = PHYS_TO_VM_PAGE(pa);
 218         m->phys_addr = pa;
 219         m->flags = 0;
 220         m->pat_mode = PAT_WRITE_BACK;
 221         m->pc = (pa >> PAGE_SHIFT);
 222
 223         /*
 224          * Twist for cpu localization in addition to page coloring, so
 225          * different cpus selecting by m->queue get different page colors.
 226          */
 227         m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
 228         m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
 229         m->pc &= PQ_L2_MASK;
 230
 231         /*
 232          * Reserve a certain number of contiguous low memory pages for
 233          * contigmalloc() to use.
 234          */
 235         if (pa < vm_low_phys_reserved) {
 236                 atomic_add_long(&vmstats.v_page_count, 1);
 237                 atomic_add_long(&vmstats.v_dma_pages, 1);
 238                 m->queue = PQ_NONE;
 239                 m->wire_count = 1;
 240                 atomic_add_long(&vmstats.v_wire_count, 1);
 241                 alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
 242                 return;
 243         }
 244
 245         /*
 246          * General page
 247          */
 248         m->queue = m->pc + PQ_FREE;
 249         KKASSERT(m->dirty == 0);
 250
 251         atomic_add_long(&vmstats.v_page_count, 1);
 252         atomic_add_long(&vmstats.v_free_count, 1);
 253         vpq = &vm_page_queues[m->queue];
 254         TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
 255         ++vpq->lcnt;
 256 }
 257
 258 /*
 259  * (low level boot)
 260  *
 261  * Initializes the resident memory module.
 262  *
 263  * Preallocates memory for critical VM structures and arrays prior to
 264  * kernel_map becoming available.
 265  *
 266  * Memory is allocated from (virtual2_start, virtual2_end) if available,
 267  * otherwise memory is allocated from (virtual_start, virtual_end).
 268  *
 269  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
 270  * large enough to hold vm_page_array & other structures for machines with
 271  * large amounts of ram, so we want to use virtual2* when available.
 272  */
 273 void
 274 vm_page_startup(void)
 275 {
 276         vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
 277         vm_offset_t mapped;
 278         vm_pindex_t npages;
 279         vm_paddr_t page_range;
 280         vm_paddr_t new_end;
 281         int i;
 282         vm_paddr_t pa;
 283         vm_paddr_t last_pa;
 284         vm_paddr_t end;
 285         vm_paddr_t biggestone, biggestsize;
 286         vm_paddr_t total;
 287         vm_page_t m;
 288
 289         total = 0;
 290         biggestsize = 0;
 291         biggestone = 0;
 292         vaddr = round_page(vaddr);
 293
 294         /*
 295          * Make sure ranges are page-aligned.
 296          */
 297         for (i = 0; phys_avail[i].phys_end; ++i) {
 298                 phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
 299                 phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
 300                 if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
 301                         phys_avail[i].phys_end = phys_avail[i].phys_beg;
 302         }
 303
 304         /*
 305          * Locate largest block
 306          */
 307         for (i = 0; phys_avail[i].phys_end; ++i) {
 308                 vm_paddr_t size = phys_avail[i].phys_end -
 309                                   phys_avail[i].phys_beg;
 310
 311                 if (size > biggestsize) {
 312                         biggestone = i;
 313                         biggestsize = size;
 314                 }
 315                 total += size;
 316         }
 317         --i;    /* adjust to last entry for use down below */
 318
 319         end = phys_avail[biggestone].phys_end;
 320         end = trunc_page(end);
 321
 322         /*
 323          * Initialize the queue headers for the free queue, the active queue
 324          * and the inactive queue.
 325          */
 326         vm_page_queue_init();
 327
 328 #if !defined(_KERNEL_VIRTUAL)
 329         /*
 330          * VKERNELs don't support minidumps and as such don't need
 331          * vm_page_dump
 332          *
 333          * Allocate a bitmap to indicate that a random physical page
 334          * needs to be included in a minidump.
 335          *
 336          * The amd64 port needs this to indicate which direct map pages
 337          * need to be dumped, via calls to dump_add_page()/dump_drop_page().
 338          *
 339          * However, x86 still needs this workspace internally within the
 340          * minidump code.  In theory, they are not needed on x86, but are
 341          * included should the sf_buf code decide to use them.
 342          */
 343         page_range = phys_avail[i].phys_end / PAGE_SIZE;
 344         vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
 345         end -= vm_page_dump_size;
 346         vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
 347                                         VM_PROT_READ | VM_PROT_WRITE);
 348         bzero((void *)vm_page_dump, vm_page_dump_size);
 349 #endif
 350         /*
 351          * Compute the number of pages of memory that will be available for
 352          * use (taking into account the overhead of a page structure per
 353          * page).
 354          */
 355         first_page = phys_avail[0].phys_beg / PAGE_SIZE;
 356         page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
 357         npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
 358
 359 #ifndef _KERNEL_VIRTUAL
 360         /*
 361          * (only applies to real kernels)
 362          *
 363          * Reserve a large amount of low memory for potential 32-bit DMA
 364          * space allocations.  Once device initialization is complete we
 365          * release most of it, but keep (vm_dma_reserved) memory reserved
 366          * for later use.  Typically for X / graphics.  Through trial and
 367          * error we find that GPUs usually requires ~60-100MB or so.
 368          *
 369          * By default, 128M is left in reserve on machines with 2G+ of ram.
 370          */
 371         vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
 372         if (vm_low_phys_reserved > total / 4)
 373                 vm_low_phys_reserved = total / 4;
 374         if (vm_dma_reserved == 0) {
 375                 vm_dma_reserved = 128 * 1024 * 1024;    /* 128MB */
 376                 if (vm_dma_reserved > total / 16)
 377                         vm_dma_reserved = total / 16;
 378         }
 379 #endif
 380         alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
 381                    ALIST_RECORDS_65536);
 382
 383         /*
 384          * Initialize the mem entry structures now, and put them in the free
 385          * queue.
 386          */
 387         if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
 388                 kprintf("initializing vm_page_array ");
 389         new_end = trunc_page(end - page_range * sizeof(struct vm_page));
 390         mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
 391         vm_page_array = (vm_page_t)mapped;
 392
 393 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
 394         /*
 395          * since pmap_map on amd64 returns stuff out of a direct-map region,
 396          * we have to manually add these pages to the minidump tracking so
 397          * that they can be dumped, including the vm_page_array.
 398          */
 399         for (pa = new_end;
 400              pa < phys_avail[biggestone].phys_end;
 401              pa += PAGE_SIZE) {
 402                 dump_add_page(pa);
 403         }
 404 #endif
 405
 406         /*
 407          * Clear all of the page structures, run basic initialization so
 408          * PHYS_TO_VM_PAGE() operates properly even on pages not in the
 409          * map.
 410          */
 411         bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
 412         vm_page_array_size = page_range;
 413         if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
 414                 kprintf("size = 0x%zx\n", vm_page_array_size);
 415
 416         m = &vm_page_array[0];
 417         pa = ptoa(first_page);
 418         for (i = 0; i < page_range; ++i) {
 419                 spin_init(&m->spin, "vm_page");
 420                 m->phys_addr = pa;
 421                 pa += PAGE_SIZE;
 422                 ++m;
 423         }
 424
 425         /*
 426          * Construct the free queue(s) in ascending order (by physical
 427          * address) so that the first 16MB of physical memory is allocated
 428          * last rather than first.  On large-memory machines, this avoids
 429          * the exhaustion of low physical memory before isa_dma_init has run.
 430          */
 431         vmstats.v_page_count = 0;
 432         vmstats.v_free_count = 0;
 433         for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
 434                 pa = phys_avail[i].phys_beg;
 435                 if (i == biggestone)
 436                         last_pa = new_end;
 437                 else
 438                         last_pa = phys_avail[i].phys_end;
 439                 while (pa < last_pa && npages-- > 0) {
 440                         vm_add_new_page(pa);
 441                         pa += PAGE_SIZE;
 442                 }
 443         }
 444         if (virtual2_start)
 445                 virtual2_start = vaddr;
 446         else
 447                 virtual_start = vaddr;
 448         mycpu->gd_vmstats = vmstats;
 449 }
 450
 451 /*
 452  * Reorganize VM pages based on numa data.  May be called as many times as
 453  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
 454  * to allow vm_page_alloc() to choose pages based on socket affinity.
 455  *
 456  * NOTE: This function is only called while we are still in UP mode, so
 457  *       we only need a critical section to protect the queues (which
 458  *       saves a lot of time, there are likely a ton of pages).
 459  */
 460 void
 461 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
 462 {
 463         vm_paddr_t scan_beg;
 464         vm_paddr_t scan_end;
 465         vm_paddr_t ran_end;
 466         struct vpgqueues *vpq;
 467         vm_page_t m;
 468         vm_page_t mend;
 469         int i;
 470         int socket_mod;
 471         int socket_value;
 472
 473         /*
 474          * Check if no physical information, or there was only one socket
 475          * (so don't waste time doing nothing!).
 476          */
 477         if (cpu_topology_phys_ids <= 1 ||
 478             cpu_topology_core_ids == 0) {
 479                 return;
 480         }
 481
 482         /*
 483          * Setup for our iteration.  Note that ACPI may iterate CPU
 484          * sockets starting at 0 or 1 or some other number.  The
 485          * cpu_topology code mod's it against the socket count.
 486          */
 487         ran_end = ran_beg + bytes;
 488
 489         socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
 490         socket_value = (physid % cpu_topology_phys_ids) * socket_mod;
 491         mend = &vm_page_array[vm_page_array_size];
 492
 493         crit_enter();
 494
 495         /*
 496          * Adjust cpu_topology's phys_mem parameter
 497          */
 498         if (root_cpu_node)
 499                 vm_numa_add_topology_mem(root_cpu_node, physid, (long)bytes);
 500
 501         /*
 502          * Adjust vm_page->pc and requeue all affected pages.  The
 503          * allocator will then be able to localize memory allocations
 504          * to some degree.
 505          */
 506         for (i = 0; phys_avail[i].phys_end; ++i) {
 507                 scan_beg = phys_avail[i].phys_beg;
 508                 scan_end = phys_avail[i].phys_end;
 509                 if (scan_end <= ran_beg)
 510                         continue;
 511                 if (scan_beg >= ran_end)
 512                         continue;
 513                 if (scan_beg < ran_beg)
 514                         scan_beg = ran_beg;
 515                 if (scan_end > ran_end)
 516                         scan_end = ran_end;
 517                 if (atop(scan_end) > first_page + vm_page_array_size)
 518                         scan_end = ptoa(first_page + vm_page_array_size);
 519
 520                 m = PHYS_TO_VM_PAGE(scan_beg);
 521                 while (scan_beg < scan_end) {
 522                         KKASSERT(m < mend);
 523                         if (m->queue != PQ_NONE) {
 524                                 vpq = &vm_page_queues[m->queue];
 525                                 TAILQ_REMOVE(&vpq->pl, m, pageq);
 526                                 --vpq->lcnt;
 527                                 /* queue doesn't change, no need to adj cnt */
 528                                 m->queue -= m->pc;
 529                                 m->pc %= socket_mod;
 530                                 m->pc += socket_value;
 531                                 m->pc &= PQ_L2_MASK;
 532                                 m->queue += m->pc;
 533                                 vpq = &vm_page_queues[m->queue];
 534                                 TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
 535                                 ++vpq->lcnt;
 536                                 /* queue doesn't change, no need to adj cnt */
 537                         } else {
 538                                 m->pc %= socket_mod;
 539                                 m->pc += socket_value;
 540                                 m->pc &= PQ_L2_MASK;
 541                         }
 542                         scan_beg += PAGE_SIZE;
 543                         ++m;
 544                 }
 545         }
 546         crit_exit();
 547 }
 548
 549 static
 550 void
 551 vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes)
 552 {
 553         int cpuid;
 554         int i;
 555
 556         switch(cpup->type) {
 557         case PACKAGE_LEVEL:
 558                 cpup->phys_mem += bytes;
 559                 break;
 560         case CHIP_LEVEL:
 561                 /*
 562                  * All members should have the same chipid, so we only need
 563                  * to pull out one member.
 564                  */
 565                 if (CPUMASK_TESTNZERO(cpup->members)) {
 566                         cpuid = BSFCPUMASK(cpup->members);
 567                         if (physid ==
 568                             get_chip_ID_from_APICID(CPUID_TO_APICID(cpuid))) {
 569                                 cpup->phys_mem += bytes;
 570                         }
 571                 }
 572                 break;
 573         case CORE_LEVEL:
 574         case THREAD_LEVEL:
 575                 /*
 576                  * Just inherit from the parent node
 577                  */
 578                 cpup->phys_mem = cpup->parent_node->phys_mem;
 579                 break;
 580         }
 581         for (i = 0; i < MAXCPU && cpup->child_node[i]; ++i)
 582                 vm_numa_add_topology_mem(cpup->child_node[i], physid, bytes);
 583 }
 584
 585 /*
 586  * We tended to reserve a ton of memory for contigmalloc().  Now that most
 587  * drivers have initialized we want to return most the remaining free
 588  * reserve back to the VM page queues so they can be used for normal
 589  * allocations.
 590  *
 591  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
 592  */
 593 static void
 594 vm_page_startup_finish(void *dummy __unused)
 595 {
 596         alist_blk_t blk;
 597         alist_blk_t rblk;
 598         alist_blk_t count;
 599         alist_blk_t xcount;
 600         alist_blk_t bfree;
 601         vm_page_t m;
 602
 603         spin_lock(&vm_contig_spin);
 604         for (;;) {
 605                 bfree = alist_free_info(&vm_contig_alist, &blk, &count);
 606                 if (bfree <= vm_dma_reserved / PAGE_SIZE)
 607                         break;
 608                 if (count == 0)
 609                         break;
 610
 611                 /*
 612                  * Figure out how much of the initial reserve we have to
 613                  * free in order to reach our target.
 614                  */
 615                 bfree -= vm_dma_reserved / PAGE_SIZE;
 616                 if (count > bfree) {
 617                         blk += count - bfree;
 618                         count = bfree;
 619                 }
 620
 621                 /*
 622                  * Calculate the nearest power of 2 <= count.
 623                  */
 624                 for (xcount = 1; xcount <= count; xcount <<= 1)
 625                         ;
 626                 xcount >>= 1;
 627                 blk += count - xcount;
 628                 count = xcount;
 629
 630                 /*
 631                  * Allocate the pages from the alist, then free them to
 632                  * the normal VM page queues.
 633                  *
 634                  * Pages allocated from the alist are wired.  We have to
 635                  * busy, unwire, and free them.  We must also adjust
 636                  * vm_low_phys_reserved before freeing any pages to prevent
 637                  * confusion.
 638                  */
 639                 rblk = alist_alloc(&vm_contig_alist, blk, count);
 640                 if (rblk != blk) {
 641                         kprintf("vm_page_startup_finish: Unable to return "
 642                                 "dma space @0x%08x/%d -> 0x%08x\n",
 643                                 blk, count, rblk);
 644                         break;
 645                 }
 646                 atomic_add_long(&vmstats.v_dma_pages, -(long)count);
 647                 spin_unlock(&vm_contig_spin);
 648
 649                 m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
 650                 vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
 651                 while (count) {
 652                         vm_page_busy_wait(m, FALSE, "cpgfr");
 653                         vm_page_unwire(m, 0);
 654                         vm_page_free(m);
 655                         --count;
 656                         ++m;
 657                 }
 658                 spin_lock(&vm_contig_spin);
 659         }
 660         spin_unlock(&vm_contig_spin);
 661
 662         /*
 663          * Print out how much DMA space drivers have already allocated and
 664          * how much is left over.
 665          */
 666         kprintf("DMA space used: %jdk, remaining available: %jdk\n",
 667                 (intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
 668                 (PAGE_SIZE / 1024),
 669                 (intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
 670 }
 671 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
 672         vm_page_startup_finish, NULL);
 673
 674
 675 /*
 676  * Scan comparison function for Red-Black tree scans.  An inclusive
 677  * (start,end) is expected.  Other fields are not used.
 678  */
 679 int
 680 rb_vm_page_scancmp(struct vm_page *p, void *data)
 681 {
 682         struct rb_vm_page_scan_info *info = data;
 683
 684         if (p->pindex < info->start_pindex)
 685                 return(-1);
 686         if (p->pindex > info->end_pindex)
 687                 return(1);
 688         return(0);
 689 }
 690
 691 int
 692 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
 693 {
 694         if (p1->pindex < p2->pindex)
 695                 return(-1);
 696         if (p1->pindex > p2->pindex)
 697                 return(1);
 698         return(0);
 699 }
 700
 701 void
 702 vm_page_init(vm_page_t m)
 703 {
 704         /* do nothing for now.  Called from pmap_page_init() */
 705 }
 706
 707 /*
 708  * Each page queue has its own spin lock, which is fairly optimal for
 709  * allocating and freeing pages at least.
 710  *
 711  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
 712  * queue spinlock via this function.  Also note that m->queue cannot change
 713  * unless both the page and queue are locked.
 714  */
 715 static __inline
 716 void
 717 _vm_page_queue_spin_lock(vm_page_t m)
 718 {
 719         u_short queue;
 720
 721         queue = m->queue;
 722         if (queue != PQ_NONE) {
 723                 spin_lock(&vm_page_queues[queue].spin);
 724                 KKASSERT(queue == m->queue);
 725         }
 726 }
 727
 728 static __inline
 729 void
 730 _vm_page_queue_spin_unlock(vm_page_t m)
 731 {
 732         u_short queue;
 733
 734         queue = m->queue;
 735         cpu_ccfence();
 736         if (queue != PQ_NONE)
 737                 spin_unlock(&vm_page_queues[queue].spin);
 738 }
 739
 740 static __inline
 741 void
 742 _vm_page_queues_spin_lock(u_short queue)
 743 {
 744         cpu_ccfence();
 745         if (queue != PQ_NONE)
 746                 spin_lock(&vm_page_queues[queue].spin);
 747 }
 748
 749
 750 static __inline
 751 void
 752 _vm_page_queues_spin_unlock(u_short queue)
 753 {
 754         cpu_ccfence();
 755         if (queue != PQ_NONE)
 756                 spin_unlock(&vm_page_queues[queue].spin);
 757 }
 758
 759 void
 760 vm_page_queue_spin_lock(vm_page_t m)
 761 {
 762         _vm_page_queue_spin_lock(m);
 763 }
 764
 765 void
 766 vm_page_queues_spin_lock(u_short queue)
 767 {
 768         _vm_page_queues_spin_lock(queue);
 769 }
 770
 771 void
 772 vm_page_queue_spin_unlock(vm_page_t m)
 773 {
 774         _vm_page_queue_spin_unlock(m);
 775 }
 776
 777 void
 778 vm_page_queues_spin_unlock(u_short queue)
 779 {
 780         _vm_page_queues_spin_unlock(queue);
 781 }
 782
 783 /*
 784  * This locks the specified vm_page and its queue in the proper order
 785  * (page first, then queue).  The queue may change so the caller must
 786  * recheck on return.
 787  */
 788 static __inline
 789 void
 790 _vm_page_and_queue_spin_lock(vm_page_t m)
 791 {
 792         vm_page_spin_lock(m);
 793         _vm_page_queue_spin_lock(m);
 794 }
 795
 796 static __inline
 797 void
 798 _vm_page_and_queue_spin_unlock(vm_page_t m)
 799 {
 800         _vm_page_queues_spin_unlock(m->queue);
 801         vm_page_spin_unlock(m);
 802 }
 803
 804 void
 805 vm_page_and_queue_spin_unlock(vm_page_t m)
 806 {
 807         _vm_page_and_queue_spin_unlock(m);
 808 }
 809
 810 void
 811 vm_page_and_queue_spin_lock(vm_page_t m)
 812 {
 813         _vm_page_and_queue_spin_lock(m);
 814 }
 815
 816 /*
 817  * Helper function removes vm_page from its current queue.
 818  * Returns the base queue the page used to be on.
 819  *
 820  * The vm_page and the queue must be spinlocked.
 821  * This function will unlock the queue but leave the page spinlocked.
 822  */
 823 static __inline u_short
 824 _vm_page_rem_queue_spinlocked(vm_page_t m)
 825 {
 826         struct vpgqueues *pq;
 827         u_short queue;
 828         u_short oqueue;
 829         long *cnt;
 830
 831         queue = m->queue;
 832         if (queue != PQ_NONE) {
 833                 pq = &vm_page_queues[queue];
 834                 TAILQ_REMOVE(&pq->pl, m, pageq);
 835
 836                 /*
 837                  * Adjust our pcpu stats.  In order for the nominal low-memory
 838                  * algorithms to work properly we don't let any pcpu stat get
 839                  * too negative before we force it to be rolled-up into the
 840                  * global stats.  Otherwise our pageout and vm_wait tests
 841                  * will fail badly.
 842                  *
 843                  * The idea here is to reduce unnecessary SMP cache
 844                  * mastership changes in the global vmstats, which can be
 845                  * particularly bad in multi-socket systems.
 846                  */
 847                 cnt = (long *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
 848                 atomic_add_long(cnt, -1);
 849                 if (*cnt < -VMMETER_SLOP_COUNT) {
 850                         u_long copy = atomic_swap_long(cnt, 0);
 851                         cnt = (long *)((char *)&vmstats + pq->cnt_offset);
 852                         atomic_add_long(cnt, copy);
 853                         cnt = (long *)((char *)&mycpu->gd_vmstats +
 854                                       pq->cnt_offset);
 855                         atomic_add_long(cnt, copy);
 856                 }
 857                 pq->lcnt--;
 858                 m->queue = PQ_NONE;
 859                 oqueue = queue;
 860                 queue -= m->pc;
 861                 vm_page_queues_spin_unlock(oqueue);     /* intended */
 862         }
 863         return queue;
 864 }
 865
 866 /*
 867  * Helper function places the vm_page on the specified queue.  Generally
 868  * speaking only PQ_FREE pages are placed at the head, to allow them to
 869  * be allocated sooner rather than later on the assumption that they
 870  * are cache-hot.
 871  *
 872  * The vm_page must be spinlocked.
 873  * This function will return with both the page and the queue locked.
 874  */
 875 static __inline void
 876 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
 877 {
 878         struct vpgqueues *pq;
 879         u_long *cnt;
 880
 881         KKASSERT(m->queue == PQ_NONE);
 882
 883         if (queue != PQ_NONE) {
 884                 vm_page_queues_spin_lock(queue);
 885                 pq = &vm_page_queues[queue];
 886                 ++pq->lcnt;
 887
 888                 /*
 889                  * Adjust our pcpu stats.  If a system entity really needs
 890                  * to incorporate the count it will call vmstats_rollup()
 891                  * to roll it all up into the global vmstats strufture.
 892                  */
 893                 cnt = (long *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
 894                 atomic_add_long(cnt, 1);
 895
 896                 /*
 897                  * PQ_FREE is always handled LIFO style to try to provide
 898                  * cache-hot pages to programs.
 899                  */
 900                 m->queue = queue;
 901                 if (queue - m->pc == PQ_FREE) {
 902                         TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
 903                 } else if (athead) {
 904                         TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
 905                 } else {
 906                         TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
 907                 }
 908                 /* leave the queue spinlocked */
 909         }
 910 }
 911
 912 /*
 913  * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
 914  * until the page is no longer BUSY or SBUSY (busy_count field is 0).
 915  *
 916  * Returns TRUE if it had to sleep, FALSE if we did not.  Only one sleep
 917  * call will be made before returning.
 918  *
 919  * This function does NOT busy the page and on return the page is not
 920  * guaranteed to be available.
 921  */
 922 void
 923 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
 924 {
 925         u_int32_t busy_count;
 926
 927         for (;;) {
 928                 busy_count = m->busy_count;
 929                 cpu_ccfence();
 930
 931                 if ((busy_count & PBUSY_LOCKED) == 0 &&
 932                     (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
 933                         break;
 934                 }
 935                 tsleep_interlock(m, 0);
 936                 if (atomic_cmpset_int(&m->busy_count, busy_count,
 937                                       busy_count | PBUSY_WANTED)) {
 938                         atomic_set_int(&m->flags, PG_REFERENCED);
 939                         tsleep(m, PINTERLOCKED, msg, 0);
 940                         break;
 941                 }
 942         }
 943 }
 944
 945 /*
 946  * This calculates and returns a page color given an optional VM object and
 947  * either a pindex or an iterator.  We attempt to return a cpu-localized
 948  * pg_color that is still roughly 16-way set-associative.  The CPU topology
 949  * is used if it was probed.
 950  *
 951  * The caller may use the returned value to index into e.g. PQ_FREE when
 952  * allocating a page in order to nominally obtain pages that are hopefully
 953  * already localized to the requesting cpu.  This function is not able to
 954  * provide any sort of guarantee of this, but does its best to improve
 955  * hardware cache management performance.
 956  *
 957  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
 958  */
 959 u_short
 960 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
 961 {
 962         u_short pg_color;
 963         int phys_id;
 964         int core_id;
 965         int object_pg_color;
 966
 967         phys_id = get_cpu_phys_id(cpuid);
 968         core_id = get_cpu_core_id(cpuid);
 969         object_pg_color = object ? object->pg_color : 0;
 970
 971         if (cpu_topology_phys_ids && cpu_topology_core_ids) {
 972                 int grpsize;
 973
 974                 /*
 975                  * Break us down by socket and cpu
 976                  */
 977                 pg_color = phys_id * PQ_L2_SIZE / cpu_topology_phys_ids;
 978                 pg_color += core_id * PQ_L2_SIZE /
 979                             (cpu_topology_core_ids * cpu_topology_phys_ids);
 980
 981                 /*
 982                  * Calculate remaining component for object/queue color
 983                  */
 984                 grpsize = PQ_L2_SIZE / (cpu_topology_core_ids *
 985                                         cpu_topology_phys_ids);
 986                 if (grpsize >= 8) {
 987                         pg_color += (pindex + object_pg_color) % grpsize;
 988                 } else {
 989                         if (grpsize <= 2) {
 990                                 grpsize = 8;
 991                         } else {
 992                                 /* 3->9, 4->8, 5->10, 6->12, 7->14 */
 993                                 grpsize += grpsize;
 994                                 if (grpsize < 8)
 995                                         grpsize += grpsize;
 996                         }
 997                         pg_color += (pindex + object_pg_color) % grpsize;
 998                 }
 999         } else {
1000                 /*
1001                  * Unknown topology, distribute things evenly.
1002                  */
1003                 pg_color = cpuid * PQ_L2_SIZE / ncpus;
1004                 pg_color += pindex + object_pg_color;
1005         }
1006         return (pg_color & PQ_L2_MASK);
1007 }
1008
1009 /*
1010  * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
1011  * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
1012  */
1013 void
1014 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
1015                                      int also_m_busy, const char *msg
1016                                      VM_PAGE_DEBUG_ARGS)
1017 {
1018         u_int32_t busy_count;
1019
1020         for (;;) {
1021                 busy_count = m->busy_count;
1022                 cpu_ccfence();
1023                 if (busy_count & PBUSY_LOCKED) {
1024                         tsleep_interlock(m, 0);
1025                         if (atomic_cmpset_int(&m->busy_count, busy_count,
1026                                           busy_count | PBUSY_WANTED)) {
1027                                 atomic_set_int(&m->flags, PG_REFERENCED);
1028                                 tsleep(m, PINTERLOCKED, msg, 0);
1029                         }
1030                 } else if (also_m_busy && busy_count) {
1031                         tsleep_interlock(m, 0);
1032                         if (atomic_cmpset_int(&m->busy_count, busy_count,
1033                                           busy_count | PBUSY_WANTED)) {
1034                                 atomic_set_int(&m->flags, PG_REFERENCED);
1035                                 tsleep(m, PINTERLOCKED, msg, 0);
1036                         }
1037                 } else {
1038                         if (atomic_cmpset_int(&m->busy_count, busy_count,
1039                                               busy_count | PBUSY_LOCKED)) {
1040 #ifdef VM_PAGE_DEBUG
1041                                 m->busy_func = func;
1042                                 m->busy_line = lineno;
1043 #endif
1044                                 break;
1045                         }
1046                 }
1047         }
1048 }
1049
1050 /*
1051  * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
1052  * m->busy_count is also 0.
1053  *
1054  * Returns non-zero on failure.
1055  */
1056 int
1057 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1058                                     VM_PAGE_DEBUG_ARGS)
1059 {
1060         u_int32_t busy_count;
1061
1062         for (;;) {
1063                 busy_count = m->busy_count;
1064                 cpu_ccfence();
1065                 if (busy_count & PBUSY_LOCKED)
1066                         return TRUE;
1067                 if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
1068                         return TRUE;
1069                 if (atomic_cmpset_int(&m->busy_count, busy_count,
1070                                       busy_count | PBUSY_LOCKED)) {
1071 #ifdef VM_PAGE_DEBUG
1072                                 m->busy_func = func;
1073                                 m->busy_line = lineno;
1074 #endif
1075                         return FALSE;
1076                 }
1077         }
1078 }
1079
1080 /*
1081  * Clear the BUSY flag and return non-zero to indicate to the caller
1082  * that a wakeup() should be performed.
1083  *
1084  * The vm_page must be spinlocked and will remain spinlocked on return.
1085  * The related queue must NOT be spinlocked (which could deadlock us).
1086  *
1087  * (inline version)
1088  */
1089 static __inline
1090 int
1091 _vm_page_wakeup(vm_page_t m)
1092 {
1093         u_int32_t busy_count;
1094
1095         for (;;) {
1096                 busy_count = m->busy_count;
1097                 cpu_ccfence();
1098                 if (atomic_cmpset_int(&m->busy_count, busy_count,
1099                                       busy_count &
1100                                       ~(PBUSY_LOCKED | PBUSY_WANTED))) {
1101                         break;
1102                 }
1103         }
1104         return((int)(busy_count & PBUSY_WANTED));
1105 }
1106
1107 /*
1108  * Clear the BUSY flag and wakeup anyone waiting for the page.  This
1109  * is typically the last call you make on a page before moving onto
1110  * other things.
1111  */
1112 void
1113 vm_page_wakeup(vm_page_t m)
1114 {
1115         KASSERT(m->busy_count & PBUSY_LOCKED,
1116                 ("vm_page_wakeup: page not busy!!!"));
1117         vm_page_spin_lock(m);
1118         if (_vm_page_wakeup(m)) {
1119                 vm_page_spin_unlock(m);
1120                 wakeup(m);
1121         } else {
1122                 vm_page_spin_unlock(m);
1123         }
1124 }
1125
1126 /*
1127  * Holding a page keeps it from being reused.  Other parts of the system
1128  * can still disassociate the page from its current object and free it, or
1129  * perform read or write I/O on it and/or otherwise manipulate the page,
1130  * but if the page is held the VM system will leave the page and its data
1131  * intact and not reuse the page for other purposes until the last hold
1132  * reference is released.  (see vm_page_wire() if you want to prevent the
1133  * page from being disassociated from its object too).
1134  *
1135  * The caller must still validate the contents of the page and, if necessary,
1136  * wait for any pending I/O (e.g. vm_page_sleep_busy() loop) to complete
1137  * before manipulating the page.
1138  *
1139  * XXX get vm_page_spin_lock() here and move FREE->HOLD if necessary
1140  */
1141 void
1142 vm_page_hold(vm_page_t m)
1143 {
1144         vm_page_spin_lock(m);
1145         atomic_add_int(&m->hold_count, 1);
1146         if (m->queue - m->pc == PQ_FREE) {
1147                 _vm_page_queue_spin_lock(m);
1148                 _vm_page_rem_queue_spinlocked(m);
1149                 _vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
1150                 _vm_page_queue_spin_unlock(m);
1151         }
1152         vm_page_spin_unlock(m);
1153 }
1154
1155 /*
1156  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1157  * it was freed while held and must be moved back to the FREE queue.
1158  */
1159 void
1160 vm_page_unhold(vm_page_t m)
1161 {
1162         KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1163                 ("vm_page_unhold: pg %p illegal hold_count (%d) or on FREE queue (%d)",
1164                  m, m->hold_count, m->queue - m->pc));
1165         vm_page_spin_lock(m);
1166         atomic_add_int(&m->hold_count, -1);
1167         if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1168                 _vm_page_queue_spin_lock(m);
1169                 _vm_page_rem_queue_spinlocked(m);
1170                 _vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1171                 _vm_page_queue_spin_unlock(m);
1172         }
1173         vm_page_spin_unlock(m);
1174 }
1175
1176 /*
1177  *      vm_page_getfake:
1178  *
1179  *      Create a fictitious page with the specified physical address and
1180  *      memory attribute.  The memory attribute is the only the machine-
1181  *      dependent aspect of a fictitious page that must be initialized.
1182  */
1183
1184 void
1185 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1186 {
1187
1188         if ((m->flags & PG_FICTITIOUS) != 0) {
1189                 /*
1190                  * The page's memattr might have changed since the
1191                  * previous initialization.  Update the pmap to the
1192                  * new memattr.
1193                  */
1194                 goto memattr;
1195         }
1196         m->phys_addr = paddr;
1197         m->queue = PQ_NONE;
1198         /* Fictitious pages don't use "segind". */
1199         /* Fictitious pages don't use "order" or "pool". */
1200         m->flags = PG_FICTITIOUS | PG_UNMANAGED;
1201         m->busy_count = PBUSY_LOCKED;
1202         m->wire_count = 1;
1203         spin_init(&m->spin, "fake_page");
1204         pmap_page_init(m);
1205 memattr:
1206         pmap_page_set_memattr(m, memattr);
1207 }
1208
1209 /*
1210  * Inserts the given vm_page into the object and object list.
1211  *
1212  * The pagetables are not updated but will presumably fault the page
1213  * in if necessary, or if a kernel page the caller will at some point
1214  * enter the page into the kernel's pmap.  We are not allowed to block
1215  * here so we *can't* do this anyway.
1216  *
1217  * This routine may not block.
1218  * This routine must be called with the vm_object held.
1219  * This routine must be called with a critical section held.
1220  *
1221  * This routine returns TRUE if the page was inserted into the object
1222  * successfully, and FALSE if the page already exists in the object.
1223  */
1224 int
1225 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1226 {
1227         ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1228         if (m->object != NULL)
1229                 panic("vm_page_insert: already inserted");
1230
1231         atomic_add_int(&object->generation, 1);
1232
1233         /*
1234          * Record the object/offset pair in this page and add the
1235          * pv_list_count of the page to the object.
1236          *
1237          * The vm_page spin lock is required for interactions with the pmap.
1238          */
1239         vm_page_spin_lock(m);
1240         m->object = object;
1241         m->pindex = pindex;
1242         if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1243                 m->object = NULL;
1244                 m->pindex = 0;
1245                 vm_page_spin_unlock(m);
1246                 return FALSE;
1247         }
1248         ++object->resident_page_count;
1249         ++mycpu->gd_vmtotal.t_rm;
1250         vm_page_spin_unlock(m);
1251
1252         /*
1253          * Since we are inserting a new and possibly dirty page,
1254          * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1255          */
1256         if ((m->valid & m->dirty) ||
1257             (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1258                 vm_object_set_writeable_dirty(object);
1259
1260         /*
1261          * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1262          */
1263         swap_pager_page_inserted(m);
1264         return TRUE;
1265 }
1266
1267 /*
1268  * Removes the given vm_page_t from the (object,index) table
1269  *
1270  * The underlying pmap entry (if any) is NOT removed here.
1271  * This routine may not block.
1272  *
1273  * The page must be BUSY and will remain BUSY on return.
1274  * No other requirements.
1275  *
1276  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1277  *       it busy.
1278  */
1279 void
1280 vm_page_remove(vm_page_t m)
1281 {
1282         vm_object_t object;
1283
1284         if (m->object == NULL) {
1285                 return;
1286         }
1287
1288         if ((m->busy_count & PBUSY_LOCKED) == 0)
1289                 panic("vm_page_remove: page not busy");
1290
1291         object = m->object;
1292
1293         vm_object_hold(object);
1294
1295         /*
1296          * Remove the page from the object and update the object.
1297          *
1298          * The vm_page spin lock is required for interactions with the pmap.
1299          */
1300         vm_page_spin_lock(m);
1301         vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1302         --object->resident_page_count;
1303         --mycpu->gd_vmtotal.t_rm;
1304         m->object = NULL;
1305         atomic_add_int(&object->generation, 1);
1306         vm_page_spin_unlock(m);
1307
1308         vm_object_drop(object);
1309 }
1310
1311 /*
1312  * Locate and return the page at (object, pindex), or NULL if the
1313  * page could not be found.
1314  *
1315  * The caller must hold the vm_object token.
1316  */
1317 vm_page_t
1318 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1319 {
1320         vm_page_t m;
1321
1322         /*
1323          * Search the hash table for this object/offset pair
1324          */
1325         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1326         m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1327         KKASSERT(m == NULL || (m->object == object && m->pindex == pindex));
1328         return(m);
1329 }
1330
1331 vm_page_t
1332 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1333                                             vm_pindex_t pindex,
1334                                             int also_m_busy, const char *msg
1335                                             VM_PAGE_DEBUG_ARGS)
1336 {
1337         u_int32_t busy_count;
1338         vm_page_t m;
1339
1340         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1341         m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1342         while (m) {
1343                 KKASSERT(m->object == object && m->pindex == pindex);
1344                 busy_count = m->busy_count;
1345                 cpu_ccfence();
1346                 if (busy_count & PBUSY_LOCKED) {
1347                         tsleep_interlock(m, 0);
1348                         if (atomic_cmpset_int(&m->busy_count, busy_count,
1349                                           busy_count | PBUSY_WANTED)) {
1350                                 atomic_set_int(&m->flags, PG_REFERENCED);
1351                                 tsleep(m, PINTERLOCKED, msg, 0);
1352                                 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1353                                                               pindex);
1354                         }
1355                 } else if (also_m_busy && busy_count) {
1356                         tsleep_interlock(m, 0);
1357                         if (atomic_cmpset_int(&m->busy_count, busy_count,
1358                                           busy_count | PBUSY_WANTED)) {
1359                                 atomic_set_int(&m->flags, PG_REFERENCED);
1360                                 tsleep(m, PINTERLOCKED, msg, 0);
1361                                 m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1362                                                               pindex);
1363                         }
1364                 } else if (atomic_cmpset_int(&m->busy_count, busy_count,
1365                                              busy_count | PBUSY_LOCKED)) {
1366 #ifdef VM_PAGE_DEBUG
1367                         m->busy_func = func;
1368                         m->busy_line = lineno;
1369 #endif
1370                         break;
1371                 }
1372         }
1373         return m;
1374 }
1375
1376 /*
1377  * Attempt to lookup and busy a page.
1378  *
1379  * Returns NULL if the page could not be found
1380  *
1381  * Returns a vm_page and error == TRUE if the page exists but could not
1382  * be busied.
1383  *
1384  * Returns a vm_page and error == FALSE on success.
1385  */
1386 vm_page_t
1387 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1388                                            vm_pindex_t pindex,
1389                                            int also_m_busy, int *errorp
1390                                            VM_PAGE_DEBUG_ARGS)
1391 {
1392         u_int32_t busy_count;
1393         vm_page_t m;
1394
1395         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1396         m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1397         *errorp = FALSE;
1398         while (m) {
1399                 KKASSERT(m->object == object && m->pindex == pindex);
1400                 busy_count = m->busy_count;
1401                 cpu_ccfence();
1402                 if (busy_count & PBUSY_LOCKED) {
1403                         *errorp = TRUE;
1404                         break;
1405                 }
1406                 if (also_m_busy && busy_count) {
1407                         *errorp = TRUE;
1408                         break;
1409                 }
1410                 if (atomic_cmpset_int(&m->busy_count, busy_count,
1411                                       busy_count | PBUSY_LOCKED)) {
1412 #ifdef VM_PAGE_DEBUG
1413                         m->busy_func = func;
1414                         m->busy_line = lineno;
1415 #endif
1416                         break;
1417                 }
1418         }
1419         return m;
1420 }
1421
1422 /*
1423  * Returns a page that is only soft-busied for use by the caller in
1424  * a read-only fashion.  Returns NULL if the page could not be found,
1425  * the soft busy could not be obtained, or the page data is invalid.
1426  */
1427 vm_page_t
1428 vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
1429                          int pgoff, int pgbytes)
1430 {
1431         vm_page_t m;
1432
1433         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1434         m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1435         if (m) {
1436                 if ((m->valid != VM_PAGE_BITS_ALL &&
1437                      !vm_page_is_valid(m, pgoff, pgbytes)) ||
1438                     (m->flags & PG_FICTITIOUS)) {
1439                         m = NULL;
1440                 } else if (vm_page_sbusy_try(m)) {
1441                         m = NULL;
1442                 } else if ((m->valid != VM_PAGE_BITS_ALL &&
1443                             !vm_page_is_valid(m, pgoff, pgbytes)) ||
1444                            (m->flags & PG_FICTITIOUS)) {
1445                         vm_page_sbusy_drop(m);
1446                         m = NULL;
1447                 }
1448         }
1449         return m;
1450 }
1451
1452 /*
1453  * Caller must hold the related vm_object
1454  */
1455 vm_page_t
1456 vm_page_next(vm_page_t m)
1457 {
1458         vm_page_t next;
1459
1460         next = vm_page_rb_tree_RB_NEXT(m);
1461         if (next && next->pindex != m->pindex + 1)
1462                 next = NULL;
1463         return (next);
1464 }
1465
1466 /*
1467  * vm_page_rename()
1468  *
1469  * Move the given vm_page from its current object to the specified
1470  * target object/offset.  The page must be busy and will remain so
1471  * on return.
1472  *
1473  * new_object must be held.
1474  * This routine might block. XXX ?
1475  *
1476  * NOTE: Swap associated with the page must be invalidated by the move.  We
1477  *       have to do this for several reasons:  (1) we aren't freeing the
1478  *       page, (2) we are dirtying the page, (3) the VM system is probably
1479  *       moving the page from object A to B, and will then later move
1480  *       the backing store from A to B and we can't have a conflict.
1481  *
1482  * NOTE: We *always* dirty the page.  It is necessary both for the
1483  *       fact that we moved it, and because we may be invalidating
1484  *       swap.  If the page is on the cache, we have to deactivate it
1485  *       or vm_page_dirty() will panic.  Dirty pages are not allowed
1486  *       on the cache.
1487  */
1488 void
1489 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1490 {
1491         KKASSERT(m->busy_count & PBUSY_LOCKED);
1492         ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1493         if (m->object) {
1494                 ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1495                 vm_page_remove(m);
1496         }
1497         if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1498                 panic("vm_page_rename: target exists (%p,%"PRIu64")",
1499                       new_object, new_pindex);
1500         }
1501         if (m->queue - m->pc == PQ_CACHE)
1502                 vm_page_deactivate(m);
1503         vm_page_dirty(m);
1504 }
1505
1506 /*
1507  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1508  * is to remain BUSYied by the caller.
1509  *
1510  * This routine may not block.
1511  */
1512 void
1513 vm_page_unqueue_nowakeup(vm_page_t m)
1514 {
1515         vm_page_and_queue_spin_lock(m);
1516         (void)_vm_page_rem_queue_spinlocked(m);
1517         vm_page_spin_unlock(m);
1518 }
1519
1520 /*
1521  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedemon
1522  * if necessary.
1523  *
1524  * This routine may not block.
1525  */
1526 void
1527 vm_page_unqueue(vm_page_t m)
1528 {
1529         u_short queue;
1530
1531         vm_page_and_queue_spin_lock(m);
1532         queue = _vm_page_rem_queue_spinlocked(m);
1533         if (queue == PQ_FREE || queue == PQ_CACHE) {
1534                 vm_page_spin_unlock(m);
1535                 pagedaemon_wakeup();
1536         } else {
1537                 vm_page_spin_unlock(m);
1538         }
1539 }
1540
1541 /*
1542  * vm_page_list_find()
1543  *
1544  * Find a page on the specified queue with color optimization.
1545  *
1546  * The page coloring optimization attempts to locate a page that does
1547  * not overload other nearby pages in the object in the cpu's L1 or L2
1548  * caches.  We need this optimization because cpu caches tend to be
1549  * physical caches, while object spaces tend to be virtual.
1550  *
1551  * The page coloring optimization also, very importantly, tries to localize
1552  * memory to cpus and physical sockets.
1553  *
1554  * On MP systems each PQ_FREE and PQ_CACHE color queue has its own spinlock
1555  * and the algorithm is adjusted to localize allocations on a per-core basis.
1556  * This is done by 'twisting' the colors.
1557  *
1558  * The page is returned spinlocked and removed from its queue (it will
1559  * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
1560  * is responsible for dealing with the busy-page case (usually by
1561  * deactivating the page and looping).
1562  *
1563  * NOTE:  This routine is carefully inlined.  A non-inlined version
1564  *        is available for outside callers but the only critical path is
1565  *        from within this source file.
1566  *
1567  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1568  *        represent stable storage, allowing us to order our locks vm_page
1569  *        first, then queue.
1570  */
1571 static __inline
1572 vm_page_t
1573 _vm_page_list_find(int basequeue, int index)
1574 {
1575         vm_page_t m;
1576
1577         for (;;) {
1578                 m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
1579                 if (m == NULL) {
1580                         m = _vm_page_list_find2(basequeue, index);
1581                         return(m);
1582                 }
1583                 vm_page_and_queue_spin_lock(m);
1584                 if (m->queue == basequeue + index) {
1585                         _vm_page_rem_queue_spinlocked(m);
1586                         /* vm_page_t spin held, no queue spin */
1587                         break;
1588                 }
1589                 vm_page_and_queue_spin_unlock(m);
1590         }
1591         return(m);
1592 }
1593
1594 /*
1595  * If we could not find the page in the desired queue try to find it in
1596  * a nearby queue.
1597  */
1598 static vm_page_t
1599 _vm_page_list_find2(int basequeue, int index)
1600 {
1601         struct vpgqueues *pq;
1602         vm_page_t m = NULL;
1603         int pqmask = PQ_SET_ASSOC_MASK >> 1;
1604         int pqi;
1605         int i;
1606
1607         index &= PQ_L2_MASK;
1608         pq = &vm_page_queues[basequeue];
1609
1610         /*
1611          * Run local sets of 16, 32, 64, 128, and the whole queue if all
1612          * else fails (PQ_L2_MASK which is 255).
1613          */
1614         do {
1615                 pqmask = (pqmask << 1) | 1;
1616                 for (i = 0; i <= pqmask; ++i) {
1617                         pqi = (index & ~pqmask) | ((index + i) & pqmask);
1618                         m = TAILQ_FIRST(&pq[pqi].pl);
1619                         if (m) {
1620                                 _vm_page_and_queue_spin_lock(m);
1621                                 if (m->queue == basequeue + pqi) {
1622                                         _vm_page_rem_queue_spinlocked(m);
1623                                         return(m);
1624                                 }
1625                                 _vm_page_and_queue_spin_unlock(m);
1626                                 --i;
1627                                 continue;
1628                         }
1629                 }
1630         } while (pqmask != PQ_L2_MASK);
1631
1632         return(m);
1633 }
1634
1635 /*
1636  * Returns a vm_page candidate for allocation.  The page is not busied so
1637  * it can move around.  The caller must busy the page (and typically
1638  * deactivate it if it cannot be busied!)
1639  *
1640  * Returns a spinlocked vm_page that has been removed from its queue.
1641  */
1642 vm_page_t
1643 vm_page_list_find(int basequeue, int index)
1644 {
1645         return(_vm_page_list_find(basequeue, index));
1646 }
1647
1648 /*
1649  * Find a page on the cache queue with color optimization, remove it
1650  * from the queue, and busy it.  The returned page will not be spinlocked.
1651  *
1652  * A candidate failure will be deactivated.  Candidates can fail due to
1653  * being busied by someone else, in which case they will be deactivated.
1654  *
1655  * This routine may not block.
1656  *
1657  */
1658 static vm_page_t
1659 vm_page_select_cache(u_short pg_color)
1660 {
1661         vm_page_t m;
1662
1663         for (;;) {
1664                 m = _vm_page_list_find(PQ_CACHE, pg_color & PQ_L2_MASK);
1665                 if (m == NULL)
1666                         break;
1667                 /*
1668                  * (m) has been removed from its queue and spinlocked
1669                  */
1670                 if (vm_page_busy_try(m, TRUE)) {
1671                         _vm_page_deactivate_locked(m, 0);
1672                         vm_page_spin_unlock(m);
1673                 } else {
1674                         /*
1675                          * We successfully busied the page
1676                          */
1677                         if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0 &&
1678                             m->hold_count == 0 &&
1679                             m->wire_count == 0 &&
1680                             (m->dirty & m->valid) == 0) {
1681                                 vm_page_spin_unlock(m);
1682                                 pagedaemon_wakeup();
1683                                 return(m);
1684                         }
1685
1686                         /*
1687                          * The page cannot be recycled, deactivate it.
1688                          */
1689                         _vm_page_deactivate_locked(m, 0);
1690                         if (_vm_page_wakeup(m)) {
1691                                 vm_page_spin_unlock(m);
1692                                 wakeup(m);
1693                         } else {
1694                                 vm_page_spin_unlock(m);
1695                         }
1696                 }
1697         }
1698         return (m);
1699 }
1700
1701 /*
1702  * Find a free page.  We attempt to inline the nominal case and fall back
1703  * to _vm_page_select_free() otherwise.  A busied page is removed from
1704  * the queue and returned.
1705  *
1706  * This routine may not block.
1707  */
1708 static __inline vm_page_t
1709 vm_page_select_free(u_short pg_color)
1710 {
1711         vm_page_t m;
1712
1713         for (;;) {
1714                 m = _vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK);
1715                 if (m == NULL)
1716                         break;
1717                 if (vm_page_busy_try(m, TRUE)) {
1718                         /*
1719                          * Various mechanisms such as a pmap_collect can
1720                          * result in a busy page on the free queue.  We
1721                          * have to move the page out of the way so we can
1722                          * retry the allocation.  If the other thread is not
1723                          * allocating the page then m->valid will remain 0 and
1724                          * the pageout daemon will free the page later on.
1725                          *
1726                          * Since we could not busy the page, however, we
1727                          * cannot make assumptions as to whether the page
1728                          * will be allocated by the other thread or not,
1729                          * so all we can do is deactivate it to move it out
1730                          * of the way.  In particular, if the other thread
1731                          * wires the page it may wind up on the inactive
1732                          * queue and the pageout daemon will have to deal
1733                          * with that case too.
1734                          */
1735                         _vm_page_deactivate_locked(m, 0);
1736                         vm_page_spin_unlock(m);
1737                 } else {
1738                         /*
1739                          * Theoretically if we are able to busy the page
1740                          * atomic with the queue removal (using the vm_page
1741                          * lock) nobody else should be able to mess with the
1742                          * page before us.
1743                          */
1744                         KKASSERT((m->flags & (PG_UNMANAGED |
1745                                               PG_NEED_COMMIT)) == 0);
1746                         KASSERT(m->hold_count == 0, ("m->hold_count is not zero "
1747                                                      "pg %p q=%d flags=%08x hold=%d wire=%d",
1748                                                      m, m->queue, m->flags, m->hold_count, m->wire_count));
1749                         KKASSERT(m->wire_count == 0);
1750                         vm_page_spin_unlock(m);
1751                         pagedaemon_wakeup();
1752
1753                         /* return busied and removed page */
1754                         return(m);
1755                 }
1756         }
1757         return(m);
1758 }
1759
1760 /*
1761  * vm_page_alloc()
1762  *
1763  * Allocate and return a memory cell associated with this VM object/offset
1764  * pair.  If object is NULL an unassociated page will be allocated.
1765  *
1766  * The returned page will be busied and removed from its queues.  This
1767  * routine can block and may return NULL if a race occurs and the page
1768  * is found to already exist at the specified (object, pindex).
1769  *
1770  *      VM_ALLOC_NORMAL         allow use of cache pages, nominal free drain
1771  *      VM_ALLOC_QUICK          like normal but cannot use cache
1772  *      VM_ALLOC_SYSTEM         greater free drain
1773  *      VM_ALLOC_INTERRUPT      allow free list to be completely drained
1774  *      VM_ALLOC_ZERO           advisory request for pre-zero'd page only
1775  *      VM_ALLOC_FORCE_ZERO     advisory request for pre-zero'd page only
1776  *      VM_ALLOC_NULL_OK        ok to return NULL on insertion collision
1777  *                              (see vm_page_grab())
1778  *      VM_ALLOC_USE_GD         ok to use per-gd cache
1779  *
1780  *      VM_ALLOC_CPU(n)         allocate using specified cpu localization
1781  *
1782  * The object must be held if not NULL
1783  * This routine may not block
1784  *
1785  * Additional special handling is required when called from an interrupt
1786  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
1787  * in this case.
1788  */
1789 vm_page_t
1790 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
1791 {
1792         globaldata_t gd;
1793         vm_object_t obj;
1794         vm_page_t m;
1795         u_short pg_color;
1796         int cpuid_local;
1797
1798 #if 0
1799         /*
1800          * Special per-cpu free VM page cache.  The pages are pre-busied
1801          * and pre-zerod for us.
1802          */
1803         if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
1804                 crit_enter_gd(gd);
1805                 if (gd->gd_vmpg_count) {
1806                         m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
1807                         crit_exit_gd(gd);
1808                         goto done;
1809                 }
1810                 crit_exit_gd(gd);
1811         }
1812 #endif
1813         m = NULL;
1814
1815         /*
1816          * CPU LOCALIZATION
1817          *
1818          * CPU localization algorithm.  Break the page queues up by physical
1819          * id and core id (note that two cpu threads will have the same core
1820          * id, and core_id != gd_cpuid).
1821          *
1822          * This is nowhere near perfect, for example the last pindex in a
1823          * subgroup will overflow into the next cpu or package.  But this
1824          * should get us good page reuse locality in heavy mixed loads.
1825          *
1826          * (may be executed before the APs are started, so other GDs might
1827          *  not exist!)
1828          */
1829         if (page_req & VM_ALLOC_CPU_SPEC)
1830                 cpuid_local = VM_ALLOC_GETCPU(page_req);
1831         else
1832                 cpuid_local = mycpu->gd_cpuid;
1833
1834         pg_color = vm_get_pg_color(cpuid_local, object, pindex);
1835
1836         KKASSERT(page_req &
1837                 (VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
1838                  VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
1839
1840         /*
1841          * Certain system threads (pageout daemon, buf_daemon's) are
1842          * allowed to eat deeper into the free page list.
1843          */
1844         if (curthread->td_flags & TDF_SYSTHREAD)
1845                 page_req |= VM_ALLOC_SYSTEM;
1846
1847         /*
1848          * Impose various limitations.  Note that the v_free_reserved test
1849          * must match the opposite of vm_page_count_target() to avoid
1850          * livelocks, be careful.
1851          */
1852 loop:
1853         gd = mycpu;
1854         if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
1855             ((page_req & VM_ALLOC_INTERRUPT) &&
1856              gd->gd_vmstats.v_free_count > 0) ||
1857             ((page_req & VM_ALLOC_SYSTEM) &&
1858              gd->gd_vmstats.v_cache_count == 0 &&
1859                 gd->gd_vmstats.v_free_count >
1860                 gd->gd_vmstats.v_interrupt_free_min)
1861         ) {
1862                 /*
1863                  * The free queue has sufficient free pages to take one out.
1864                  */
1865                 m = vm_page_select_free(pg_color);
1866         } else if (page_req & VM_ALLOC_NORMAL) {
1867                 /*
1868                  * Allocatable from the cache (non-interrupt only).  On
1869                  * success, we must free the page and try again, thus
1870                  * ensuring that vmstats.v_*_free_min counters are replenished.
1871                  */
1872 #ifdef INVARIANTS
1873                 if (curthread->td_preempted) {
1874                         kprintf("vm_page_alloc(): warning, attempt to allocate"
1875                                 " cache page from preempting interrupt\n");
1876                         m = NULL;
1877                 } else {
1878                         m = vm_page_select_cache(pg_color);
1879                 }
1880 #else
1881                 m = vm_page_select_cache(pg_color);
1882 #endif
1883                 /*
1884                  * On success move the page into the free queue and loop.
1885                  *
1886                  * Only do this if we can safely acquire the vm_object lock,
1887                  * because this is effectively a random page and the caller
1888                  * might be holding the lock shared, we don't want to
1889                  * deadlock.
1890                  */
1891                 if (m != NULL) {
1892                         KASSERT(m->dirty == 0,
1893                                 ("Found dirty cache page %p", m));
1894                         if ((obj = m->object) != NULL) {
1895                                 if (vm_object_hold_try(obj)) {
1896                                         vm_page_protect(m, VM_PROT_NONE);
1897                                         vm_page_free(m);
1898                                         /* m->object NULL here */
1899                                         vm_object_drop(obj);
1900                                 } else {
1901                                         vm_page_deactivate(m);
1902                                         vm_page_wakeup(m);
1903                                 }
1904                         } else {
1905                                 vm_page_protect(m, VM_PROT_NONE);
1906                                 vm_page_free(m);
1907                         }
1908                         goto loop;
1909                 }
1910
1911                 /*
1912                  * On failure return NULL
1913                  */
1914                 atomic_add_int(&vm_pageout_deficit, 1);
1915                 pagedaemon_wakeup();
1916                 return (NULL);
1917         } else {
1918                 /*
1919                  * No pages available, wakeup the pageout daemon and give up.
1920                  */
1921                 atomic_add_int(&vm_pageout_deficit, 1);
1922                 pagedaemon_wakeup();
1923                 return (NULL);
1924         }
1925
1926         /*
1927          * v_free_count can race so loop if we don't find the expected
1928          * page.
1929          */
1930         if (m == NULL) {
1931                 vmstats_rollup();
1932                 goto loop;
1933         }
1934
1935         /*
1936          * Good page found.  The page has already been busied for us and
1937          * removed from its queues.
1938          */
1939         KASSERT(m->dirty == 0,
1940                 ("vm_page_alloc: free/cache page %p was dirty", m));
1941         KKASSERT(m->queue == PQ_NONE);
1942
1943 #if 0
1944 done:
1945 #endif
1946         /*
1947          * Initialize the structure, inheriting some flags but clearing
1948          * all the rest.  The page has already been busied for us.
1949          */
1950         vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
1951
1952         KKASSERT(m->wire_count == 0);
1953         KKASSERT((m->busy_count & PBUSY_MASK) == 0);
1954         m->act_count = 0;
1955         m->valid = 0;
1956
1957         /*
1958          * Caller must be holding the object lock (asserted by
1959          * vm_page_insert()).
1960          *
1961          * NOTE: Inserting a page here does not insert it into any pmaps
1962          *       (which could cause us to block allocating memory).
1963          *
1964          * NOTE: If no object an unassociated page is allocated, m->pindex
1965          *       can be used by the caller for any purpose.
1966          */
1967         if (object) {
1968                 if (vm_page_insert(m, object, pindex) == FALSE) {
1969                         vm_page_free(m);
1970                         if ((page_req & VM_ALLOC_NULL_OK) == 0)
1971                                 panic("PAGE RACE %p[%ld]/%p",
1972                                       object, (long)pindex, m);
1973                         m = NULL;
1974                 }
1975         } else {
1976                 m->pindex = pindex;
1977         }
1978
1979         /*
1980          * Don't wakeup too often - wakeup the pageout daemon when
1981          * we would be nearly out of memory.
1982          */
1983         pagedaemon_wakeup();
1984
1985         /*
1986          * A BUSY page is returned.
1987          */
1988         return (m);
1989 }
1990
1991 /*
1992  * Returns number of pages available in our DMA memory reserve
1993  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
1994  */
1995 vm_size_t
1996 vm_contig_avail_pages(void)
1997 {
1998         alist_blk_t blk;
1999         alist_blk_t count;
2000         alist_blk_t bfree;
2001         spin_lock(&vm_contig_spin);
2002         bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2003         spin_unlock(&vm_contig_spin);
2004
2005         return bfree;
2006 }
2007
2008 /*
2009  * Attempt to allocate contiguous physical memory with the specified
2010  * requirements.
2011  */
2012 vm_page_t
2013 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2014                      unsigned long alignment, unsigned long boundary,
2015                      unsigned long size, vm_memattr_t memattr)
2016 {
2017         alist_blk_t blk;
2018         vm_page_t m;
2019         vm_pindex_t i;
2020 #if 0
2021         static vm_pindex_t contig_rover;
2022 #endif
2023
2024         alignment >>= PAGE_SHIFT;
2025         if (alignment == 0)
2026                 alignment = 1;
2027         boundary >>= PAGE_SHIFT;
2028         if (boundary == 0)
2029                 boundary = 1;
2030         size = (size + PAGE_MASK) >> PAGE_SHIFT;
2031
2032 #if 0
2033         /*
2034          * Disabled temporarily until we find a solution for DRM (a flag
2035          * to always use the free space reserve, for performance).
2036          */
2037         if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
2038             boundary <= PAGE_SIZE && size == 1 &&
2039             memattr == VM_MEMATTR_DEFAULT) {
2040                 /*
2041                  * Any page will work, use vm_page_alloc()
2042                  * (e.g. when used from kmem_alloc_attr())
2043                  */
2044                 m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
2045                                   VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2046                                   VM_ALLOC_INTERRUPT);
2047                 m->valid = VM_PAGE_BITS_ALL;
2048                 vm_page_wire(m);
2049                 vm_page_wakeup(m);
2050         } else
2051 #endif
2052         {
2053                 /*
2054                  * Use the low-memory dma reserve
2055                  */
2056                 spin_lock(&vm_contig_spin);
2057                 blk = alist_alloc(&vm_contig_alist, 0, size);
2058                 if (blk == ALIST_BLOCK_NONE) {
2059                         spin_unlock(&vm_contig_spin);
2060                         if (bootverbose) {
2061                                 kprintf("vm_page_alloc_contig: %ldk nospace\n",
2062                                         (size << PAGE_SHIFT) / 1024);
2063                                 print_backtrace(5);
2064                         }
2065                         return(NULL);
2066                 }
2067                 if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2068                         alist_free(&vm_contig_alist, blk, size);
2069                         spin_unlock(&vm_contig_spin);
2070                         if (bootverbose) {
2071                                 kprintf("vm_page_alloc_contig: %ldk high "
2072                                         "%016jx failed\n",
2073                                         (size << PAGE_SHIFT) / 1024,
2074                                         (intmax_t)high);
2075                         }
2076                         return(NULL);
2077                 }
2078                 spin_unlock(&vm_contig_spin);
2079                 m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2080         }
2081         if (vm_contig_verbose) {
2082                 kprintf("vm_page_alloc_contig: %016jx/%ldk "
2083                         "(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d\n",
2084                         (intmax_t)m->phys_addr,
2085                         (size << PAGE_SHIFT) / 1024,
2086                         low, high, alignment, boundary, size, memattr);
2087         }
2088         if (memattr != VM_MEMATTR_DEFAULT) {
2089                 for (i = 0;i < size; i++)
2090                         pmap_page_set_memattr(&m[i], memattr);
2091         }
2092         return m;
2093 }
2094
2095 /*
2096  * Free contiguously allocated pages.  The pages will be wired but not busy.
2097  * When freeing to the alist we leave them wired and not busy.
2098  */
2099 void
2100 vm_page_free_contig(vm_page_t m, unsigned long size)
2101 {
2102         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2103         vm_pindex_t start = pa >> PAGE_SHIFT;
2104         vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2105
2106         if (vm_contig_verbose) {
2107                 kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2108                         (intmax_t)pa, size / 1024);
2109         }
2110         if (pa < vm_low_phys_reserved) {
2111                 KKASSERT(pa + size <= vm_low_phys_reserved);
2112                 spin_lock(&vm_contig_spin);
2113                 alist_free(&vm_contig_alist, start, pages);
2114                 spin_unlock(&vm_contig_spin);
2115         } else {
2116                 while (pages) {
2117                         vm_page_busy_wait(m, FALSE, "cpgfr");
2118                         vm_page_unwire(m, 0);
2119                         vm_page_free(m);
2120                         --pages;
2121                         ++m;
2122                 }
2123
2124         }
2125 }
2126
2127
2128 /*
2129  * Wait for sufficient free memory for nominal heavy memory use kernel
2130  * operations.
2131  *
2132  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2133  *           will trivially deadlock the system.
2134  */
2135 void
2136 vm_wait_nominal(void)
2137 {
2138         while (vm_page_count_min(0))
2139                 vm_wait(0);
2140 }
2141
2142 /*
2143  * Test if vm_wait_nominal() would block.
2144  */
2145 int
2146 vm_test_nominal(void)
2147 {
2148         if (vm_page_count_min(0))
2149                 return(1);
2150         return(0);
2151 }
2152
2153 /*
2154  * Block until free pages are available for allocation, called in various
2155  * places before memory allocations.
2156  *
2157  * The caller may loop if vm_page_count_min() == FALSE so we cannot be
2158  * more generous then that.
2159  */
2160 void
2161 vm_wait(int timo)
2162 {
2163         /*
2164          * never wait forever
2165          */
2166         if (timo == 0)
2167                 timo = hz;
2168         lwkt_gettoken(&vm_token);
2169
2170         if (curthread == pagethread ||
2171             curthread == emergpager) {
2172                 /*
2173                  * The pageout daemon itself needs pages, this is bad.
2174                  */
2175                 if (vm_page_count_min(0)) {
2176                         vm_pageout_pages_needed = 1;
2177                         tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2178                 }
2179         } else {
2180                 /*
2181                  * Wakeup the pageout daemon if necessary and wait.
2182                  *
2183                  * Do not wait indefinitely for the target to be reached,
2184                  * as load might prevent it from being reached any time soon.
2185                  * But wait a little to try to slow down page allocations
2186                  * and to give more important threads (the pagedaemon)
2187                  * allocation priority.
2188                  */
2189                 if (vm_page_count_target()) {
2190                         if (vm_pages_needed == 0) {
2191                                 vm_pages_needed = 1;
2192                                 wakeup(&vm_pages_needed);
2193                         }
2194                         ++vm_pages_waiting;     /* SMP race ok */
2195                         tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2196                 }
2197         }
2198         lwkt_reltoken(&vm_token);
2199 }
2200
2201 /*
2202  * Block until free pages are available for allocation
2203  *
2204  * Called only from vm_fault so that processes page faulting can be
2205  * easily tracked.
2206  */
2207 void
2208 vm_wait_pfault(void)
2209 {
2210         /*
2211          * Wakeup the pageout daemon if necessary and wait.
2212          *
2213          * Do not wait indefinitely for the target to be reached,
2214          * as load might prevent it from being reached any time soon.
2215          * But wait a little to try to slow down page allocations
2216          * and to give more important threads (the pagedaemon)
2217          * allocation priority.
2218          */
2219         if (vm_page_count_min(0)) {
2220                 lwkt_gettoken(&vm_token);
2221                 while (vm_page_count_severe()) {
2222                         if (vm_page_count_target()) {
2223                                 thread_t td;
2224
2225                                 if (vm_pages_needed == 0) {
2226                                         vm_pages_needed = 1;
2227                                         wakeup(&vm_pages_needed);
2228                                 }
2229                                 ++vm_pages_waiting;     /* SMP race ok */
2230                                 tsleep(&vmstats.v_free_count, 0, "pfault", hz);
2231
2232                                 /*
2233                                  * Do not stay stuck in the loop if the system is trying
2234                                  * to kill the process.
2235                                  */
2236                                 td = curthread;
2237                                 if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL))
2238                                         break;
2239                         }
2240                 }
2241                 lwkt_reltoken(&vm_token);
2242         }
2243 }
2244
2245 /*
2246  * Put the specified page on the active list (if appropriate).  Ensure
2247  * that act_count is at least ACT_INIT but do not otherwise mess with it.
2248  *
2249  * The caller should be holding the page busied ? XXX
2250  * This routine may not block.
2251  */
2252 void
2253 vm_page_activate(vm_page_t m)
2254 {
2255         u_short oqueue;
2256
2257         vm_page_spin_lock(m);
2258         if (m->queue - m->pc != PQ_ACTIVE) {
2259                 _vm_page_queue_spin_lock(m);
2260                 oqueue = _vm_page_rem_queue_spinlocked(m);
2261                 /* page is left spinlocked, queue is unlocked */
2262
2263                 if (oqueue == PQ_CACHE)
2264                         mycpu->gd_cnt.v_reactivated++;
2265                 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2266                         if (m->act_count < ACT_INIT)
2267                                 m->act_count = ACT_INIT;
2268                         _vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
2269                 }
2270                 _vm_page_and_queue_spin_unlock(m);
2271                 if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
2272                         pagedaemon_wakeup();
2273         } else {
2274                 if (m->act_count < ACT_INIT)
2275                         m->act_count = ACT_INIT;
2276                 vm_page_spin_unlock(m);
2277         }
2278 }
2279
2280 /*
2281  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
2282  * routine is called when a page has been added to the cache or free
2283  * queues.
2284  *
2285  * This routine may not block.
2286  */
2287 static __inline void
2288 vm_page_free_wakeup(void)
2289 {
2290         globaldata_t gd = mycpu;
2291
2292         /*
2293          * If the pageout daemon itself needs pages, then tell it that
2294          * there are some free.
2295          */
2296         if (vm_pageout_pages_needed &&
2297             gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
2298             gd->gd_vmstats.v_pageout_free_min
2299         ) {
2300                 vm_pageout_pages_needed = 0;
2301                 wakeup(&vm_pageout_pages_needed);
2302         }
2303
2304         /*
2305          * Wakeup processes that are waiting on memory.
2306          *
2307          * Generally speaking we want to wakeup stuck processes as soon as
2308          * possible.  !vm_page_count_min(0) is the absolute minimum point
2309          * where we can do this.  Wait a bit longer to reduce degenerate
2310          * re-blocking (vm_page_free_hysteresis).  The target check is just
2311          * to make sure the min-check w/hysteresis does not exceed the
2312          * normal target.
2313          */
2314         if (vm_pages_waiting) {
2315                 if (!vm_page_count_min(vm_page_free_hysteresis) ||
2316                     !vm_page_count_target()) {
2317                         vm_pages_waiting = 0;
2318                         wakeup(&vmstats.v_free_count);
2319                         ++mycpu->gd_cnt.v_ppwakeups;
2320                 }
2321 #if 0
2322                 if (!vm_page_count_target()) {
2323                         /*
2324                          * Plenty of pages are free, wakeup everyone.
2325                          */
2326                         vm_pages_waiting = 0;
2327                         wakeup(&vmstats.v_free_count);
2328                         ++mycpu->gd_cnt.v_ppwakeups;
2329                 } else if (!vm_page_count_min(0)) {
2330                         /*
2331                          * Some pages are free, wakeup someone.
2332                          */
2333                         int wcount = vm_pages_waiting;
2334                         if (wcount > 0)
2335                                 --wcount;
2336                         vm_pages_waiting = wcount;
2337                         wakeup_one(&vmstats.v_free_count);
2338                         ++mycpu->gd_cnt.v_ppwakeups;
2339                 }
2340 #endif
2341         }
2342 }
2343
2344 /*
2345  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
2346  * it from its VM object.
2347  *
2348  * The vm_page must be BUSY on entry.  BUSY will be released on
2349  * return (the page will have been freed).
2350  */
2351 void
2352 vm_page_free_toq(vm_page_t m)
2353 {
2354         mycpu->gd_cnt.v_tfree++;
2355         KKASSERT((m->flags & PG_MAPPED) == 0);
2356         KKASSERT(m->busy_count & PBUSY_LOCKED);
2357
2358         if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
2359                 kprintf("vm_page_free: pindex(%lu), busy %08x, "
2360                         "hold(%d)\n",
2361                         (u_long)m->pindex, m->busy_count, m->hold_count);
2362                 if ((m->queue - m->pc) == PQ_FREE)
2363                         panic("vm_page_free: freeing free page");
2364                 else
2365                         panic("vm_page_free: freeing busy page");
2366         }
2367
2368         /*
2369          * Remove from object, spinlock the page and its queues and
2370          * remove from any queue.  No queue spinlock will be held
2371          * after this section (because the page was removed from any
2372          * queue).
2373          */
2374         vm_page_remove(m);
2375         vm_page_and_queue_spin_lock(m);
2376         _vm_page_rem_queue_spinlocked(m);
2377
2378         /*
2379          * No further management of fictitious pages occurs beyond object
2380          * and queue removal.
2381          */
2382         if ((m->flags & PG_FICTITIOUS) != 0) {
2383                 vm_page_spin_unlock(m);
2384                 vm_page_wakeup(m);
2385                 return;
2386         }
2387
2388         m->valid = 0;
2389         vm_page_undirty(m);
2390
2391         if (m->wire_count != 0) {
2392                 if (m->wire_count > 1) {
2393                     panic(
2394                         "vm_page_free: invalid wire count (%d), pindex: 0x%lx",
2395                         m->wire_count, (long)m->pindex);
2396                 }
2397                 panic("vm_page_free: freeing wired page");
2398         }
2399
2400         /*
2401          * Clear the UNMANAGED flag when freeing an unmanaged page.
2402          * Clear the NEED_COMMIT flag
2403          */
2404         if (m->flags & PG_UNMANAGED)
2405                 vm_page_flag_clear(m, PG_UNMANAGED);
2406         if (m->flags & PG_NEED_COMMIT)
2407                 vm_page_flag_clear(m, PG_NEED_COMMIT);
2408
2409         if (m->hold_count != 0) {
2410                 _vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
2411         } else {
2412                 _vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
2413         }
2414
2415         /*
2416          * This sequence allows us to clear BUSY while still holding
2417          * its spin lock, which reduces contention vs allocators.  We
2418          * must not leave the queue locked or _vm_page_wakeup() may
2419          * deadlock.
2420          */
2421         _vm_page_queue_spin_unlock(m);
2422         if (_vm_page_wakeup(m)) {
2423                 vm_page_spin_unlock(m);
2424                 wakeup(m);
2425         } else {
2426                 vm_page_spin_unlock(m);
2427         }
2428         vm_page_free_wakeup();
2429 }
2430
2431 /*
2432  * vm_page_unmanage()
2433  *
2434  * Prevent PV management from being done on the page.  The page is
2435  * removed from the paging queues as if it were wired, and as a
2436  * consequence of no longer being managed the pageout daemon will not
2437  * touch it (since there is no way to locate the pte mappings for the
2438  * page).  madvise() calls that mess with the pmap will also no longer
2439  * operate on the page.
2440  *
2441  * Beyond that the page is still reasonably 'normal'.  Freeing the page
2442  * will clear the flag.
2443  *
2444  * This routine is used by OBJT_PHYS objects - objects using unswappable
2445  * physical memory as backing store rather then swap-backed memory and
2446  * will eventually be extended to support 4MB unmanaged physical
2447  * mappings.
2448  *
2449  * Caller must be holding the page busy.
2450  */
2451 void
2452 vm_page_unmanage(vm_page_t m)
2453 {
2454         KKASSERT(m->busy_count & PBUSY_LOCKED);
2455         if ((m->flags & PG_UNMANAGED) == 0) {
2456                 if (m->wire_count == 0)
2457                         vm_page_unqueue(m);
2458         }
2459         vm_page_flag_set(m, PG_UNMANAGED);
2460 }
2461
2462 /*
2463  * Mark this page as wired down by yet another map, removing it from
2464  * paging queues as necessary.
2465  *
2466  * Caller must be holding the page busy.
2467  */
2468 void
2469 vm_page_wire(vm_page_t m)
2470 {
2471         /*
2472          * Only bump the wire statistics if the page is not already wired,
2473          * and only unqueue the page if it is on some queue (if it is unmanaged
2474          * it is already off the queues).  Don't do anything with fictitious
2475          * pages because they are always wired.
2476          */
2477         KKASSERT(m->busy_count & PBUSY_LOCKED);
2478         if ((m->flags & PG_FICTITIOUS) == 0) {
2479                 if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
2480                         if ((m->flags & PG_UNMANAGED) == 0)
2481                                 vm_page_unqueue(m);
2482                         atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
2483                 }
2484                 KASSERT(m->wire_count != 0,
2485                         ("vm_page_wire: wire_count overflow m=%p", m));
2486         }
2487 }
2488
2489 /*
2490  * Release one wiring of this page, potentially enabling it to be paged again.
2491  *
2492  * Many pages placed on the inactive queue should actually go
2493  * into the cache, but it is difficult to figure out which.  What
2494  * we do instead, if the inactive target is well met, is to put
2495  * clean pages at the head of the inactive queue instead of the tail.
2496  * This will cause them to be moved to the cache more quickly and
2497  * if not actively re-referenced, freed more quickly.  If we just
2498  * stick these pages at the end of the inactive queue, heavy filesystem
2499  * meta-data accesses can cause an unnecessary paging load on memory bound
2500  * processes.  This optimization causes one-time-use metadata to be
2501  * reused more quickly.
2502  *
2503  * Pages marked PG_NEED_COMMIT are always activated and never placed on
2504  * the inactive queue.  This helps the pageout daemon determine memory
2505  * pressure and act on out-of-memory situations more quickly.
2506  *
2507  * BUT, if we are in a low-memory situation we have no choice but to
2508  * put clean pages on the cache queue.
2509  *
2510  * A number of routines use vm_page_unwire() to guarantee that the page
2511  * will go into either the inactive or active queues, and will NEVER
2512  * be placed in the cache - for example, just after dirtying a page.
2513  * dirty pages in the cache are not allowed.
2514  *
2515  * This routine may not block.
2516  */
2517 void
2518 vm_page_unwire(vm_page_t m, int activate)
2519 {
2520         KKASSERT(m->busy_count & PBUSY_LOCKED);
2521         if (m->flags & PG_FICTITIOUS) {
2522                 /* do nothing */
2523         } else if (m->wire_count <= 0) {
2524                 panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
2525         } else {
2526                 if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
2527                         atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,-1);
2528                         if (m->flags & PG_UNMANAGED) {
2529                                 ;
2530                         } else if (activate || (m->flags & PG_NEED_COMMIT)) {
2531                                 vm_page_spin_lock(m);
2532                                 _vm_page_add_queue_spinlocked(m,
2533                                                         PQ_ACTIVE + m->pc, 0);
2534                                 _vm_page_and_queue_spin_unlock(m);
2535                         } else {
2536                                 vm_page_spin_lock(m);
2537                                 vm_page_flag_clear(m, PG_WINATCFLS);
2538                                 _vm_page_add_queue_spinlocked(m,
2539                                                         PQ_INACTIVE + m->pc, 0);
2540                                 ++vm_swapcache_inactive_heuristic;
2541                                 _vm_page_and_queue_spin_unlock(m);
2542                         }
2543                 }
2544         }
2545 }
2546
2547 /*
2548  * Move the specified page to the inactive queue.  If the page has
2549  * any associated swap, the swap is deallocated.
2550  *
2551  * Normally athead is 0 resulting in LRU operation.  athead is set
2552  * to 1 if we want this page to be 'as if it were placed in the cache',
2553  * except without unmapping it from the process address space.
2554  *
2555  * vm_page's spinlock must be held on entry and will remain held on return.
2556  * This routine may not block.
2557  */
2558 static void
2559 _vm_page_deactivate_locked(vm_page_t m, int athead)
2560 {
2561         u_short oqueue;
2562
2563         /*
2564          * Ignore if already inactive.
2565          */
2566         if (m->queue - m->pc == PQ_INACTIVE)
2567                 return;
2568         _vm_page_queue_spin_lock(m);
2569         oqueue = _vm_page_rem_queue_spinlocked(m);
2570
2571         if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2572                 if (oqueue == PQ_CACHE)
2573                         mycpu->gd_cnt.v_reactivated++;
2574                 vm_page_flag_clear(m, PG_WINATCFLS);
2575                 _vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
2576                 if (athead == 0)
2577                         ++vm_swapcache_inactive_heuristic;
2578         }
2579         /* NOTE: PQ_NONE if condition not taken */
2580         _vm_page_queue_spin_unlock(m);
2581         /* leaves vm_page spinlocked */
2582 }
2583
2584 /*
2585  * Attempt to deactivate a page.
2586  *
2587  * No requirements.
2588  */
2589 void
2590 vm_page_deactivate(vm_page_t m)
2591 {
2592         vm_page_spin_lock(m);
2593         _vm_page_deactivate_locked(m, 0);
2594         vm_page_spin_unlock(m);
2595 }
2596
2597 void
2598 vm_page_deactivate_locked(vm_page_t m)
2599 {
2600         _vm_page_deactivate_locked(m, 0);
2601 }
2602
2603 /*
2604  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
2605  *
2606  * This function returns non-zero if it successfully moved the page to
2607  * PQ_CACHE.
2608  *
2609  * This function unconditionally unbusies the page on return.
2610  */
2611 int
2612 vm_page_try_to_cache(vm_page_t m)
2613 {
2614         vm_page_spin_lock(m);
2615         if (m->dirty || m->hold_count || m->wire_count ||
2616             (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT))) {
2617                 if (_vm_page_wakeup(m)) {
2618                         vm_page_spin_unlock(m);
2619                         wakeup(m);
2620                 } else {
2621                         vm_page_spin_unlock(m);
2622                 }
2623                 return(0);
2624         }
2625         vm_page_spin_unlock(m);
2626
2627         /*
2628          * Page busied by us and no longer spinlocked.  Dirty pages cannot
2629          * be moved to the cache.
2630          */
2631         vm_page_test_dirty(m);
2632         if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2633                 vm_page_wakeup(m);
2634                 return(0);
2635         }
2636         vm_page_cache(m);
2637         return(1);
2638 }
2639
2640 /*
2641  * Attempt to free the page.  If we cannot free it, we do nothing.
2642  * 1 is returned on success, 0 on failure.
2643  *
2644  * No requirements.
2645  */
2646 int
2647 vm_page_try_to_free(vm_page_t m)
2648 {
2649         vm_page_spin_lock(m);
2650         if (vm_page_busy_try(m, TRUE)) {
2651                 vm_page_spin_unlock(m);
2652                 return(0);
2653         }
2654
2655         /*
2656          * The page can be in any state, including already being on the free
2657          * queue.  Check to see if it really can be freed.
2658          */
2659         if (m->dirty ||                         /* can't free if it is dirty */
2660             m->hold_count ||                    /* or held (XXX may be wrong) */
2661             m->wire_count ||                    /* or wired */
2662             (m->flags & (PG_UNMANAGED |         /* or unmanaged */
2663                          PG_NEED_COMMIT)) ||    /* or needs a commit */
2664             m->queue - m->pc == PQ_FREE ||      /* already on PQ_FREE */
2665             m->queue - m->pc == PQ_HOLD) {      /* already on PQ_HOLD */
2666                 if (_vm_page_wakeup(m)) {
2667                         vm_page_spin_unlock(m);
2668                         wakeup(m);
2669                 } else {
2670                         vm_page_spin_unlock(m);
2671                 }
2672                 return(0);
2673         }
2674         vm_page_spin_unlock(m);
2675
2676         /*
2677          * We can probably free the page.
2678          *
2679          * Page busied by us and no longer spinlocked.  Dirty pages will
2680          * not be freed by this function.    We have to re-test the
2681          * dirty bit after cleaning out the pmaps.
2682          */
2683         vm_page_test_dirty(m);
2684         if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2685                 vm_page_wakeup(m);
2686                 return(0);
2687         }
2688         vm_page_protect(m, VM_PROT_NONE);
2689         if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2690                 vm_page_wakeup(m);
2691                 return(0);
2692         }
2693         vm_page_free(m);
2694         return(1);
2695 }
2696
2697 /*
2698  * vm_page_cache
2699  *
2700  * Put the specified page onto the page cache queue (if appropriate).
2701  *
2702  * The page must be busy, and this routine will release the busy and
2703  * possibly even free the page.
2704  */
2705 void
2706 vm_page_cache(vm_page_t m)
2707 {
2708         /*
2709          * Not suitable for the cache
2710          */
2711         if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
2712             (m->busy_count & PBUSY_MASK) ||
2713             m->wire_count || m->hold_count) {
2714                 vm_page_wakeup(m);
2715                 return;
2716         }
2717
2718         /*
2719          * Already in the cache (and thus not mapped)
2720          */
2721         if ((m->queue - m->pc) == PQ_CACHE) {
2722                 KKASSERT((m->flags & PG_MAPPED) == 0);
2723                 vm_page_wakeup(m);
2724                 return;
2725         }
2726
2727         /*
2728          * Caller is required to test m->dirty, but note that the act of
2729          * removing the page from its maps can cause it to become dirty
2730          * on an SMP system due to another cpu running in usermode.
2731          */
2732         if (m->dirty) {
2733                 panic("vm_page_cache: caching a dirty page, pindex: %ld",
2734                         (long)m->pindex);
2735         }
2736
2737         /*
2738          * Remove all pmaps and indicate that the page is not
2739          * writeable or mapped.  Our vm_page_protect() call may
2740          * have blocked (especially w/ VM_PROT_NONE), so recheck
2741          * everything.
2742          */
2743         vm_page_protect(m, VM_PROT_NONE);
2744         if ((m->flags & (PG_UNMANAGED | PG_MAPPED)) ||
2745             (m->busy_count & PBUSY_MASK) ||
2746             m->wire_count || m->hold_count) {
2747                 vm_page_wakeup(m);
2748         } else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2749                 vm_page_deactivate(m);
2750                 vm_page_wakeup(m);
2751         } else {
2752                 _vm_page_and_queue_spin_lock(m);
2753                 _vm_page_rem_queue_spinlocked(m);
2754                 _vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
2755                 _vm_page_queue_spin_unlock(m);
2756                 if (_vm_page_wakeup(m)) {
2757                         vm_page_spin_unlock(m);
2758                         wakeup(m);
2759                 } else {
2760                         vm_page_spin_unlock(m);
2761                 }
2762                 vm_page_free_wakeup();
2763         }
2764 }
2765
2766 /*
2767  * vm_page_dontneed()
2768  *
2769  * Cache, deactivate, or do nothing as appropriate.  This routine
2770  * is typically used by madvise() MADV_DONTNEED.
2771  *
2772  * Generally speaking we want to move the page into the cache so
2773  * it gets reused quickly.  However, this can result in a silly syndrome
2774  * due to the page recycling too quickly.  Small objects will not be
2775  * fully cached.  On the otherhand, if we move the page to the inactive
2776  * queue we wind up with a problem whereby very large objects
2777  * unnecessarily blow away our inactive and cache queues.
2778  *
2779  * The solution is to move the pages based on a fixed weighting.  We
2780  * either leave them alone, deactivate them, or move them to the cache,
2781  * where moving them to the cache has the highest weighting.
2782  * By forcing some pages into other queues we eventually force the
2783  * system to balance the queues, potentially recovering other unrelated
2784  * space from active.  The idea is to not force this to happen too
2785  * often.
2786  *
2787  * The page must be busied.
2788  */
2789 void
2790 vm_page_dontneed(vm_page_t m)
2791 {
2792         static int dnweight;
2793         int dnw;
2794         int head;
2795
2796         dnw = ++dnweight;
2797
2798         /*
2799          * occassionally leave the page alone
2800          */
2801         if ((dnw & 0x01F0) == 0 ||
2802             m->queue - m->pc == PQ_INACTIVE ||
2803             m->queue - m->pc == PQ_CACHE
2804         ) {
2805                 if (m->act_count >= ACT_INIT)
2806                         --m->act_count;
2807                 return;
2808         }
2809
2810         /*
2811          * If vm_page_dontneed() is inactivating a page, it must clear
2812          * the referenced flag; otherwise the pagedaemon will see references
2813          * on the page in the inactive queue and reactivate it. Until the
2814          * page can move to the cache queue, madvise's job is not done.
2815          */
2816         vm_page_flag_clear(m, PG_REFERENCED);
2817         pmap_clear_reference(m);
2818
2819         if (m->dirty == 0)
2820                 vm_page_test_dirty(m);
2821
2822         if (m->dirty || (dnw & 0x0070) == 0) {
2823                 /*
2824                  * Deactivate the page 3 times out of 32.
2825                  */
2826                 head = 0;
2827         } else {
2828                 /*
2829                  * Cache the page 28 times out of every 32.  Note that
2830                  * the page is deactivated instead of cached, but placed
2831                  * at the head of the queue instead of the tail.
2832                  */
2833                 head = 1;
2834         }
2835         vm_page_spin_lock(m);
2836         _vm_page_deactivate_locked(m, head);
2837         vm_page_spin_unlock(m);
2838 }
2839
2840 /*
2841  * These routines manipulate the 'soft busy' count for a page.  A soft busy
2842  * is almost like a hard BUSY except that it allows certain compatible
2843  * operations to occur on the page while it is busy.  For example, a page
2844  * undergoing a write can still be mapped read-only.
2845  *
2846  * We also use soft-busy to quickly pmap_enter shared read-only pages
2847  * without having to hold the page locked.
2848  *
2849  * The soft-busy count can be > 1 in situations where multiple threads
2850  * are pmap_enter()ing the same page simultaneously, or when two buffer
2851  * cache buffers overlap the same page.
2852  *
2853  * The caller must hold the page BUSY when making these two calls.
2854  */
2855 void
2856 vm_page_io_start(vm_page_t m)
2857 {
2858         uint32_t ocount;
2859
2860         ocount = atomic_fetchadd_int(&m->busy_count, 1);
2861         KKASSERT(ocount & PBUSY_LOCKED);
2862 }
2863
2864 void
2865 vm_page_io_finish(vm_page_t m)
2866 {
2867         uint32_t ocount;
2868
2869         ocount = atomic_fetchadd_int(&m->busy_count, -1);
2870         KKASSERT(ocount & PBUSY_MASK);
2871 #if 0
2872         if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
2873                 wakeup(m);
2874 #endif
2875 }
2876
2877 /*
2878  * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
2879  *
2880  * We can't use fetchadd here because we might race a hard-busy and the
2881  * page freeing code asserts on a non-zero soft-busy count (even if only
2882  * temporary).
2883  *
2884  * Returns 0 on success, non-zero on failure.
2885  */
2886 int
2887 vm_page_sbusy_try(vm_page_t m)
2888 {
2889         uint32_t ocount;
2890
2891         for (;;) {
2892                 ocount = m->busy_count;
2893                 cpu_ccfence();
2894                 if (ocount & PBUSY_LOCKED)
2895                         return 1;
2896                 if (atomic_cmpset_int(&m->busy_count, ocount, ocount + 1))
2897                         break;
2898         }
2899         return 0;
2900 #if 0
2901         if (m->busy_count & PBUSY_LOCKED)
2902                 return 1;
2903         ocount = atomic_fetchadd_int(&m->busy_count, 1);
2904         if (ocount & PBUSY_LOCKED) {
2905                 vm_page_sbusy_drop(m);
2906                 return 1;
2907         }
2908         return 0;
2909 #endif
2910 }
2911
2912 /*
2913  * Indicate that a clean VM page requires a filesystem commit and cannot
2914  * be reused.  Used by tmpfs.
2915  */
2916 void
2917 vm_page_need_commit(vm_page_t m)
2918 {
2919         vm_page_flag_set(m, PG_NEED_COMMIT);
2920         vm_object_set_writeable_dirty(m->object);
2921 }
2922
2923 void
2924 vm_page_clear_commit(vm_page_t m)
2925 {
2926         vm_page_flag_clear(m, PG_NEED_COMMIT);
2927 }
2928
2929 /*
2930  * Grab a page, blocking if it is busy and allocating a page if necessary.
2931  * A busy page is returned or NULL.  The page may or may not be valid and
2932  * might not be on a queue (the caller is responsible for the disposition of
2933  * the page).
2934  *
2935  * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
2936  * page will be zero'd and marked valid.
2937  *
2938  * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
2939  * valid even if it already exists.
2940  *
2941  * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
2942  * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
2943  * VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
2944  *
2945  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
2946  * always returned if we had blocked.
2947  *
2948  * This routine may not be called from an interrupt.
2949  *
2950  * No other requirements.
2951  */
2952 vm_page_t
2953 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2954 {
2955         vm_page_t m;
2956         int error;
2957         int shared = 1;
2958
2959         KKASSERT(allocflags &
2960                 (VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
2961         vm_object_hold_shared(object);
2962         for (;;) {
2963                 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
2964                 if (error) {
2965                         vm_page_sleep_busy(m, TRUE, "pgrbwt");
2966                         if ((allocflags & VM_ALLOC_RETRY) == 0) {
2967                                 m = NULL;
2968                                 break;
2969                         }
2970                         /* retry */
2971                 } else if (m == NULL) {
2972                         if (shared) {
2973                                 vm_object_upgrade(object);
2974                                 shared = 0;
2975                         }
2976                         if (allocflags & VM_ALLOC_RETRY)
2977                                 allocflags |= VM_ALLOC_NULL_OK;
2978                         m = vm_page_alloc(object, pindex,
2979                                           allocflags & ~VM_ALLOC_RETRY);
2980                         if (m)
2981                                 break;
2982                         vm_wait(0);
2983                         if ((allocflags & VM_ALLOC_RETRY) == 0)
2984                                 goto failed;
2985                 } else {
2986                         /* m found */
2987                         break;
2988                 }
2989         }
2990
2991         /*
2992          * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
2993          *
2994          * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
2995          * valid even if already valid.
2996          *
2997          * NOTE!  We have removed all of the PG_ZERO optimizations and also
2998          *        removed the idle zeroing code.  These optimizations actually
2999          *        slow things down on modern cpus because the zerod area is
3000          *        likely uncached, placing a memory-access burden on the
3001          *        accesors taking the fault.
3002          *
3003          *        By always zeroing the page in-line with the fault, no
3004          *        dynamic ram reads are needed and the caches are hot, ready
3005          *        for userland to access the memory.
3006          */
3007         if (m->valid == 0) {
3008                 if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
3009                         pmap_zero_page(VM_PAGE_TO_PHYS(m));
3010                         m->valid = VM_PAGE_BITS_ALL;
3011                 }
3012         } else if (allocflags & VM_ALLOC_FORCE_ZERO) {
3013                 pmap_zero_page(VM_PAGE_TO_PHYS(m));
3014                 m->valid = VM_PAGE_BITS_ALL;
3015         }
3016 failed:
3017         vm_object_drop(object);
3018         return(m);
3019 }
3020
3021 /*
3022  * Mapping function for valid bits or for dirty bits in
3023  * a page.  May not block.
3024  *
3025  * Inputs are required to range within a page.
3026  *
3027  * No requirements.
3028  * Non blocking.
3029  */
3030 int
3031 vm_page_bits(int base, int size)
3032 {
3033         int first_bit;
3034         int last_bit;
3035
3036         KASSERT(
3037             base + size <= PAGE_SIZE,
3038             ("vm_page_bits: illegal base/size %d/%d", base, size)
3039         );
3040
3041         if (size == 0)          /* handle degenerate case */
3042                 return(0);
3043
3044         first_bit = base >> DEV_BSHIFT;
3045         last_bit = (base + size - 1) >> DEV_BSHIFT;
3046
3047         return ((2 << last_bit) - (1 << first_bit));
3048 }
3049
3050 /*
3051  * Sets portions of a page valid and clean.  The arguments are expected
3052  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
3053  * of any partial chunks touched by the range.  The invalid portion of
3054  * such chunks will be zero'd.
3055  *
3056  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
3057  *       align base to DEV_BSIZE so as not to mark clean a partially
3058  *       truncated device block.  Otherwise the dirty page status might be
3059  *       lost.
3060  *
3061  * This routine may not block.
3062  *
3063  * (base + size) must be less then or equal to PAGE_SIZE.
3064  */
3065 static void
3066 _vm_page_zero_valid(vm_page_t m, int base, int size)
3067 {
3068         int frag;
3069         int endoff;
3070
3071         if (size == 0)  /* handle degenerate case */
3072                 return;
3073
3074         /*
3075          * If the base is not DEV_BSIZE aligned and the valid
3076          * bit is clear, we have to zero out a portion of the
3077          * first block.
3078          */
3079
3080         if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
3081             (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3082         ) {
3083                 pmap_zero_page_area(
3084                     VM_PAGE_TO_PHYS(m),
3085                     frag,
3086                     base - frag
3087                 );
3088         }
3089
3090         /*
3091          * If the ending offset is not DEV_BSIZE aligned and the
3092          * valid bit is clear, we have to zero out a portion of
3093          * the last block.
3094          */
3095
3096         endoff = base + size;
3097
3098         if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
3099             (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3100         ) {
3101                 pmap_zero_page_area(
3102                     VM_PAGE_TO_PHYS(m),
3103                     endoff,
3104                     DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3105                 );
3106         }
3107 }
3108
3109 /*
3110  * Set valid, clear dirty bits.  If validating the entire
3111  * page we can safely clear the pmap modify bit.  We also
3112  * use this opportunity to clear the PG_NOSYNC flag.  If a process
3113  * takes a write fault on a MAP_NOSYNC memory area the flag will
3114  * be set again.
3115  *
3116  * We set valid bits inclusive of any overlap, but we can only
3117  * clear dirty bits for DEV_BSIZE chunks that are fully within
3118  * the range.
3119  *
3120  * Page must be busied?
3121  * No other requirements.
3122  */
3123 void
3124 vm_page_set_valid(vm_page_t m, int base, int size)
3125 {
3126         _vm_page_zero_valid(m, base, size);
3127         m->valid |= vm_page_bits(base, size);
3128 }
3129
3130
3131 /*
3132  * Set valid bits and clear dirty bits.
3133  *
3134  * Page must be busied by caller.
3135  *
3136  * NOTE: This function does not clear the pmap modified bit.
3137  *       Also note that e.g. NFS may use a byte-granular base
3138  *       and size.
3139  *
3140  * No other requirements.
3141  */
3142 void
3143 vm_page_set_validclean(vm_page_t m, int base, int size)
3144 {
3145         int pagebits;
3146
3147         _vm_page_zero_valid(m, base, size);
3148         pagebits = vm_page_bits(base, size);
3149         m->valid |= pagebits;
3150         m->dirty &= ~pagebits;
3151         if (base == 0 && size == PAGE_SIZE) {
3152                 /*pmap_clear_modify(m);*/
3153                 vm_page_flag_clear(m, PG_NOSYNC);
3154         }
3155 }
3156
3157 /*
3158  * Set valid & dirty.  Used by buwrite()
3159  *
3160  * Page must be busied by caller.
3161  */
3162 void
3163 vm_page_set_validdirty(vm_page_t m, int base, int size)
3164 {
3165         int pagebits;
3166
3167         pagebits = vm_page_bits(base, size);
3168         m->valid |= pagebits;
3169         m->dirty |= pagebits;
3170         if (m->object)
3171                vm_object_set_writeable_dirty(m->object);
3172 }
3173
3174 /*
3175  * Clear dirty bits.
3176  *
3177  * NOTE: This function does not clear the pmap modified bit.
3178  *       Also note that e.g. NFS may use a byte-granular base
3179  *       and size.
3180  *
3181  * Page must be busied?
3182  * No other requirements.
3183  */
3184 void
3185 vm_page_clear_dirty(vm_page_t m, int base, int size)
3186 {
3187         m->dirty &= ~vm_page_bits(base, size);
3188         if (base == 0 && size == PAGE_SIZE) {
3189                 /*pmap_clear_modify(m);*/
3190                 vm_page_flag_clear(m, PG_NOSYNC);
3191         }
3192 }
3193
3194 /*
3195  * Make the page all-dirty.
3196  *
3197  * Also make sure the related object and vnode reflect the fact that the
3198  * object may now contain a dirty page.
3199  *
3200  * Page must be busied?
3201  * No other requirements.
3202  */
3203 void
3204 vm_page_dirty(vm_page_t m)
3205 {
3206 #ifdef INVARIANTS
3207         int pqtype = m->queue - m->pc;
3208 #endif
3209         KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
3210                 ("vm_page_dirty: page in free/cache queue!"));
3211         if (m->dirty != VM_PAGE_BITS_ALL) {
3212                 m->dirty = VM_PAGE_BITS_ALL;
3213                 if (m->object)
3214                         vm_object_set_writeable_dirty(m->object);
3215         }
3216 }
3217
3218 /*
3219  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
3220  * valid and dirty bits for the effected areas are cleared.
3221  *
3222  * Page must be busied?
3223  * Does not block.
3224  * No other requirements.
3225  */
3226 void
3227 vm_page_set_invalid(vm_page_t m, int base, int size)
3228 {
3229         int bits;
3230
3231         bits = vm_page_bits(base, size);
3232         m->valid &= ~bits;
3233         m->dirty &= ~bits;
3234         atomic_add_int(&m->object->generation, 1);
3235 }
3236
3237 /*
3238  * The kernel assumes that the invalid portions of a page contain
3239  * garbage, but such pages can be mapped into memory by user code.
3240  * When this occurs, we must zero out the non-valid portions of the
3241  * page so user code sees what it expects.
3242  *
3243  * Pages are most often semi-valid when the end of a file is mapped
3244  * into memory and the file's size is not page aligned.
3245  *
3246  * Page must be busied?
3247  * No other requirements.
3248  */
3249 void
3250 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
3251 {
3252         int b;
3253         int i;
3254
3255         /*
3256          * Scan the valid bits looking for invalid sections that
3257          * must be zerod.  Invalid sub-DEV_BSIZE'd areas ( where the
3258          * valid bit may be set ) have already been zerod by
3259          * vm_page_set_validclean().
3260          */
3261         for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
3262                 if (i == (PAGE_SIZE / DEV_BSIZE) ||
3263                     (m->valid & (1 << i))
3264                 ) {
3265                         if (i > b) {
3266                                 pmap_zero_page_area(
3267                                     VM_PAGE_TO_PHYS(m),
3268                                     b << DEV_BSHIFT,
3269                                     (i - b) << DEV_BSHIFT
3270                                 );
3271                         }
3272                         b = i + 1;
3273                 }
3274         }
3275
3276         /*
3277          * setvalid is TRUE when we can safely set the zero'd areas
3278          * as being valid.  We can do this if there are no cache consistency
3279          * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
3280          */
3281         if (setvalid)
3282                 m->valid = VM_PAGE_BITS_ALL;
3283 }
3284
3285 /*
3286  * Is a (partial) page valid?  Note that the case where size == 0
3287  * will return FALSE in the degenerate case where the page is entirely
3288  * invalid, and TRUE otherwise.
3289  *
3290  * Does not block.
3291  * No other requirements.
3292  */
3293 int
3294 vm_page_is_valid(vm_page_t m, int base, int size)
3295 {
3296         int bits = vm_page_bits(base, size);
3297
3298         if (m->valid && ((m->valid & bits) == bits))
3299                 return 1;
3300         else
3301                 return 0;
3302 }
3303
3304 /*
3305  * update dirty bits from pmap/mmu.  May not block.
3306  *
3307  * Caller must hold the page busy
3308  */
3309 void
3310 vm_page_test_dirty(vm_page_t m)
3311 {
3312         if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
3313                 vm_page_dirty(m);
3314         }
3315 }
3316
3317 #include "opt_ddb.h"
3318 #ifdef DDB
3319 #include <ddb/ddb.h>
3320
3321 DB_SHOW_COMMAND(page, vm_page_print_page_info)
3322 {
3323         db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
3324         db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
3325         db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
3326         db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
3327         db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
3328         db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
3329         db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
3330         db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
3331         db_printf("vmstats.v_cache_min: %ld\n", vmstats.v_cache_min);
3332         db_printf("vmstats.v_inactive_target: %ld\n",
3333                   vmstats.v_inactive_target);
3334 }
3335
3336 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3337 {
3338         int i;
3339         db_printf("PQ_FREE:");
3340         for (i = 0; i < PQ_L2_SIZE; i++) {
3341                 db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
3342         }
3343         db_printf("\n");
3344
3345         db_printf("PQ_CACHE:");
3346         for(i = 0; i < PQ_L2_SIZE; i++) {
3347                 db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
3348         }
3349         db_printf("\n");
3350
3351         db_printf("PQ_ACTIVE:");
3352         for(i = 0; i < PQ_L2_SIZE; i++) {
3353                 db_printf(" %d", vm_page_queues[PQ_ACTIVE + i].lcnt);
3354         }
3355         db_printf("\n");
3356
3357         db_printf("PQ_INACTIVE:");
3358         for(i = 0; i < PQ_L2_SIZE; i++) {
3359                 db_printf(" %d", vm_page_queues[PQ_INACTIVE + i].lcnt);
3360         }
3361         db_printf("\n");
3362 }
3363 #endif /* DDB */