sys/platform/pc64/amd64/pmap.c

   1 /*
   2  * Copyright (c) 1991 Regents of the University of California.
   3  * Copyright (c) 1994 John S. Dyson
   4  * Copyright (c) 1994 David Greenman
   5  * Copyright (c) 2008 The DragonFly Project.
   6  * Copyright (c) 2008 Jordan Gordeev.
   7  * All rights reserved.
   8  *
   9  * This code is derived from software contributed to Berkeley by
  10  * the Systems Programming Group of the University of Utah Computer
  11  * Science Department and William Jolitz of UUNET Technologies Inc.
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  * 3. All advertising materials mentioning features or use of this software
  22  *    must display the following acknowledgement:
  23  *      This product includes software developed by the University of
  24  *      California, Berkeley and its contributors.
  25  * 4. Neither the name of the University nor the names of its contributors
  26  *    may be used to endorse or promote products derived from this software
  27  *    without specific prior written permission.
  28  *
  29  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  30  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  31  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  32  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  33  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  34  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  35  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  36  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  37  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  38  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  39  * SUCH DAMAGE.
  40  *
  41  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
  42  * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
  43  * $DragonFly: src/sys/platform/pc64/amd64/pmap.c,v 1.3 2008/08/29 17:07:10 dillon Exp $
  44  */
  45
  46 /*
  47  *      Manages physical address maps.
  48  *
  49  *      In addition to hardware address maps, this
  50  *      module is called upon to provide software-use-only
  51  *      maps which may or may not be stored in the same
  52  *      form as hardware maps.  These pseudo-maps are
  53  *      used to store intermediate results from copy
  54  *      operations to and from address spaces.
  55  *
  56  *      Since the information managed by this module is
  57  *      also stored by the logical address mapping module,
  58  *      this module may throw away valid virtual-to-physical
  59  *      mappings at almost any time.  However, invalidations
  60  *      of virtual-to-physical mappings must be done as
  61  *      requested.
  62  *
  63  *      In order to cope with hardware architectures which
  64  *      make virtual-to-physical map invalidates expensive,
  65  *      this module may delay invalidate or reduced protection
  66  *      operations until such time as they are actually
  67  *      necessary.  This module is given full information as
  68  *      to which processors are currently using which maps,
  69  *      and to when physical maps must be made correct.
  70  */
  71
  72 #if JG
  73 #include "opt_disable_pse.h"
  74 #include "opt_pmap.h"
  75 #endif
  76 #include "opt_msgbuf.h"
  77
  78 #include <sys/param.h>
  79 #include <sys/systm.h>
  80 #include <sys/kernel.h>
  81 #include <sys/proc.h>
  82 #include <sys/msgbuf.h>
  83 #include <sys/vmmeter.h>
  84 #include <sys/mman.h>
  85
  86 #include <vm/vm.h>
  87 #include <vm/vm_param.h>
  88 #include <sys/sysctl.h>
  89 #include <sys/lock.h>
  90 #include <vm/vm_kern.h>
  91 #include <vm/vm_page.h>
  92 #include <vm/vm_map.h>
  93 #include <vm/vm_object.h>
  94 #include <vm/vm_extern.h>
  95 #include <vm/vm_pageout.h>
  96 #include <vm/vm_pager.h>
  97 #include <vm/vm_zone.h>
  98
  99 #include <sys/user.h>
 100 #include <sys/thread2.h>
 101 #include <sys/sysref2.h>
 102
 103 #include <machine/cputypes.h>
 104 #include <machine/md_var.h>
 105 #include <machine/specialreg.h>
 106 #include <machine/smp.h>
 107 #include <machine_base/apic/apicreg.h>
 108 #include <machine/globaldata.h>
 109 #include <machine/pmap.h>
 110 #include <machine/pmap_inval.h>
 111
 112 #include <ddb/ddb.h>
 113
 114 #define PMAP_KEEP_PDIRS
 115 #ifndef PMAP_SHPGPERPROC
 116 #define PMAP_SHPGPERPROC 200
 117 #endif
 118
 119 #if defined(DIAGNOSTIC)
 120 #define PMAP_DIAGNOSTIC
 121 #endif
 122
 123 #define MINPV 2048
 124
 125 #if !defined(PMAP_DIAGNOSTIC)
 126 #define PMAP_INLINE __inline
 127 #else
 128 #define PMAP_INLINE
 129 #endif
 130
 131 /*
 132  * Get PDEs and PTEs for user/kernel address space
 133  */
 134 #define pmap_pde(m, v)  (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
 135 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
 136
 137 #define pmap_pde_v(pte)         ((*(pd_entry_t *)pte & PG_V) != 0)
 138 #define pmap_pte_w(pte)         ((*(pt_entry_t *)pte & PG_W) != 0)
 139 #define pmap_pte_m(pte)         ((*(pt_entry_t *)pte & PG_M) != 0)
 140 #define pmap_pte_u(pte)         ((*(pt_entry_t *)pte & PG_A) != 0)
 141 #define pmap_pte_v(pte)         ((*(pt_entry_t *)pte & PG_V) != 0)
 142
 143
 144 /*
 145  * Given a map and a machine independent protection code,
 146  * convert to a vax protection code.
 147  */
 148 #define pte_prot(m, p)          \
 149         (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
 150 static int protection_codes[8];
 151
 152 struct pmap kernel_pmap;
 153 static TAILQ_HEAD(,pmap)        pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
 154
 155 vm_paddr_t avail_start;         /* PA of first available physical page */
 156 vm_paddr_t avail_end;           /* PA of last available physical page */
 157 vm_offset_t virtual_start;      /* VA of first avail page (after kernel bss) */
 158 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
 159 vm_offset_t KvaStart;           /* VA start of KVA space */
 160 vm_offset_t KvaEnd;             /* VA end of KVA space (non-inclusive) */
 161 vm_offset_t KvaSize;            /* max size of kernel virtual address space */
 162 static boolean_t pmap_initialized = FALSE;      /* Has pmap_init completed? */
 163 static int pgeflag;             /* PG_G or-in */
 164 static int pseflag;             /* PG_PS or-in */
 165
 166 static vm_object_t kptobj;
 167
 168 static int nkpt;
 169 vm_offset_t kernel_vm_end;
 170
 171 /*
 172  * Data for the pv entry allocation mechanism
 173  */
 174 static vm_zone_t pvzone;
 175 static struct vm_zone pvzone_store;
 176 static struct vm_object pvzone_obj;
 177 static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
 178 static int pmap_pagedaemon_waken = 0;
 179 static struct pv_entry *pvinit;
 180
 181 /*
 182  * All those kernel PT submaps that BSD is so fond of
 183  */
 184 pt_entry_t *CMAP1 = 0, *ptmmap;
 185 caddr_t CADDR1 = 0, ptvmmap = 0;
 186 static pt_entry_t *msgbufmap;
 187 struct msgbuf *msgbufp=0;
 188
 189 /*
 190  * Crashdump maps.
 191  */
 192 static pt_entry_t *pt_crashdumpmap;
 193 static caddr_t crashdumpmap;
 194
 195 extern uint64_t KPTphys;
 196 extern pt_entry_t *SMPpt;
 197 extern uint64_t SMPptpa;
 198
 199 #define DISABLE_PSE
 200
 201 static PMAP_INLINE void free_pv_entry (pv_entry_t pv);
 202 static pt_entry_t * get_ptbase (pmap_t pmap);
 203 static pv_entry_t get_pv_entry (void);
 204 static void     i386_protection_init (void);
 205 static __inline void    pmap_clearbit (vm_page_t m, int bit);
 206
 207 static void     pmap_remove_all (vm_page_t m);
 208 static void     pmap_enter_quick (pmap_t pmap, vm_offset_t va, vm_page_t m);
 209 static int pmap_remove_pte (struct pmap *pmap, pt_entry_t *ptq,
 210                                 vm_offset_t sva, pmap_inval_info_t info);
 211 static void pmap_remove_page (struct pmap *pmap,
 212                                 vm_offset_t va, pmap_inval_info_t info);
 213 static int pmap_remove_entry (struct pmap *pmap, vm_page_t m,
 214                                 vm_offset_t va, pmap_inval_info_t info);
 215 static boolean_t pmap_testbit (vm_page_t m, int bit);
 216 static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
 217                 vm_page_t mpte, vm_page_t m);
 218
 219 static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);
 220
 221 static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
 222 static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
 223 static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
 224 static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
 225 static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
 226 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
 227
 228 static unsigned pdir4mb;
 229
 230 /*
 231  * Move the kernel virtual free pointer to the next
 232  * 4MB.  This is used to help improve performance
 233  * by using a large (4MB) page for much of the kernel
 234  * (.text, .data, .bss)
 235  */
 236 static vm_offset_t
 237 pmap_kmem_choose(vm_offset_t addr)
 238 {
 239         vm_offset_t newaddr = addr;
 240 #ifndef DISABLE_PSE
 241         if (cpu_feature & CPUID_PSE) {
 242                 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
 243         }
 244 #endif
 245         return newaddr;
 246 }
 247
 248 /*
 249  * pmap_pte:
 250  *
 251  *      Extract the page table entry associated with the given map/virtual
 252  *      pair.
 253  *
 254  *      This function may NOT be called from an interrupt.
 255  */
 256 PMAP_INLINE pt_entry_t *
 257 pmap_pte(pmap_t pmap, vm_offset_t va)
 258 {
 259         pd_entry_t *pdeaddr;
 260
 261         if (pmap) {
 262                 pdeaddr = pmap_pde(pmap, va);
 263                 if (*pdeaddr & PG_PS)
 264                         return pdeaddr;
 265                 if (*pdeaddr) {
 266                         return get_ptbase(pmap) + amd64_btop(va);
 267                 }
 268         }
 269         return (0);
 270 }
 271
 272 /*
 273  * pmap_pte_quick:
 274  *
 275  *      Super fast pmap_pte routine best used when scanning the pv lists.
 276  *      This eliminates many course-grained invltlb calls.  Note that many of
 277  *      the pv list scans are across different pmaps and it is very wasteful
 278  *      to do an entire invltlb when checking a single mapping.
 279  *
 280  *      Should only be called while in a critical section.
 281  */
 282 static pt_entry_t *
 283 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
 284 {
 285         struct mdglobaldata *gd = mdcpu;
 286         pd_entry_t pde, newpf;
 287
 288         if ((pde = pmap->pm_pdir[va >> PDRSHIFT]) != 0) {
 289                 pd_entry_t frame = pmap->pm_pdir[PTDPTDI] & PG_FRAME;
 290                 vm_pindex_t index = amd64_btop(va);
 291                 /* are we current address space or kernel? */
 292                 if ((pmap == &kernel_pmap) ||
 293                         (frame == (PTDpde & PG_FRAME))) {
 294                         return (pt_entry_t *) PTmap + index;
 295                 }
 296                 newpf = pde & PG_FRAME;
 297                 if ( ((* (pt_entry_t *) gd->gd_PMAP1) & PG_FRAME) != newpf) {
 298                         * (pt_entry_t *) gd->gd_PMAP1 = newpf | PG_RW | PG_V;
 299                         cpu_invlpg(gd->gd_PADDR1);
 300                 }
 301                 return gd->gd_PADDR1 + (index & (NPTEPG - 1));
 302         }
 303         return (0);
 304 }
 305
 306
 307 static u_int64_t
 308 allocpages(vm_paddr_t *firstaddr, int n)
 309 {
 310         u_int64_t ret;
 311
 312         ret = *firstaddr;
 313         bzero((void *)ret, n * PAGE_SIZE);
 314         *firstaddr += n * PAGE_SIZE;
 315         return (ret);
 316 }
 317
 318 void
 319 create_pagetables(vm_paddr_t *firstaddr)
 320 {
 321         int i;
 322         int count;
 323         uint64_t cpu0pp, cpu0idlestk;
 324         int idlestk_page_offset = offsetof(struct privatespace, idlestack) / PAGE_SIZE;
 325
 326         /* we are running (mostly) V=P at this point */
 327
 328         common_lvl4_phys = allocpages(firstaddr, 1);    /* 512 512G mappings */
 329         common_lvl3_phys = allocpages(firstaddr, 1);    /* 512 1G mappings */
 330         KPTphys = allocpages(firstaddr, NKPT);          /* kernel page table */
 331         IdlePTD = allocpages(firstaddr, 1);             /* kernel page dir */
 332         cpu0pp = allocpages(firstaddr, MDGLOBALDATA_BASEALLOC_PAGES);
 333         cpu0idlestk = allocpages(firstaddr, UPAGES);
 334         SMPptpa = allocpages(firstaddr, 1);
 335         SMPpt = (void *)(SMPptpa + KERNBASE);
 336
 337
 338         /*
 339          * Load kernel page table with kernel memory mappings
 340          */
 341         for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
 342                 ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
 343                 ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V;
 344         }
 345
 346 #ifndef JG
 347         for (i = 0; i < NKPT; i++) {
 348                 ((pd_entry_t *)IdlePTD)[i] = KPTphys + (i << PAGE_SHIFT);
 349                 ((pd_entry_t *)IdlePTD)[i] |= PG_RW | PG_V;
 350         }
 351 #endif
 352
 353         /*
 354          * Set up the kernel page table itself.
 355          */
 356         for (i = 0; i < NKPT; i++) {
 357                 ((pd_entry_t *)IdlePTD)[KPTDI + i] = KPTphys + (i << PAGE_SHIFT);
 358                 ((pd_entry_t *)IdlePTD)[KPTDI + i] |= PG_RW | PG_V;
 359         }
 360
 361 #ifndef JG
 362         count = ISA_HOLE_LENGTH >> PAGE_SHIFT;
 363         for (i = 0; i < count; i++) {
 364                 ((pt_entry_t *)KPTphys)[amd64_btop(ISA_HOLE_START) + i] = \
 365                         (ISA_HOLE_START + i * PAGE_SIZE) | PG_RW | PG_V;
 366         }
 367 #endif
 368
 369         /*
 370          * Self-mapping
 371          */
 372         ((pd_entry_t *)IdlePTD)[PTDPTDI] = (pd_entry_t)IdlePTD | PG_RW | PG_V;
 373
 374         /*
 375          * Map CPU_prvspace[0].mdglobaldata
 376          */
 377         for (i = 0; i < MDGLOBALDATA_BASEALLOC_PAGES; i++) {
 378                 ((pt_entry_t *)SMPptpa)[i] = \
 379                         (cpu0pp + i * PAGE_SIZE) | PG_RW | PG_V;
 380         }
 381
 382         /*
 383          * Map CPU_prvspace[0].idlestack
 384          */
 385         for (i = 0; i < UPAGES; i++) {
 386                 ((pt_entry_t *)SMPptpa)[idlestk_page_offset + i] = \
 387                         (cpu0idlestk + i * PAGE_SIZE) | PG_RW | PG_V;
 388         }
 389
 390         /*
 391          * Link SMPpt.
 392          */
 393         ((pd_entry_t *)IdlePTD)[MPPTDI] = SMPptpa | PG_RW | PG_V;
 394
 395         /*
 396          * PML4 maps level 3
 397          */
 398         ((pml4_entry_t *)common_lvl4_phys)[LINKPML4I] = common_lvl3_phys | PG_RW | PG_V | PG_U;
 399
 400         /*
 401          * location of "virtual CR3" - a PDP entry that is loaded
 402          * with a PD physical address (+ page attributes).
 403          * Matt: location of user page directory entry (representing 1G)
 404          */
 405         link_pdpe = &((pdp_entry_t *)common_lvl3_phys)[LINKPDPI];
 406 }
 407
 408 void
 409 init_paging(vm_paddr_t *firstaddr) {
 410         create_pagetables(firstaddr);
 411
 412         /* switch to the newly created page table */
 413         *link_pdpe = IdlePTD | PG_RW | PG_V | PG_U;
 414         load_cr3(common_lvl4_phys);
 415         link_pdpe = (void *)((char *)link_pdpe + KERNBASE);
 416
 417         KvaStart = (vm_offset_t)VADDR(PTDPTDI, 0);
 418         KvaEnd = (vm_offset_t)VADDR(APTDPTDI, 0);
 419         KvaSize = KvaEnd - KvaStart;
 420 }
 421
 422 /*
 423  *      Bootstrap the system enough to run with virtual memory.
 424  *
 425  *      On the i386 this is called after mapping has already been enabled
 426  *      and just syncs the pmap module with what has already been done.
 427  *      [We can't call it easily with mapping off since the kernel is not
 428  *      mapped with PA == VA, hence we would have to relocate every address
 429  *      from the linked base (virtual) address "KERNBASE" to the actual
 430  *      (physical) address starting relative to 0]
 431  */
 432 void
 433 pmap_bootstrap(vm_paddr_t *firstaddr, vm_paddr_t loadaddr)
 434 {
 435         vm_offset_t va;
 436         pt_entry_t *pte;
 437         struct mdglobaldata *gd;
 438         int i;
 439         int pg;
 440
 441         avail_start = *firstaddr;
 442
 443         /*
 444          * XXX The calculation of virtual_start is wrong. It's NKPT*PAGE_SIZE
 445          * too large. It should instead be correctly calculated in locore.s and
 446          * not based on 'first' (which is a physical address, not a virtual
 447          * address, for the start of unused physical memory). The kernel
 448          * page tables are NOT double mapped and thus should not be included
 449          * in this calculation.
 450          */
 451         virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
 452         virtual_start = pmap_kmem_choose(virtual_start);
 453         virtual_end = VADDR(KPTDI+NKPDE-1, NPTEPG-1);
 454
 455         /*
 456          * Initialize protection array.
 457          */
 458         i386_protection_init();
 459
 460         /*
 461          * The kernel's pmap is statically allocated so we don't have to use
 462          * pmap_create, which is unlikely to work correctly at this part of
 463          * the boot sequence (XXX and which no longer exists).
 464          */
 465         kernel_pmap.pm_pdir = (pd_entry_t *)(PTOV_OFFSET + (uint64_t)IdlePTD);
 466         kernel_pmap.pm_count = 1;
 467         kernel_pmap.pm_active = (cpumask_t)-1;  /* don't allow deactivation */
 468         TAILQ_INIT(&kernel_pmap.pm_pvlist);
 469         nkpt = NKPT;
 470
 471         /*
 472          * Reserve some special page table entries/VA space for temporary
 473          * mapping of pages.
 474          */
 475 #define SYSMAP(c, p, v, n)      \
 476         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 477
 478         va = virtual_start;
 479         pte = (pt_entry_t *) pmap_pte(&kernel_pmap, va);
 480
 481         /*
 482          * CMAP1/CMAP2 are used for zeroing and copying pages.
 483          */
 484         SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 485
 486         /*
 487          * Crashdump maps.
 488          */
 489         SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
 490
 491         /*
 492          * ptvmmap is used for reading arbitrary physical pages via
 493          * /dev/mem.
 494          */
 495         SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
 496
 497         /*
 498          * msgbufp is used to map the system message buffer.
 499          * XXX msgbufmap is not used.
 500          */
 501         SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
 502                atop(round_page(MSGBUF_SIZE)))
 503
 504         virtual_start = va;
 505
 506         *CMAP1 = 0;
 507         for (i = 0; i < NKPT; i++)
 508                 PTD[i] = 0;
 509
 510         /*
 511          * PG_G is terribly broken on SMP because we IPI invltlb's in some
 512          * cases rather then invl1pg.  Actually, I don't even know why it
 513          * works under UP because self-referential page table mappings
 514          */
 515 #ifdef SMP
 516         pgeflag = 0;
 517 #else
 518         if (cpu_feature & CPUID_PGE)
 519                 pgeflag = PG_G;
 520 #endif
 521
 522 /*
 523  * Initialize the 4MB page size flag
 524  */
 525         pseflag = 0;
 526 /*
 527  * The 4MB page version of the initial
 528  * kernel page mapping.
 529  */
 530         pdir4mb = 0;
 531
 532 #if !defined(DISABLE_PSE)
 533         if (cpu_feature & CPUID_PSE) {
 534                 pt_entry_t ptditmp;
 535                 /*
 536                  * Note that we have enabled PSE mode
 537                  */
 538                 pseflag = PG_PS;
 539                 ptditmp = *(PTmap + amd64_btop(KERNBASE));
 540                 ptditmp &= ~(NBPDR - 1);
 541                 ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
 542                 pdir4mb = ptditmp;
 543
 544 #ifndef SMP
 545                 /*
 546                  * Enable the PSE mode.  If we are SMP we can't do this
 547                  * now because the APs will not be able to use it when
 548                  * they boot up.
 549                  */
 550                 load_cr4(rcr4() | CR4_PSE);
 551
 552                 /*
 553                  * We can do the mapping here for the single processor
 554                  * case.  We simply ignore the old page table page from
 555                  * now on.
 556                  */
 557                 /*
 558                  * For SMP, we still need 4K pages to bootstrap APs,
 559                  * PSE will be enabled as soon as all APs are up.
 560                  */
 561                 PTD[KPTDI] = (pd_entry_t)ptditmp;
 562                 kernel_pmap.pm_pdir[KPTDI] = (pd_entry_t)ptditmp;
 563                 cpu_invltlb();
 564 #endif
 565         }
 566 #endif
 567 #ifdef SMP
 568         if (cpu_apic_address == 0)
 569                 panic("pmap_bootstrap: no local apic!");
 570
 571         /* local apic is mapped on last page */
 572         SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag |
 573             (cpu_apic_address & PG_FRAME));
 574 #endif
 575
 576         /*
 577          * We need to finish setting up the globaldata page for the BSP.
 578          * locore has already populated the page table for the mdglobaldata
 579          * portion.
 580          */
 581         pg = MDGLOBALDATA_BASEALLOC_PAGES;
 582         gd = &CPU_prvspace[0].mdglobaldata;
 583         gd->gd_CMAP1 = &SMPpt[pg + 0];
 584         gd->gd_CMAP2 = &SMPpt[pg + 1];
 585         gd->gd_CMAP3 = &SMPpt[pg + 2];
 586         gd->gd_PMAP1 = &SMPpt[pg + 3];
 587         gd->gd_CADDR1 = CPU_prvspace[0].CPAGE1;
 588         gd->gd_CADDR2 = CPU_prvspace[0].CPAGE2;
 589         gd->gd_CADDR3 = CPU_prvspace[0].CPAGE3;
 590         gd->gd_PADDR1 = (pt_entry_t *)CPU_prvspace[0].PPAGE1;
 591
 592         cpu_invltlb();
 593 }
 594
 595 #ifdef SMP
 596 /*
 597  * Set 4mb pdir for mp startup
 598  */
 599 void
 600 pmap_set_opt(void)
 601 {
 602         if (pseflag && (cpu_feature & CPUID_PSE)) {
 603                 load_cr4(rcr4() | CR4_PSE);
 604                 if (pdir4mb && mycpu->gd_cpuid == 0) {  /* only on BSP */
 605                         kernel_pmap.pm_pdir[KPTDI] =
 606                             PTD[KPTDI] = (pd_entry_t)pdir4mb;
 607                         cpu_invltlb();
 608                 }
 609         }
 610 }
 611 #endif
 612
 613 /*
 614  *      Initialize the pmap module.
 615  *      Called by vm_init, to initialize any structures that the pmap
 616  *      system needs to map virtual memory.
 617  *      pmap_init has been enhanced to support in a fairly consistant
 618  *      way, discontiguous physical memory.
 619  */
 620 void
 621 pmap_init(void)
 622 {
 623         int i;
 624         int initial_pvs;
 625
 626         /*
 627          * object for kernel page table pages
 628          */
 629         kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
 630
 631         /*
 632          * Allocate memory for random pmap data structures.  Includes the
 633          * pv_head_table.
 634          */
 635
 636         for(i = 0; i < vm_page_array_size; i++) {
 637                 vm_page_t m;
 638
 639                 m = &vm_page_array[i];
 640                 TAILQ_INIT(&m->md.pv_list);
 641                 m->md.pv_list_count = 0;
 642         }
 643
 644         /*
 645          * init the pv free list
 646          */
 647         initial_pvs = vm_page_array_size;
 648         if (initial_pvs < MINPV)
 649                 initial_pvs = MINPV;
 650         pvzone = &pvzone_store;
 651         pvinit = (struct pv_entry *) kmem_alloc(&kernel_map,
 652                 initial_pvs * sizeof (struct pv_entry));
 653         zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
 654                 initial_pvs);
 655
 656         /*
 657          * Now it is safe to enable pv_table recording.
 658          */
 659         pmap_initialized = TRUE;
 660 }
 661
 662 /*
 663  * Initialize the address space (zone) for the pv_entries.  Set a
 664  * high water mark so that the system can recover from excessive
 665  * numbers of pv entries.
 666  */
 667 void
 668 pmap_init2(void)
 669 {
 670         int shpgperproc = PMAP_SHPGPERPROC;
 671
 672         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 673         pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
 674         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 675         pv_entry_high_water = 9 * (pv_entry_max / 10);
 676         zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
 677 }
 678
 679
 680 /***************************************************
 681  * Low level helper routines.....
 682  ***************************************************/
 683
 684 #if defined(PMAP_DIAGNOSTIC)
 685
 686 /*
 687  * This code checks for non-writeable/modified pages.
 688  * This should be an invalid condition.
 689  */
 690 static int
 691 pmap_nw_modified(pt_entry_t ptea)
 692 {
 693         int pte;
 694
 695         pte = (int) ptea;
 696
 697         if ((pte & (PG_M|PG_RW)) == PG_M)
 698                 return 1;
 699         else
 700                 return 0;
 701 }
 702 #endif
 703
 704
 705 /*
 706  * this routine defines the region(s) of memory that should
 707  * not be tested for the modified bit.
 708  */
 709 static PMAP_INLINE int
 710 pmap_track_modified(vm_offset_t va)
 711 {
 712         if ((va < clean_sva) || (va >= clean_eva))
 713                 return 1;
 714         else
 715                 return 0;
 716 }
 717
 718 static pt_entry_t *
 719 get_ptbase(pmap_t pmap)
 720 {
 721         pd_entry_t frame = pmap->pm_pdir[PTDPTDI] & PG_FRAME;
 722         struct globaldata *gd = mycpu;
 723
 724         /* are we current address space or kernel? */
 725         if (pmap == &kernel_pmap || frame == (PTDpde & PG_FRAME)) {
 726                 return (pt_entry_t *) PTmap;
 727         }
 728
 729         /* otherwise, we are alternate address space */
 730         KKASSERT(gd->gd_intr_nesting_level == 0 &&
 731                  (gd->gd_curthread->td_flags & TDF_INTTHREAD) == 0);
 732
 733         if (frame != (((pd_entry_t) APTDpde) & PG_FRAME)) {
 734                 APTDpde = (pd_entry_t)(frame | PG_RW | PG_V);
 735                 /* The page directory is not shared between CPUs */
 736                 cpu_invltlb();
 737         }
 738         return (pt_entry_t *) APTmap;
 739 }
 740
 741 /*
 742  * pmap_extract:
 743  *
 744  *      Extract the physical page address associated with the map/VA pair.
 745  *
 746  *      This function may not be called from an interrupt if the pmap is
 747  *      not kernel_pmap.
 748  */
 749 vm_paddr_t
 750 pmap_extract(pmap_t pmap, vm_offset_t va)
 751 {
 752         vm_offset_t rtval;
 753         vm_offset_t pdirindex;
 754
 755         pdirindex = va >> PDRSHIFT;
 756         if (pmap && (rtval = pmap->pm_pdir[pdirindex])) {
 757                 pt_entry_t *pte;
 758                 if ((rtval & PG_PS) != 0) {
 759                         rtval &= ~(NBPDR - 1);
 760                         rtval |= va & (NBPDR - 1);
 761                         return rtval;
 762                 }
 763                 pte = get_ptbase(pmap) + amd64_btop(va);
 764                 rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK));
 765                 return rtval;
 766         }
 767         return 0;
 768 }
 769
 770 /***************************************************
 771  * Low level mapping routines.....
 772  ***************************************************/
 773
 774 /*
 775  * Routine: pmap_kenter
 776  * Function:
 777  *      Add a wired page to the KVA
 778  *      NOTE! note that in order for the mapping to take effect -- you
 779  *      should do an invltlb after doing the pmap_kenter().
 780  */
 781 void
 782 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 783 {
 784         pt_entry_t *pte;
 785         pt_entry_t npte;
 786         pmap_inval_info info;
 787
 788         pmap_inval_init(&info);
 789         npte = pa | PG_RW | PG_V | pgeflag;
 790         pte = vtopte(va);
 791         pmap_inval_add(&info, &kernel_pmap, va);
 792         *pte = npte;
 793         pmap_inval_flush(&info);
 794 }
 795
 796 /*
 797  * Routine: pmap_kenter_quick
 798  * Function:
 799  *      Similar to pmap_kenter(), except we only invalidate the
 800  *      mapping on the current CPU.
 801  */
 802 void
 803 pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
 804 {
 805         pt_entry_t *pte;
 806         pt_entry_t npte;
 807
 808         npte = pa | PG_RW | PG_V | pgeflag;
 809         pte = vtopte(va);
 810         *pte = npte;
 811         cpu_invlpg((void *)va);
 812 }
 813
 814 void
 815 pmap_kenter_sync(vm_offset_t va)
 816 {
 817         pmap_inval_info info;
 818
 819         pmap_inval_init(&info);
 820         pmap_inval_add(&info, &kernel_pmap, va);
 821         pmap_inval_flush(&info);
 822 }
 823
 824 void
 825 pmap_kenter_sync_quick(vm_offset_t va)
 826 {
 827         cpu_invlpg((void *)va);
 828 }
 829
 830 /*
 831  * remove a page from the kernel pagetables
 832  */
 833 void
 834 pmap_kremove(vm_offset_t va)
 835 {
 836         pt_entry_t *pte;
 837         pmap_inval_info info;
 838
 839         pmap_inval_init(&info);
 840         pte = vtopte(va);
 841         pmap_inval_add(&info, &kernel_pmap, va);
 842         *pte = 0;
 843         pmap_inval_flush(&info);
 844 }
 845
 846 void
 847 pmap_kremove_quick(vm_offset_t va)
 848 {
 849         pt_entry_t *pte;
 850         pte = vtopte(va);
 851         *pte = 0;
 852         cpu_invlpg((void *)va);
 853 }
 854
 855 /*
 856  * XXX these need to be recoded.  They are not used in any critical path.
 857  */
 858 void
 859 pmap_kmodify_rw(vm_offset_t va)
 860 {
 861         *vtopte(va) |= PG_RW;
 862         cpu_invlpg((void *)va);
 863 }
 864
 865 void
 866 pmap_kmodify_nc(vm_offset_t va)
 867 {
 868         *vtopte(va) |= PG_N;
 869         cpu_invlpg((void *)va);
 870 }
 871
 872 /*
 873  *      Used to map a range of physical addresses into kernel
 874  *      virtual address space.
 875  *
 876  *      For now, VM is already on, we only need to map the
 877  *      specified memory.
 878  */
 879 vm_offset_t
 880 pmap_map(vm_offset_t virt, vm_paddr_t start, vm_paddr_t end, int prot)
 881 {
 882         while (start < end) {
 883                 pmap_kenter(virt, start);
 884                 virt += PAGE_SIZE;
 885                 start += PAGE_SIZE;
 886         }
 887         return (virt);
 888 }
 889
 890
 891 /*
 892  * Add a list of wired pages to the kva
 893  * this routine is only used for temporary
 894  * kernel mappings that do not need to have
 895  * page modification or references recorded.
 896  * Note that old mappings are simply written
 897  * over.  The page *must* be wired.
 898  */
 899 void
 900 pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
 901 {
 902         vm_offset_t end_va;
 903
 904         end_va = va + count * PAGE_SIZE;
 905
 906         while (va < end_va) {
 907                 pt_entry_t *pte;
 908
 909                 pte = vtopte(va);
 910                 *pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V | pgeflag;
 911                 cpu_invlpg((void *)va);
 912                 va += PAGE_SIZE;
 913                 m++;
 914         }
 915 #ifdef SMP
 916         smp_invltlb();  /* XXX */
 917 #endif
 918 }
 919
 920 void
 921 pmap_qenter2(vm_offset_t va, vm_page_t *m, int count, cpumask_t *mask)
 922 {
 923         vm_offset_t end_va;
 924         cpumask_t cmask = mycpu->gd_cpumask;
 925
 926         end_va = va + count * PAGE_SIZE;
 927
 928         while (va < end_va) {
 929                 pt_entry_t *pte;
 930                 pt_entry_t pteval;
 931
 932                 /*
 933                  * Install the new PTE.  If the pte changed from the prior
 934                  * mapping we must reset the cpu mask and invalidate the page.
 935                  * If the pte is the same but we have not seen it on the
 936                  * current cpu, invlpg the existing mapping.  Otherwise the
 937                  * entry is optimal and no invalidation is required.
 938                  */
 939                 pte = vtopte(va);
 940                 pteval = VM_PAGE_TO_PHYS(*m) | PG_A | PG_RW | PG_V | pgeflag;
 941                 if (*pte != pteval) {
 942                         *mask = 0;
 943                         *pte = pteval;
 944                         cpu_invlpg((void *)va);
 945                 } else if ((*mask & cmask) == 0) {
 946                         cpu_invlpg((void *)va);
 947                 }
 948                 va += PAGE_SIZE;
 949                 m++;
 950         }
 951         *mask |= cmask;
 952 }
 953
 954 /*
 955  * this routine jerks page mappings from the
 956  * kernel -- it is meant only for temporary mappings.
 957  */
 958 void
 959 pmap_qremove(vm_offset_t va, int count)
 960 {
 961         vm_offset_t end_va;
 962
 963         end_va = va + count*PAGE_SIZE;
 964
 965         while (va < end_va) {
 966                 pt_entry_t *pte;
 967
 968                 pte = vtopte(va);
 969                 *pte = 0;
 970                 cpu_invlpg((void *)va);
 971                 va += PAGE_SIZE;
 972         }
 973 #ifdef SMP
 974         smp_invltlb();
 975 #endif
 976 }
 977
 978 /*
 979  * This routine works like vm_page_lookup() but also blocks as long as the
 980  * page is busy.  This routine does not busy the page it returns.
 981  *
 982  * Unless the caller is managing objects whos pages are in a known state,
 983  * the call should be made with a critical section held so the page's object
 984  * association remains valid on return.
 985  */
 986 static vm_page_t
 987 pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
 988 {
 989         vm_page_t m;
 990
 991         do {
 992                 m = vm_page_lookup(object, pindex);
 993         } while (m && vm_page_sleep_busy(m, FALSE, "pplookp"));
 994
 995         return(m);
 996 }
 997
 998 /*
 999  * Create a new thread and optionally associate it with a (new) process.
1000  * NOTE! the new thread's cpu may not equal the current cpu.
1001  */
1002 void
1003 pmap_init_thread(thread_t td)
1004 {
1005         /* enforce pcb placement */
1006         td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
1007         td->td_savefpu = &td->td_pcb->pcb_save;
1008         td->td_sp = (char *)td->td_pcb - 16;
1009 }
1010
/*
 * Per-process pmap initialization hook.
 *
 * This routine directly affects the fork perf for a process.  The body
 * is empty in this implementation: p is not examined or modified.
 */
void
pmap_init_proc(struct proc *p)
{
}
1018
/*
 * Dispose the UPAGES for a process that has exited.
 * This routine directly impacts the exit perf of a process.
 *
 * In this implementation the body only asserts (KASSERT) that p->p_lock
 * is zero, i.e. that the process is not referenced; nothing is released
 * here.
 */
void
pmap_dispose_proc(struct proc *p)
{
        KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p));
}
1028
1029 /***************************************************
1030  * Page table page management routines.....
1031  ***************************************************/
1032
1033 /*
1034  * This routine unholds page table pages, and if the hold count
1035  * drops to zero, then it decrements the wire count.
1036  */
1037 static int
1038 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
1039 {
1040         /*
1041          * Wait until we can busy the page ourselves.  We cannot have
1042          * any active flushes if we block.
1043          */
1044         if (m->flags & PG_BUSY) {
1045                 pmap_inval_flush(info);
1046                 while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
1047                         ;
1048         }
1049         KASSERT(m->queue == PQ_NONE,
1050                 ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m));
1051
1052         if (m->hold_count == 1) {
1053                 /*
1054                  * Unmap the page table page
1055                  */
1056                 vm_page_busy(m);
1057                 pmap_inval_add(info, pmap, -1);
1058                 pmap->pm_pdir[m->pindex] = 0;
1059
1060                 KKASSERT(pmap->pm_stats.resident_count > 0);
1061                 --pmap->pm_stats.resident_count;
1062
1063                 if (pmap->pm_ptphint == m)
1064                         pmap->pm_ptphint = NULL;
1065
1066                 /*
1067                  * This was our last hold, the page had better be unwired
1068                  * after we decrement wire_count.
1069                  *
1070                  * FUTURE NOTE: shared page directory page could result in
1071                  * multiple wire counts.
1072                  */
1073                 vm_page_unhold(m);
1074                 --m->wire_count;
1075                 KKASSERT(m->wire_count == 0);
1076                 --vmstats.v_wire_count;
1077                 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1078                 vm_page_flash(m);
1079                 vm_page_free_zero(m);
1080                 return 1;
1081         } else {
1082                 KKASSERT(m->hold_count > 1);
1083                 vm_page_unhold(m);
1084                 return 0;
1085         }
1086 }
1087
1088 static PMAP_INLINE int
1089 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
1090 {
1091         KKASSERT(m->hold_count > 0);
1092         if (m->hold_count > 1) {
1093                 vm_page_unhold(m);
1094                 return 0;
1095         } else {
1096                 return _pmap_unwire_pte_hold(pmap, m, info);
1097         }
1098 }
1099
1100 /*
1101  * After removing a page table entry, this routine is used to
1102  * conditionally free the page, and manage the hold/wire counts.
1103  */
1104 static int
1105 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
1106                 pmap_inval_info_t info)
1107 {
1108         vm_pindex_t ptepindex;
1109         if (va >= UPT_MIN_ADDRESS)
1110                 return 0;
1111
1112         if (mpte == NULL) {
1113                 ptepindex = (va >> PDRSHIFT);
1114                 if (pmap->pm_ptphint &&
1115                         (pmap->pm_ptphint->pindex == ptepindex)) {
1116                         mpte = pmap->pm_ptphint;
1117                 } else {
1118                         pmap_inval_flush(info);
1119                         mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1120                         pmap->pm_ptphint = mpte;
1121                 }
1122         }
1123
1124         return pmap_unwire_pte_hold(pmap, mpte, info);
1125 }
1126
1127 /*
1128  * Initialize pmap0/vmspace0.  This pmap is not added to pmap_list because
1129  * it, and IdlePTD, represents the template used to update all other pmaps.
1130  *
1131  * On architectures where the kernel pmap is not integrated into the user
1132  * process pmap, this pmap represents the process pmap, not the kernel pmap.
1133  * kernel_pmap should be used to directly access the kernel_pmap.
1134  */
1135 void
1136 pmap_pinit0(struct pmap *pmap)
1137 {
1138         pmap->pm_pdir =
1139                 (pd_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
1140         pmap_kenter((vm_offset_t)pmap->pm_pdir, (vm_offset_t) IdlePTD);
1141         pmap->pm_count = 1;
1142         pmap->pm_active = 0;
1143         pmap->pm_ptphint = NULL;
1144         TAILQ_INIT(&pmap->pm_pvlist);
1145         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1146 }
1147
1148 /*
1149  * Initialize a preallocated and zeroed pmap structure,
1150  * such as one in a vmspace structure.
1151  */
1152 void
1153 pmap_pinit(struct pmap *pmap)
1154 {
1155         vm_page_t ptdpg;
1156
1157         /*
1158          * No need to allocate page table space yet but we do need a valid
1159          * page directory table.
1160          */
1161         if (pmap->pm_pdir == NULL) {
1162                 pmap->pm_pdir =
1163                     (pd_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
1164         }
1165
1166         /*
1167          * Allocate an object for the ptes
1168          */
1169         if (pmap->pm_pteobj == NULL)
1170                 pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1);
1171
1172         /*
1173          * Allocate the page directory page, unless we already have
1174          * one cached.  If we used the cached page the wire_count will
1175          * already be set appropriately.
1176          */
1177         if ((ptdpg = pmap->pm_pdirm) == NULL) {
1178                 ptdpg = vm_page_grab(pmap->pm_pteobj, PTDPTDI,
1179                                      VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
1180                 pmap->pm_pdirm = ptdpg;
1181                 vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY);
1182                 ptdpg->valid = VM_PAGE_BITS_ALL;
1183                 ptdpg->wire_count = 1;
1184                 ++vmstats.v_wire_count;
1185                 pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
1186         }
1187         if ((ptdpg->flags & PG_ZERO) == 0)
1188                 bzero(pmap->pm_pdir, PAGE_SIZE);
1189
1190         pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1191
1192         /* install self-referential address mapping entry */
1193         *(pd_entry_t *) (pmap->pm_pdir + PTDPTDI) =
1194                 VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M;
1195
1196         pmap->pm_count = 1;
1197         pmap->pm_active = 0;
1198         pmap->pm_ptphint = NULL;
1199         TAILQ_INIT(&pmap->pm_pvlist);
1200         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1201         pmap->pm_stats.resident_count = 1;
1202 }
1203
1204 /*
1205  * Clean up a pmap structure so it can be physically freed.  This routine
1206  * is called by the vmspace dtor function.  A great deal of pmap data is
1207  * left passively mapped to improve vmspace management so we have a bit
1208  * of cleanup work to do here.
1209  */
1210 void
1211 pmap_puninit(pmap_t pmap)
1212 {
1213         vm_page_t p;
1214
1215         KKASSERT(pmap->pm_active == 0);
1216         if ((p = pmap->pm_pdirm) != NULL) {
1217                 KKASSERT(pmap->pm_pdir != NULL);
1218                 pmap_kremove((vm_offset_t)pmap->pm_pdir);
1219                 p->wire_count--;
1220                 vmstats.v_wire_count--;
1221                 KKASSERT((p->flags & PG_BUSY) == 0);
1222                 vm_page_busy(p);
1223                 vm_page_free_zero(p);
1224                 pmap->pm_pdirm = NULL;
1225         }
1226         if (pmap->pm_pdir) {
1227                 kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pdir, PAGE_SIZE);
1228                 pmap->pm_pdir = NULL;
1229         }
1230         if (pmap->pm_pteobj) {
1231                 vm_object_deallocate(pmap->pm_pteobj);
1232                 pmap->pm_pteobj = NULL;
1233         }
1234 }
1235
1236 /*
1237  * Wire in kernel global address entries.  To avoid a race condition
1238  * between pmap initialization and pmap_growkernel, this procedure
1239  * adds the pmap to the master list (which growkernel scans to update),
1240  * then copies the template.
1241  */
1242 void
1243 pmap_pinit2(struct pmap *pmap)
1244 {
1245         crit_enter();
1246         TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
1247         /* XXX copies current process, does not fill in MPPTDI */
1248         bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
1249         crit_exit();
1250 }
1251
1252 /*
1253  * Attempt to release and free a vm_page in a pmap.  Returns 1 on success,
1254  * 0 on failure (if the procedure had to sleep).
1255  *
1256  * When asked to remove the page directory page itself, we actually just
1257  * leave it cached so we do not have to incur the SMP inval overhead of
1258  * removing the kernel mapping.  pmap_puninit() will take care of it.
1259  */
1260 static int
1261 pmap_release_free_page(struct pmap *pmap, vm_page_t p)
1262 {
1263         pd_entry_t *pde = (pd_entry_t *) pmap->pm_pdir;
1264         /*
1265          * This code optimizes the case of freeing non-busy
1266          * page-table pages.  Those pages are zero now, and
1267          * might as well be placed directly into the zero queue.
1268          */
1269         if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
1270                 return 0;
1271
1272         vm_page_busy(p);
1273
1274         /*
1275          * Remove the page table page from the processes address space.
1276          */
1277         pde[p->pindex] = 0;
1278         KKASSERT(pmap->pm_stats.resident_count > 0);
1279         --pmap->pm_stats.resident_count;
1280
1281         if (p->hold_count)  {
1282                 panic("pmap_release: freeing held page table page");
1283         }
1284         if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
1285                 pmap->pm_ptphint = NULL;
1286
1287         /*
1288          * We leave the page directory page cached, wired, and mapped in
1289          * the pmap until the dtor function (pmap_puninit()) gets called.
1290          * However, still clean it up so we can set PG_ZERO.
1291          */
1292         if (p->pindex == PTDPTDI) {
1293                 bzero(pde + KPTDI, nkpt * PTESIZE);
1294                 pde[MPPTDI] = 0;
1295                 pde[APTDPTDI] = 0;
1296                 vm_page_flag_set(p, PG_ZERO);
1297                 vm_page_wakeup(p);
1298         } else {
1299                 p->wire_count--;
1300                 vmstats.v_wire_count--;
1301                 vm_page_free_zero(p);
1302         }
1303         return 1;
1304 }
1305
1306 /*
1307  * this routine is called if the page table page is not
1308  * mapped correctly.
1309  */
1310 static vm_page_t
1311 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
1312 {
1313         vm_offset_t pteva, ptepa;
1314         vm_page_t m;
1315
1316         /*
1317          * Find or fabricate a new pagetable page
1318          */
1319         m = vm_page_grab(pmap->pm_pteobj, ptepindex,
1320                         VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
1321
1322         KASSERT(m->queue == PQ_NONE,
1323                 ("_pmap_allocpte: %p->queue != PQ_NONE", m));
1324
1325         /*
1326          * Increment the hold count for the page we will be returning to
1327          * the caller.
1328          */
1329         m->hold_count++;
1330
1331         /*
1332          * It is possible that someone else got in and mapped by the page
1333          * directory page while we were blocked, if so just unbusy and
1334          * return the held page.
1335          */
1336         if ((ptepa = pmap->pm_pdir[ptepindex]) != 0) {
1337                 KKASSERT((ptepa & PG_FRAME) == VM_PAGE_TO_PHYS(m));
1338                 vm_page_wakeup(m);
1339                 return(m);
1340         }
1341
1342         if (m->wire_count == 0)
1343                 vmstats.v_wire_count++;
1344         m->wire_count++;
1345
1346
1347         /*
1348          * Map the pagetable page into the process address space, if
1349          * it isn't already there.
1350          */
1351
1352         ++pmap->pm_stats.resident_count;
1353
1354         ptepa = VM_PAGE_TO_PHYS(m);
1355         pmap->pm_pdir[ptepindex] =
1356                 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1357
1358         /*
1359          * Set the page table hint
1360          */
1361         pmap->pm_ptphint = m;
1362
1363         /*
1364          * Try to use the new mapping, but if we cannot, then
1365          * do it with the routine that maps the page explicitly.
1366          */
1367         if ((m->flags & PG_ZERO) == 0) {
1368                 if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
1369                         (((pd_entry_t) PTDpde) & PG_FRAME)) {
1370                         pteva = UPT_MIN_ADDRESS + amd64_ptob(ptepindex);
1371                         bzero((caddr_t) pteva, PAGE_SIZE);
1372                 } else {
1373                         pmap_zero_page(ptepa);
1374                 }
1375         }
1376
1377         m->valid = VM_PAGE_BITS_ALL;
1378         vm_page_flag_clear(m, PG_ZERO);
1379         vm_page_flag_set(m, PG_MAPPED);
1380         vm_page_wakeup(m);
1381
1382         return m;
1383 }
1384
1385 static vm_page_t
1386 pmap_allocpte(pmap_t pmap, vm_offset_t va)
1387 {
1388         vm_pindex_t ptepindex;
1389         vm_offset_t ptepa;
1390         vm_page_t m;
1391
1392         /*
1393          * Calculate pagetable page index
1394          */
1395         ptepindex = va >> PDRSHIFT;
1396
1397         /*
1398          * Get the page directory entry
1399          */
1400         ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
1401
1402         /*
1403          * This supports switching from a 4MB page to a
1404          * normal 4K page.
1405          */
1406         if (ptepa & PG_PS) {
1407                 pmap->pm_pdir[ptepindex] = 0;
1408                 ptepa = 0;
1409                 cpu_invltlb();
1410                 smp_invltlb();
1411         }
1412
1413         /*
1414          * If the page table page is mapped, we just increment the
1415          * hold count, and activate it.
1416          */
1417         if (ptepa) {
1418                 /*
1419                  * In order to get the page table page, try the
1420                  * hint first.
1421                  */
1422                 if (pmap->pm_ptphint &&
1423                         (pmap->pm_ptphint->pindex == ptepindex)) {
1424                         m = pmap->pm_ptphint;
1425                 } else {
1426                         m = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
1427                         pmap->pm_ptphint = m;
1428                 }
1429                 m->hold_count++;
1430                 return m;
1431         }
1432         /*
1433          * Here if the pte page isn't mapped, or if it has been deallocated.
1434          */
1435         return _pmap_allocpte(pmap, ptepindex);
1436 }
1437
1438
1439 /***************************************************
1440  * Pmap allocation/deallocation routines.
1441  ***************************************************/
1442
1443 /*
1444  * Release any resources held by the given physical map.
1445  * Called when a pmap initialized by pmap_pinit is being released.
1446  * Should only be called if the map contains no valid mappings.
1447  */
1448 static int pmap_release_callback(struct vm_page *p, void *data);
1449
1450 void
1451 pmap_release(struct pmap *pmap)
1452 {
1453         vm_object_t object = pmap->pm_pteobj;
1454         struct rb_vm_page_scan_info info;
1455
1456         KASSERT(pmap->pm_active == 0, ("pmap still active! %08x", pmap->pm_active));
1457 #if defined(DIAGNOSTIC)
1458         if (object->ref_count != 1)
1459                 panic("pmap_release: pteobj reference count != 1");
1460 #endif
1461
1462         info.pmap = pmap;
1463         info.object = object;
1464         crit_enter();
1465         TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
1466         crit_exit();
1467
1468         do {
1469                 crit_enter();
1470                 info.error = 0;
1471                 info.mpte = NULL;
1472                 info.limit = object->generation;
1473
1474                 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1475                                         pmap_release_callback, &info);
1476                 if (info.error == 0 && info.mpte) {
1477                         if (!pmap_release_free_page(pmap, info.mpte))
1478                                 info.error = 1;
1479                 }
1480                 crit_exit();
1481         } while (info.error);
1482 }
1483
1484 static int
1485 pmap_release_callback(struct vm_page *p, void *data)
1486 {
1487         struct rb_vm_page_scan_info *info = data;
1488
1489         if (p->pindex == PTDPTDI) {
1490                 info->mpte = p;
1491                 return(0);
1492         }
1493         if (!pmap_release_free_page(info->pmap, p)) {
1494                 info->error = 1;
1495                 return(-1);
1496         }
1497         if (info->object->generation != info->limit) {
1498                 info->error = 1;
1499                 return(-1);
1500         }
1501         return(0);
1502 }
1503
1504 /*
1505  * Grow the number of kernel page table entries, if needed.
1506  */
1507
1508 void
1509 pmap_growkernel(vm_offset_t addr)
1510 {
1511         struct pmap *pmap;
1512         vm_offset_t ptppaddr;
1513         vm_page_t nkpg;
1514         pd_entry_t newpdir;
1515
1516         crit_enter();
1517         if (kernel_vm_end == 0) {
1518                 kernel_vm_end = KERNBASE;
1519                 nkpt = 0;
1520                 while (pdir_pde(PTD, kernel_vm_end)) {
1521                         kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1522                         nkpt++;
1523                 }
1524         }
1525         addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1526         while (kernel_vm_end < addr) {
1527                 if (pdir_pde(PTD, kernel_vm_end)) {
1528                         kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1529                         continue;
1530                 }
1531
1532                 /*
1533                  * This index is bogus, but out of the way
1534                  */
1535                 nkpg = vm_page_alloc(kptobj, nkpt,
1536                         VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT);
1537                 if (nkpg == NULL)
1538                         panic("pmap_growkernel: no memory to grow kernel");
1539
1540                 vm_page_wire(nkpg);
1541                 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1542                 pmap_zero_page(ptppaddr);
1543                 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1544                 pdir_pde(PTD, kernel_vm_end) = newpdir;
1545                 *pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir;
1546                 nkpt++;
1547
1548                 /*
1549                  * This update must be interlocked with pmap_pinit2.
1550                  */
1551                 TAILQ_FOREACH(pmap, &pmap_list, pm_pmnode) {
1552                         *pmap_pde(pmap, kernel_vm_end) = newpdir;
1553                 }
1554                 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
1555                                 ~(PAGE_SIZE * NPTEPG - 1);
1556         }
1557         crit_exit();
1558 }
1559
1560 /*
1561  *      Retire the given physical map from service.
1562  *      Should only be called if the map contains
1563  *      no valid mappings.
1564  */
1565 void
1566 pmap_destroy(pmap_t pmap)
1567 {
1568         int count;
1569
1570         if (pmap == NULL)
1571                 return;
1572
1573         count = --pmap->pm_count;
1574         if (count == 0) {
1575                 pmap_release(pmap);
1576                 panic("destroying a pmap is not yet implemented");
1577         }
1578 }
1579
1580 /*
1581  *      Add a reference to the specified pmap.
1582  */
1583 void
1584 pmap_reference(pmap_t pmap)
1585 {
1586         if (pmap != NULL) {
1587                 pmap->pm_count++;
1588         }
1589 }
1590
1591 /***************************************************
1592 * page management routines.
1593  ***************************************************/
1594
1595 /*
1596  * free the pv_entry back to the free list.  This function may be
1597  * called from an interrupt.
1598  */
1599 static PMAP_INLINE void
1600 free_pv_entry(pv_entry_t pv)
1601 {
1602         pv_entry_count--;
1603         zfree(pvzone, pv);
1604 }
1605
1606 /*
1607  * get a new pv_entry, allocating a block from the system
1608  * when needed.  This function may be called from an interrupt.
1609  */
1610 static pv_entry_t
1611 get_pv_entry(void)
1612 {
1613         pv_entry_count++;
1614         if (pv_entry_high_water &&
1615             (pv_entry_count > pv_entry_high_water) &&
1616             (pmap_pagedaemon_waken == 0)) {
1617                 pmap_pagedaemon_waken = 1;
1618                 wakeup (&vm_pages_needed);
1619         }
1620         return zalloc(pvzone);
1621 }
1622
1623 /*
1624  * This routine is very drastic, but can save the system
1625  * in a pinch.
1626  */
1627 void
1628 pmap_collect(void)
1629 {
1630         int i;
1631         vm_page_t m;
1632         static int warningdone=0;
1633
1634         if (pmap_pagedaemon_waken == 0)
1635                 return;
1636         pmap_pagedaemon_waken = 0;
1637
1638         if (warningdone < 5) {
1639                 kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1640                 warningdone++;
1641         }
1642
1643         for(i = 0; i < vm_page_array_size; i++) {
1644                 m = &vm_page_array[i];
1645                 if (m->wire_count || m->hold_count || m->busy ||
1646                     (m->flags & PG_BUSY))
1647                         continue;
1648                 pmap_remove_all(m);
1649         }
1650 }
1651
1652
1653 /*
1654  * If it is the first entry on the list, it is actually
1655  * in the header and we must copy the following entry up
1656  * to the header.  Otherwise we must search the list for
1657  * the entry.  In either case we free the now unused entry.
1658  */
1659 static int
1660 pmap_remove_entry(struct pmap *pmap, vm_page_t m,
1661                         vm_offset_t va, pmap_inval_info_t info)
1662 {
1663         pv_entry_t pv;
1664         int rtval;
1665
1666         crit_enter();
1667         if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1668                 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1669                         if (pmap == pv->pv_pmap && va == pv->pv_va)
1670                                 break;
1671                 }
1672         } else {
1673                 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1674                         if (va == pv->pv_va)
1675                                 break;
1676                 }
1677         }
1678
1679         rtval = 0;
1680         if (pv) {
1681                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1682                 m->md.pv_list_count--;
1683                 if (TAILQ_EMPTY(&m->md.pv_list))
1684                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1685                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1686                 ++pmap->pm_generation;
1687                 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info);
1688                 free_pv_entry(pv);
1689         }
1690         crit_exit();
1691         return rtval;
1692 }
1693
1694 /*
1695  * Create a pv entry for page at pa for
1696  * (pmap, va).
1697  */
1698 static void
1699 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
1700 {
1701         pv_entry_t pv;
1702
1703         crit_enter();
1704         pv = get_pv_entry();
1705         pv->pv_va = va;
1706         pv->pv_pmap = pmap;
1707         pv->pv_ptem = mpte;
1708
1709         TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1710         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1711         m->md.pv_list_count++;
1712
1713         crit_exit();
1714 }
1715
1716 /*
1717  * pmap_remove_pte: do the things to unmap a page in a process
1718  */
1719 static int
1720 pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
1721         pmap_inval_info_t info)
1722 {
1723         pt_entry_t oldpte;
1724         vm_page_t m;
1725
1726         pmap_inval_add(info, pmap, va);
1727         oldpte = pte_load_clear(ptq);
1728         if (oldpte & PG_W)
1729                 pmap->pm_stats.wired_count -= 1;
1730         /*
1731          * Machines that don't support invlpg, also don't support
1732          * PG_G.  XXX PG_G is disabled for SMP so don't worry about
1733          * the SMP case.
1734          */
1735         if (oldpte & PG_G)
1736                 cpu_invlpg((void *)va);
1737         KKASSERT(pmap->pm_stats.resident_count > 0);
1738         --pmap->pm_stats.resident_count;
1739         if (oldpte & PG_MANAGED) {
1740                 m = PHYS_TO_VM_PAGE(oldpte);
1741                 if (oldpte & PG_M) {
1742 #if defined(PMAP_DIAGNOSTIC)
1743                         if (pmap_nw_modified((pt_entry_t) oldpte)) {
1744                                 kprintf(
1745         "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1746                                     va, oldpte);
1747                         }
1748 #endif
1749                         if (pmap_track_modified(va))
1750                                 vm_page_dirty(m);
1751                 }
1752                 if (oldpte & PG_A)
1753                         vm_page_flag_set(m, PG_REFERENCED);
1754                 return pmap_remove_entry(pmap, m, va, info);
1755         } else {
1756                 return pmap_unuse_pt(pmap, va, NULL, info);
1757         }
1758
1759         return 0;
1760 }
1761
1762 /*
1763  * pmap_remove_page:
1764  *
1765  *      Remove a single page from a process address space.
1766  *
1767  *      This function may not be called from an interrupt if the pmap is
1768  *      not kernel_pmap.
1769  */
1770 static void
1771 pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
1772 {
1773         pt_entry_t *ptq;
1774
1775         /*
1776          * if there is no pte for this address, just skip it!!!  Otherwise
1777          * get a local va for mappings for this pmap and remove the entry.
1778          */
1779         if (*pmap_pde(pmap, va) != 0) {
1780                 ptq = get_ptbase(pmap) + amd64_btop(va);
1781                 if (*ptq) {
1782                         pmap_remove_pte(pmap, ptq, va, info);
1783                 }
1784         }
1785 }
1786
1787 /*
1788  * pmap_remove:
1789  *
1790  *      Remove the given range of addresses from the specified map.
1791  *
1792  *      It is assumed that the start and end are properly
1793  *      rounded to the page size.
1794  *
1795  *      This function may not be called from an interrupt if the pmap is
1796  *      not kernel_pmap.
1797  */
1798 void
1799 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
1800 {
1801         pt_entry_t *ptbase;
1802         vm_offset_t pdnxt;
1803         vm_offset_t ptpaddr;
1804         vm_offset_t sindex, eindex;
1805         struct pmap_inval_info info;
1806
1807         if (pmap == NULL)
1808                 return;
1809
1810         if (pmap->pm_stats.resident_count == 0)
1811                 return;
1812
1813         pmap_inval_init(&info);
1814
1815         /*
1816          * special handling of removing one page.  a very
1817          * common operation and easy to short circuit some
1818          * code.
1819          */
1820         if (((sva + PAGE_SIZE) == eva) &&
1821                 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1822                 pmap_remove_page(pmap, sva, &info);
1823                 pmap_inval_flush(&info);
1824                 return;
1825         }
1826
1827         /*
1828          * Get a local virtual address for the mappings that are being
1829          * worked with.
1830          */
1831         sindex = amd64_btop(sva);
1832         eindex = amd64_btop(eva);
1833
1834         for (; sindex < eindex; sindex = pdnxt) {
1835                 vm_pindex_t pdirindex;
1836
1837                 /*
1838                  * Calculate index for next page table.
1839                  */
1840                 pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1841                 if (pmap->pm_stats.resident_count == 0)
1842                         break;
1843
1844                 pdirindex = sindex / NPDEPG;
1845                 if (((ptpaddr = pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
1846                         pmap_inval_add(&info, pmap, -1);
1847                         pmap->pm_pdir[pdirindex] = 0;
1848                         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1849                         continue;
1850                 }
1851
1852                 /*
1853                  * Weed out invalid mappings. Note: we assume that the page
1854                  * directory table is always allocated, and in kernel virtual.
1855                  */
1856                 if (ptpaddr == 0)
1857                         continue;
1858
1859                 /*
1860                  * Limit our scan to either the end of the va represented
1861                  * by the current page table page, or to the end of the
1862                  * range being removed.
1863                  */
1864                 if (pdnxt > eindex) {
1865                         pdnxt = eindex;
1866                 }
1867
1868                 /*
1869                  * NOTE: pmap_remove_pte() can block.
1870                  */
1871                 for (; sindex != pdnxt; sindex++) {
1872                         vm_offset_t va;
1873
1874                         ptbase = get_ptbase(pmap);
1875                         if (ptbase[sindex] == 0)
1876                                 continue;
1877                         va = amd64_ptob(sindex);
1878                         if (pmap_remove_pte(pmap, ptbase + sindex, va, &info))
1879                                 break;
1880                 }
1881         }
1882         pmap_inval_flush(&info);
1883 }
1884
1885 /*
1886  * pmap_remove_all:
1887  *
1888  *      Removes this physical page from all physical maps in which it resides.
1889  *      Reflects back modify bits to the pager.
1890  *
1891  *      This routine may not be called from an interrupt.
1892  */
1893
1894 static void
1895 pmap_remove_all(vm_page_t m)
1896 {
1897         struct pmap_inval_info info;
1898         pt_entry_t *pte, tpte;
1899         pv_entry_t pv;
1900
1901         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
1902                 return;
1903
1904         pmap_inval_init(&info);
1905         crit_enter();
1906         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1907                 KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0);
1908                 --pv->pv_pmap->pm_stats.resident_count;
1909
1910                 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1911                 pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
1912                 tpte = pte_load_clear(pte);
1913
1914                 if (tpte & PG_W)
1915                         pv->pv_pmap->pm_stats.wired_count--;
1916
1917                 if (tpte & PG_A)
1918                         vm_page_flag_set(m, PG_REFERENCED);
1919
1920                 /*
1921                  * Update the vm_page_t clean and reference bits.
1922                  */
1923                 if (tpte & PG_M) {
1924 #if defined(PMAP_DIAGNOSTIC)
1925                         if (pmap_nw_modified((pt_entry_t) tpte)) {
1926                                 kprintf(
1927         "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1928                                     pv->pv_va, tpte);
1929                         }
1930 #endif
1931                         if (pmap_track_modified(pv->pv_va))
1932                                 vm_page_dirty(m);
1933                 }
1934                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1935                 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1936                 ++pv->pv_pmap->pm_generation;
1937                 m->md.pv_list_count--;
1938                 if (TAILQ_EMPTY(&m->md.pv_list))
1939                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
1940                 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info);
1941                 free_pv_entry(pv);
1942         }
1943         crit_exit();
1944         KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
1945         pmap_inval_flush(&info);
1946 }
1947
1948 /*
1949  * pmap_protect:
1950  *
1951  *      Set the physical protection on the specified range of this map
1952  *      as requested.
1953  *
1954  *      This function may not be called from an interrupt if the map is
1955  *      not the kernel_pmap.
1956  */
1957 void
1958 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1959 {
1960         pt_entry_t *ptbase;
1961         vm_offset_t pdnxt, ptpaddr;
1962         vm_pindex_t sindex, eindex;
1963         pmap_inval_info info;
1964
1965         if (pmap == NULL)
1966                 return;
1967
1968         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1969                 pmap_remove(pmap, sva, eva);
1970                 return;
1971         }
1972
1973         if (prot & VM_PROT_WRITE)
1974                 return;
1975
1976         pmap_inval_init(&info);
1977
1978         ptbase = get_ptbase(pmap);
1979
1980         sindex = amd64_btop(sva);
1981         eindex = amd64_btop(eva);
1982
1983         for (; sindex < eindex; sindex = pdnxt) {
1984
1985                 vm_pindex_t pdirindex;
1986
1987                 pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
1988
1989                 pdirindex = sindex / NPDEPG;
1990                 if (((ptpaddr = pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
1991                         pmap_inval_add(&info, pmap, -1);
1992                         pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1993                         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1994                         continue;
1995                 }
1996
1997                 /*
1998                  * Weed out invalid mappings. Note: we assume that the page
1999                  * directory table is always allocated, and in kernel virtual.
2000                  */
2001                 if (ptpaddr == 0)
2002                         continue;
2003
2004                 if (pdnxt > eindex) {
2005                         pdnxt = eindex;
2006                 }
2007
2008                 for (; sindex != pdnxt; sindex++) {
2009
2010                         pt_entry_t pbits;
2011                         vm_page_t m;
2012
2013                         /*
2014                          * XXX non-optimal.  Note also that there can be
2015                          * no pmap_inval_flush() calls until after we modify
2016                          * ptbase[sindex] (or otherwise we have to do another
2017                          * pmap_inval_add() call).
2018                          */
2019                         pmap_inval_add(&info, pmap, amd64_ptob(sindex));
2020                         pbits = ptbase[sindex];
2021
2022                         if (pbits & PG_MANAGED) {
2023                                 m = NULL;
2024                                 if (pbits & PG_A) {
2025                                         m = PHYS_TO_VM_PAGE(pbits);
2026                                         vm_page_flag_set(m, PG_REFERENCED);
2027                                         pbits &= ~PG_A;
2028                                 }
2029                                 if (pbits & PG_M) {
2030                                         if (pmap_track_modified(amd64_ptob(sindex))) {
2031                                                 if (m == NULL)
2032                                                         m = PHYS_TO_VM_PAGE(pbits);
2033                                                 vm_page_dirty(m);
2034                                                 pbits &= ~PG_M;
2035                                         }
2036                                 }
2037                         }
2038
2039                         pbits &= ~PG_RW;
2040
2041                         if (pbits != ptbase[sindex]) {
2042                                 ptbase[sindex] = pbits;
2043                         }
2044                 }
2045         }
2046         pmap_inval_flush(&info);
2047 }
2048
2049 /*
2050  *      Insert the given physical page (p) at
2051  *      the specified virtual address (v) in the
2052  *      target physical map with the protection requested.
2053  *
2054  *      If specified, the page will be wired down, meaning
2055  *      that the related pte can not be reclaimed.
2056  *
2057  *      NB:  This is the only routine which MAY NOT lazy-evaluate
2058  *      or lose information.  That is, this routine must actually
2059  *      insert this page into the given map NOW.
2060  */
2061 void
2062 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2063            boolean_t wired)
2064 {
2065         vm_paddr_t pa;
2066         pt_entry_t *pte;
2067         vm_paddr_t opa;
2068         vm_offset_t origpte, newpte;
2069         vm_page_t mpte;
2070         pmap_inval_info info;
2071
2072         if (pmap == NULL)
2073                 return;
2074
2075         va &= PG_FRAME;
2076 #ifdef PMAP_DIAGNOSTIC
2077         if (va >= KvaEnd)
2078                 panic("pmap_enter: toobig");
2079         if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2080                 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
2081 #endif
2082         if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
2083                 kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n");
2084 #ifdef DDB
2085                 db_print_backtrace();
2086 #endif
2087         }
2088         if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
2089                 kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n");
2090 #ifdef DDB
2091                 db_print_backtrace();
2092 #endif
2093         }
2094
2095         /*
2096          * In the case that a page table page is not
2097          * resident, we are creating it here.
2098          */
2099         if (va < UPT_MIN_ADDRESS)
2100                 mpte = pmap_allocpte(pmap, va);
2101         else
2102                 mpte = NULL;
2103
2104         pmap_inval_init(&info);
2105         pte = pmap_pte(pmap, va);
2106
2107         /*
2108          * Page Directory table entry not valid, we need a new PT page
2109          */
2110         if (pte == NULL) {
2111                 panic("pmap_enter: invalid page directory pdir=%x, va=0x%x\n",
2112                      pmap->pm_pdir[PTDPTDI], va);
2113         }
2114
2115         pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
2116         origpte = *(vm_offset_t *)pte;
2117         opa = origpte & PG_FRAME;
2118
2119         if (origpte & PG_PS)
2120                 panic("pmap_enter: attempted pmap_enter on 4MB page");
2121
2122         /*
2123          * Mapping has not changed, must be protection or wiring change.
2124          */
2125         if (origpte && (opa == pa)) {
2126                 /*
2127                  * Wiring change, just update stats. We don't worry about
2128                  * wiring PT pages as they remain resident as long as there
2129                  * are valid mappings in them. Hence, if a user page is wired,
2130                  * the PT page will be also.
2131                  */
2132                 if (wired && ((origpte & PG_W) == 0))
2133                         pmap->pm_stats.wired_count++;
2134                 else if (!wired && (origpte & PG_W))
2135                         pmap->pm_stats.wired_count--;
2136
2137 #if defined(PMAP_DIAGNOSTIC)
2138                 if (pmap_nw_modified((pt_entry_t) origpte)) {
2139                         kprintf(
2140         "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
2141                             va, origpte);
2142                 }
2143 #endif
2144
2145                 /*
2146                  * Remove the extra pte reference.  Note that we cannot
2147                  * optimize the RO->RW case because we have adjusted the
2148                  * wiring count above and may need to adjust the wiring
2149                  * bits below.
2150                  */
2151                 if (mpte)
2152                         mpte->hold_count--;
2153
2154                 /*
2155                  * We might be turning off write access to the page,
2156                  * so we go ahead and sense modify status.
2157                  */
2158                 if (origpte & PG_MANAGED) {
2159                         if ((origpte & PG_M) && pmap_track_modified(va)) {
2160                                 vm_page_t om;
2161                                 om = PHYS_TO_VM_PAGE(opa);
2162                                 vm_page_dirty(om);
2163                         }
2164                         pa |= PG_MANAGED;
2165                         KKASSERT(m->flags & PG_MAPPED);
2166                 }
2167                 goto validate;
2168         }
2169         /*
2170          * Mapping has changed, invalidate old range and fall through to
2171          * handle validating new mapping.
2172          */
2173         if (opa) {
2174                 int err;
2175                 err = pmap_remove_pte(pmap, pte, va, &info);
2176                 if (err)
2177                         panic("pmap_enter: pte vanished, va: 0x%x", va);
2178         }
2179
2180         /*
2181          * Enter on the PV list if part of our managed memory. Note that we
2182          * raise IPL while manipulating pv_table since pmap_enter can be
2183          * called at interrupt time.
2184          */
2185         if (pmap_initialized &&
2186             (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2187                 pmap_insert_entry(pmap, va, mpte, m);
2188                 pa |= PG_MANAGED;
2189                 vm_page_flag_set(m, PG_MAPPED);
2190         }
2191
2192         /*
2193          * Increment counters
2194          */
2195         ++pmap->pm_stats.resident_count;
2196         if (wired)
2197                 pmap->pm_stats.wired_count++;
2198
2199 validate:
2200         /*
2201          * Now validate mapping with desired protection/wiring.
2202          */
2203         newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | PG_V);
2204
2205         if (wired)
2206                 newpte |= PG_W;
2207         if (va < UPT_MIN_ADDRESS)
2208                 newpte |= PG_U;
2209         if (pmap == &kernel_pmap)
2210                 newpte |= pgeflag;
2211
2212         /*
2213          * if the mapping or permission bits are different, we need
2214          * to update the pte.
2215          */
2216         if ((origpte & ~(PG_M|PG_A)) != newpte) {
2217                 pmap_inval_add(&info, pmap, va);
2218                 *pte = newpte | PG_A;
2219                 if (newpte & PG_RW)
2220                         vm_page_flag_set(m, PG_WRITEABLE);
2221         }
2222         KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
2223         pmap_inval_flush(&info);
2224 }
2225
2226 /*
2227  * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
2228  * This code also assumes that the pmap has no pre-existing entry for this
2229  * VA.
2230  *
2231  * This code currently may only be used on user pmaps, not kernel_pmap.
2232  */
2233 static void
2234 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
2235 {
2236         pt_entry_t *pte;
2237         vm_paddr_t pa;
2238         vm_page_t mpte;
2239         vm_pindex_t ptepindex;
2240         vm_offset_t ptepa;
2241         pmap_inval_info info;
2242
2243         pmap_inval_init(&info);
2244
2245         if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
2246                 kprintf("Warning: pmap_enter_quick called on UVA with kernel_pmap\n");
2247 #ifdef DDB
2248                 db_print_backtrace();
2249 #endif
2250         }
2251         if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
2252                 kprintf("Warning: pmap_enter_quick called on KVA without kernel_pmap\n");
2253 #ifdef DDB
2254                 db_print_backtrace();
2255 #endif
2256         }
2257
2258         KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */
2259
2260         /*
2261          * Calculate the page table page (mpte), allocating it if necessary.
2262          *
2263          * A held page table page (mpte), or NULL, is passed onto the
2264          * section following.
2265          */
2266         if (va < UPT_MIN_ADDRESS) {
2267                 /*
2268                  * Calculate pagetable page index
2269                  */
2270                 ptepindex = va >> PDRSHIFT;
2271
2272                 do {
2273                         /*
2274                          * Get the page directory entry
2275                          */
2276                         ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
2277
2278                         /*
2279                          * If the page table page is mapped, we just increment
2280                          * the hold count, and activate it.
2281                          */
2282                         if (ptepa) {
2283                                 if (ptepa & PG_PS)
2284                                         panic("pmap_enter_quick: unexpected mapping into 4MB page");
2285                                 if (pmap->pm_ptphint &&
2286                                     (pmap->pm_ptphint->pindex == ptepindex)) {
2287                                         mpte = pmap->pm_ptphint;
2288                                 } else {
2289                                         mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
2290                                         pmap->pm_ptphint = mpte;
2291                                 }
2292                                 if (mpte)
2293                                         mpte->hold_count++;
2294                         } else {
2295                                 mpte = _pmap_allocpte(pmap, ptepindex);
2296                         }
2297                 } while (mpte == NULL);
2298         } else {
2299                 mpte = NULL;
2300                 /* this code path is not yet used */
2301         }
2302
2303         /*
2304          * With a valid (and held) page directory page, we can just use
2305          * vtopte() to get to the pte.  If the pte is already present
2306          * we do not disturb it.
2307          */
2308         pte = vtopte(va);
2309         if (*pte & PG_V) {
2310                 if (mpte)
2311                         pmap_unwire_pte_hold(pmap, mpte, &info);
2312                 pa = VM_PAGE_TO_PHYS(m);
2313                 KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
2314                 return;
2315         }
2316
2317         /*
2318          * Enter on the PV list if part of our managed memory
2319          */
2320         if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
2321                 pmap_insert_entry(pmap, va, mpte, m);
2322                 vm_page_flag_set(m, PG_MAPPED);
2323         }
2324
2325         /*
2326          * Increment counters
2327          */
2328         ++pmap->pm_stats.resident_count;
2329
2330         pa = VM_PAGE_TO_PHYS(m);
2331
2332         /*
2333          * Now validate mapping with RO protection
2334          */
2335         if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2336                 *pte = pa | PG_V | PG_U;
2337         else
2338                 *pte = pa | PG_V | PG_U | PG_MANAGED;
2339 /*      pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
2340         pmap_inval_flush(&info);
2341 }
2342
2343 /*
2344  * Make a temporary mapping for a physical address.  This is only intended
2345  * to be used for panic dumps.
2346  */
2347 void *
2348 pmap_kenter_temporary(vm_paddr_t pa, int i)
2349 {
2350         pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
2351         return ((void *)crashdumpmap);
2352 }
2353
2354 #define MAX_INIT_PT (96)
2355
2356 /*
2357  * This routine preloads the ptes for a given object into the specified pmap.
2358  * This eliminates the blast of soft faults on process startup and
2359  * immediately after an mmap.
2360  */
2361 static int pmap_object_init_pt_callback(vm_page_t p, void *data);
2362
2363 void
2364 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
2365                     vm_object_t object, vm_pindex_t pindex,
2366                     vm_size_t size, int limit)
2367 {
2368         struct rb_vm_page_scan_info info;
2369         struct lwp *lp;
2370         int psize;
2371
2372         /*
2373          * We can't preinit if read access isn't set or there is no pmap
2374          * or object.
2375          */
2376         if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
2377                 return;
2378
2379         /*
2380          * We can't preinit if the pmap is not the current pmap
2381          */
2382         lp = curthread->td_lwp;
2383         if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
2384                 return;
2385
2386         psize = amd64_btop(size);
2387
2388         if ((object->type != OBJT_VNODE) ||
2389                 ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
2390                         (object->resident_page_count > MAX_INIT_PT))) {
2391                 return;
2392         }
2393
2394         if (psize + pindex > object->size) {
2395                 if (object->size < pindex)
2396                         return;
2397                 psize = object->size - pindex;
2398         }
2399
2400         if (psize == 0)
2401                 return;
2402
2403         /*
2404          * Use a red-black scan to traverse the requested range and load
2405          * any valid pages found into the pmap.
2406          *
2407          * We cannot safely scan the object's memq unless we are in a
2408          * critical section since interrupts can remove pages from objects.
2409          */
2410         info.start_pindex = pindex;
2411         info.end_pindex = pindex + psize - 1;
2412         info.limit = limit;
2413         info.mpte = NULL;
2414         info.addr = addr;
2415         info.pmap = pmap;
2416
2417         crit_enter();
2418         vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
2419                                 pmap_object_init_pt_callback, &info);
2420         crit_exit();
2421 }
2422
2423 static
2424 int
2425 pmap_object_init_pt_callback(vm_page_t p, void *data)
2426 {
2427         struct rb_vm_page_scan_info *info = data;
2428         vm_pindex_t rel_index;
2429         /*
2430          * don't allow an madvise to blow away our really
2431          * free pages allocating pv entries.
2432          */
2433         if ((info->limit & MAP_PREFAULT_MADVISE) &&
2434                 vmstats.v_free_count < vmstats.v_free_reserved) {
2435                     return(-1);
2436         }
2437         if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2438             (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2439                 if ((p->queue - p->pc) == PQ_CACHE)
2440                         vm_page_deactivate(p);
2441                 vm_page_busy(p);
2442                 rel_index = p->pindex - info->start_pindex;
2443                 pmap_enter_quick(info->pmap,
2444                                  info->addr + amd64_ptob(rel_index), p);
2445                 vm_page_wakeup(p);
2446         }
2447         return(0);
2448 }
2449
2450 /*
2451  * pmap_prefault provides a quick way of clustering pagefaults into a
2452  * processes address space.  It is a "cousin" of pmap_object_init_pt,
2453  * except it runs at page fault time instead of mmap time.
2454  */
2455 #define PFBAK 4
2456 #define PFFOR 4
2457 #define PAGEORDER_SIZE (PFBAK+PFFOR)
2458
2459 static int pmap_prefault_pageorder[] = {
2460         -PAGE_SIZE, PAGE_SIZE,
2461         -2 * PAGE_SIZE, 2 * PAGE_SIZE,
2462         -3 * PAGE_SIZE, 3 * PAGE_SIZE,
2463         -4 * PAGE_SIZE, 4 * PAGE_SIZE
2464 };
2465
2466 void
2467 pmap_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry)
2468 {
2469         int i;
2470         vm_offset_t starta;
2471         vm_offset_t addr;
2472         vm_pindex_t pindex;
2473         vm_page_t m;
2474         vm_object_t object;
2475         struct lwp *lp;
2476
2477         /*
2478          * We do not currently prefault mappings that use virtual page
2479          * tables.  We do not prefault foreign pmaps.
2480          */
2481         if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
2482                 return;
2483         lp = curthread->td_lwp;
2484         if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace)))
2485                 return;
2486
2487         object = entry->object.vm_object;
2488
2489         starta = addra - PFBAK * PAGE_SIZE;
2490         if (starta < entry->start)
2491                 starta = entry->start;
2492         else if (starta > addra)
2493                 starta = 0;
2494
2495         /*
2496          * critical section protection is required to maintain the
2497          * page/object association, interrupts can free pages and remove
2498          * them from their objects.
2499          */
2500         crit_enter();
2501         for (i = 0; i < PAGEORDER_SIZE; i++) {
2502                 vm_object_t lobject;
2503                 pt_entry_t *pte;
2504
2505                 addr = addra + pmap_prefault_pageorder[i];
2506                 if (addr > addra + (PFFOR * PAGE_SIZE))
2507                         addr = 0;
2508
2509                 if (addr < starta || addr >= entry->end)
2510                         continue;
2511
2512                 if ((*pmap_pde(pmap, addr)) == 0)
2513                         continue;
2514
2515                 pte = vtopte(addr);
2516                 if (*pte)
2517                         continue;
2518
2519                 pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
2520                 lobject = object;
2521
2522                 for (m = vm_page_lookup(lobject, pindex);
2523                     (!m && (lobject->type == OBJT_DEFAULT) &&
2524                      (lobject->backing_object));
2525                     lobject = lobject->backing_object
2526                 ) {
2527                         if (lobject->backing_object_offset & PAGE_MASK)
2528                                 break;
2529                         pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
2530                         m = vm_page_lookup(lobject->backing_object, pindex);
2531                 }
2532
2533                 /*
2534                  * give-up when a page is not in memory
2535                  */
2536                 if (m == NULL)
2537                         break;
2538
2539                 if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
2540                         (m->busy == 0) &&
2541                     (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
2542
2543                         if ((m->queue - m->pc) == PQ_CACHE) {
2544                                 vm_page_deactivate(m);
2545                         }
2546                         vm_page_busy(m);
2547                         pmap_enter_quick(pmap, addr, m);
2548                         vm_page_wakeup(m);
2549                 }
2550         }
2551         crit_exit();
2552 }
2553
2554 /*
2555  *      Routine:        pmap_change_wiring
2556  *      Function:       Change the wiring attribute for a map/virtual-address
2557  *                      pair.
2558  *      In/out conditions:
2559  *                      The mapping must already exist in the pmap.
2560  */
2561 void
2562 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
2563 {
2564         pt_entry_t *pte;
2565
2566         if (pmap == NULL)
2567                 return;
2568
2569         pte = pmap_pte(pmap, va);
2570
2571         if (wired && !pmap_pte_w(pte))
2572                 pmap->pm_stats.wired_count++;
2573         else if (!wired && pmap_pte_w(pte))
2574                 pmap->pm_stats.wired_count--;
2575
2576         /*
2577          * Wiring is not a hardware characteristic so there is no need to
2578          * invalidate TLB.  However, in an SMP environment we must use
2579          * a locked bus cycle to update the pte (if we are not using
2580          * the pmap_inval_*() API that is)... it's ok to do this for simple
2581          * wiring changes.
2582          */
2583 #ifdef SMP
2584         if (wired)
2585                 atomic_set_int(pte, PG_W);
2586         else
2587                 atomic_clear_int(pte, PG_W);
2588 #else
2589         if (wired)
2590                 atomic_set_int_nonlocked(pte, PG_W);
2591         else
2592                 atomic_clear_int_nonlocked(pte, PG_W);
2593 #endif
2594 }
2595
2596
2597
2598 /*
2599  *      Copy the range specified by src_addr/len
2600  *      from the source map to the range dst_addr/len
2601  *      in the destination map.
2602  *
2603  *      This routine is only advisory and need not do anything.
2604  */
2605 void
2606 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
2607         vm_size_t len, vm_offset_t src_addr)
2608 {
2609         pmap_inval_info info;
2610         vm_offset_t addr;
2611         vm_offset_t end_addr = src_addr + len;
2612         vm_offset_t pdnxt;
2613         pd_entry_t src_frame, dst_frame;
2614         vm_page_t m;
2615
2616         if (dst_addr != src_addr)
2617                 return;
2618         /*
2619          * XXX BUGGY.  Amoung other things srcmpte is assumed to remain
2620          * valid through blocking calls, and that's just not going to
2621          * be the case.
2622          *
2623          * FIXME!
2624          */
2625         return;
2626
2627         src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
2628         if (src_frame != (PTDpde & PG_FRAME)) {
2629                 return;
2630         }
2631
2632         dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
2633         if (dst_frame != (APTDpde & PG_FRAME)) {
2634                 APTDpde = (pd_entry_t) (dst_frame | PG_RW | PG_V);
2635                 /* The page directory is not shared between CPUs */
2636                 cpu_invltlb();
2637         }
2638         pmap_inval_init(&info);
2639         pmap_inval_add(&info, dst_pmap, -1);
2640         pmap_inval_add(&info, src_pmap, -1);
2641
2642         /*
2643          * critical section protection is required to maintain the page/object
2644          * association, interrupts can free pages and remove them from
2645          * their objects.
2646          */
2647         crit_enter();
2648         for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2649                 pt_entry_t *src_pte, *dst_pte;
2650                 vm_page_t dstmpte, srcmpte;
2651                 vm_offset_t srcptepaddr;
2652                 vm_pindex_t ptepindex;
2653
2654                 if (addr >= UPT_MIN_ADDRESS)
2655                         panic("pmap_copy: invalid to pmap_copy page tables\n");
2656
2657                 /*
2658                  * Don't let optional prefaulting of pages make us go
2659                  * way below the low water mark of free pages or way
2660                  * above high water mark of used pv entries.
2661                  */
2662                 if (vmstats.v_free_count < vmstats.v_free_reserved ||
2663                     pv_entry_count > pv_entry_high_water)
2664                         break;
2665
2666                 pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
2667                 ptepindex = addr >> PDRSHIFT;
2668
2669                 srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
2670                 if (srcptepaddr == 0)
2671                         continue;
2672
2673                 if (srcptepaddr & PG_PS) {
2674                         if (dst_pmap->pm_pdir[ptepindex] == 0) {
2675                                 dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
2676                                 dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
2677                         }
2678                         continue;
2679                 }
2680
2681                 srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
2682                 if ((srcmpte == NULL) || (srcmpte->hold_count == 0) ||
2683                     (srcmpte->flags & PG_BUSY)) {
2684                         continue;
2685                 }
2686
2687                 if (pdnxt > end_addr)
2688                         pdnxt = end_addr;
2689
2690                 src_pte = vtopte(addr);
2691                 dst_pte = avtopte(addr);
2692                 while (addr < pdnxt) {
2693                         pt_entry_t ptetemp;
2694
2695                         ptetemp = *src_pte;
2696                         /*
2697                          * we only virtual copy managed pages
2698                          */
2699                         if ((ptetemp & PG_MANAGED) != 0) {
2700                                 /*
2701                                  * We have to check after allocpte for the
2702                                  * pte still being around...  allocpte can
2703                                  * block.
2704                                  *
2705                                  * pmap_allocpte() can block.  If we lose
2706                                  * our page directory mappings we stop.
2707                                  */
2708                                 dstmpte = pmap_allocpte(dst_pmap, addr);
2709
2710                                 if (src_frame != (PTDpde & PG_FRAME) ||
2711                                     dst_frame != (APTDpde & PG_FRAME)
2712                                 ) {
2713                                         kprintf("WARNING: pmap_copy: detected and corrected race\n");
2714                                         pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
2715                                         goto failed;
2716                                 } else if ((*dst_pte == 0) &&
2717                                            (ptetemp = *src_pte) != 0 &&
2718                                            (ptetemp & PG_MANAGED)) {
2719                                         /*
2720                                          * Clear the modified and
2721                                          * accessed (referenced) bits
2722                                          * during the copy.
2723                                          */
2724                                         m = PHYS_TO_VM_PAGE(ptetemp);
2725                                         *dst_pte = ptetemp & ~(PG_M | PG_A);
2726                                         ++dst_pmap->pm_stats.resident_count;
2727                                         pmap_insert_entry(dst_pmap, addr,
2728                                                 dstmpte, m);
2729                                         KKASSERT(m->flags & PG_MAPPED);
2730                                 } else {
2731                                         kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n");
2732                                         pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
2733                                         goto failed;
2734                                 }
2735                                 if (dstmpte->hold_count >= srcmpte->hold_count)
2736                                         break;
2737                         }
2738                         addr += PAGE_SIZE;
2739                         src_pte++;
2740                         dst_pte++;
2741                 }
2742         }
2743 failed:
2744         crit_exit();
2745         pmap_inval_flush(&info);
2746 }
2747
2748 /*
2749  * pmap_zero_page:
2750  *
2751  *      Zero the specified PA by mapping the page into KVM and clearing its
2752  *      contents.
2753  *
2754  *      This function may be called from an interrupt and no locking is
2755  *      required.
2756  */
2757 void
2758 pmap_zero_page(vm_paddr_t phys)
2759 {
2760         struct mdglobaldata *gd = mdcpu;
2761
2762         crit_enter();
2763         if (*gd->gd_CMAP3)
2764                 panic("pmap_zero_page: CMAP3 busy");
2765         *gd->gd_CMAP3 =
2766                     PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2767         cpu_invlpg(gd->gd_CADDR3);
2768
2769 #if defined(I686_CPU)
2770         if (cpu_class == CPUCLASS_686)
2771                 i686_pagezero(gd->gd_CADDR3);
2772         else
2773 #endif
2774                 bzero(gd->gd_CADDR3, PAGE_SIZE);
2775         *gd->gd_CMAP3 = 0;
2776         crit_exit();
2777 }
2778
2779 /*
2780  * pmap_page_assertzero:
2781  *
2782  *      Assert that a page is empty, panic if it isn't.
2783  */
2784 void
2785 pmap_page_assertzero(vm_paddr_t phys)
2786 {
2787         struct mdglobaldata *gd = mdcpu;
2788         int i;
2789
2790         crit_enter();
2791         if (*gd->gd_CMAP3)
2792                 panic("pmap_zero_page: CMAP3 busy");
2793         *gd->gd_CMAP3 =
2794                     PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2795         cpu_invlpg(gd->gd_CADDR3);
2796         for (i = 0; i < PAGE_SIZE; i += sizeof(int)) {
2797             if (*(int *)((char *)gd->gd_CADDR3 + i) != 0) {
2798                 panic("pmap_page_assertzero() @ %p not zero!\n",
2799                     (void *)gd->gd_CADDR3);
2800             }
2801         }
2802         *gd->gd_CMAP3 = 0;
2803         crit_exit();
2804 }
2805
2806 /*
2807  * pmap_zero_page:
2808  *
2809  *      Zero part of a physical page by mapping it into memory and clearing
2810  *      its contents with bzero.
2811  *
2812  *      off and size may not cover an area beyond a single hardware page.
2813  */
2814 void
2815 pmap_zero_page_area(vm_paddr_t phys, int off, int size)
2816 {
2817         struct mdglobaldata *gd = mdcpu;
2818
2819         crit_enter();
2820         if (*gd->gd_CMAP3)
2821                 panic("pmap_zero_page: CMAP3 busy");
2822         *gd->gd_CMAP3 = PG_V | PG_RW | (phys & PG_FRAME) | PG_A | PG_M;
2823         cpu_invlpg(gd->gd_CADDR3);
2824
2825 #if defined(I686_CPU)
2826         if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
2827                 i686_pagezero(gd->gd_CADDR3);
2828         else
2829 #endif
2830                 bzero((char *)gd->gd_CADDR3 + off, size);
2831         *gd->gd_CMAP3 = 0;
2832         crit_exit();
2833 }
2834
2835 /*
2836  * pmap_copy_page:
2837  *
2838  *      Copy the physical page from the source PA to the target PA.
2839  *      This function may be called from an interrupt.  No locking
2840  *      is required.
2841  */
2842 void
2843 pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
2844 {
2845         struct mdglobaldata *gd = mdcpu;
2846
2847         crit_enter();
2848         if (*gd->gd_CMAP1)
2849                 panic("pmap_copy_page: CMAP1 busy");
2850         if (*gd->gd_CMAP2)
2851                 panic("pmap_copy_page: CMAP2 busy");
2852
2853         *gd->gd_CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2854         *gd->gd_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2855
2856         cpu_invlpg(gd->gd_CADDR1);
2857         cpu_invlpg(gd->gd_CADDR2);
2858
2859         bcopy(gd->gd_CADDR1, gd->gd_CADDR2, PAGE_SIZE);
2860
2861         *gd->gd_CMAP1 = 0;
2862         *gd->gd_CMAP2 = 0;
2863         crit_exit();
2864 }
2865
2866 /*
2867  * pmap_copy_page_frag:
2868  *
2869  *      Copy the physical page from the source PA to the target PA.
2870  *      This function may be called from an interrupt.  No locking
2871  *      is required.
2872  */
2873 void
2874 pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
2875 {
2876         struct mdglobaldata *gd = mdcpu;
2877
2878         crit_enter();
2879         if (*gd->gd_CMAP1)
2880                 panic("pmap_copy_page: CMAP1 busy");
2881         if (*gd->gd_CMAP2)
2882                 panic("pmap_copy_page: CMAP2 busy");
2883
2884         *gd->gd_CMAP1 = PG_V | (src & PG_FRAME) | PG_A;
2885         *gd->gd_CMAP2 = PG_V | PG_RW | (dst & PG_FRAME) | PG_A | PG_M;
2886
2887         cpu_invlpg(gd->gd_CADDR1);
2888         cpu_invlpg(gd->gd_CADDR2);
2889
2890         bcopy((char *)gd->gd_CADDR1 + (src & PAGE_MASK),
2891               (char *)gd->gd_CADDR2 + (dst & PAGE_MASK),
2892               bytes);
2893
2894         *gd->gd_CMAP1 = 0;
2895         *gd->gd_CMAP2 = 0;
2896         crit_exit();
2897 }
2898
2899 /*
2900  * Returns true if the pmap's pv is one of the first
2901  * 16 pvs linked to from this page.  This count may
2902  * be changed upwards or downwards in the future; it
2903  * is only necessary that true be returned for a small
2904  * subset of pmaps for proper page aging.
2905  */
2906 boolean_t
2907 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2908 {
2909         pv_entry_t pv;
2910         int loops = 0;
2911
2912         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2913                 return FALSE;
2914
2915         crit_enter();
2916
2917         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2918                 if (pv->pv_pmap == pmap) {
2919                         crit_exit();
2920                         return TRUE;
2921                 }
2922                 loops++;
2923                 if (loops >= 16)
2924                         break;
2925         }
2926         crit_exit();
2927         return (FALSE);
2928 }
2929
2930 /*
2931  * Remove all pages from specified address space
2932  * this aids process exit speeds.  Also, this code
2933  * is special cased for current process only, but
2934  * can have the more generic (and slightly slower)
2935  * mode enabled.  This is much faster than pmap_remove
2936  * in the case of running down an entire address space.
2937  */
2938 void
2939 pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2940 {
2941         struct lwp *lp;
2942         pt_entry_t *pte, tpte;
2943         pv_entry_t pv, npv;
2944         vm_page_t m;
2945         pmap_inval_info info;
2946         int iscurrentpmap;
2947         int32_t save_generation;
2948
2949         lp = curthread->td_lwp;
2950         if (lp && pmap == vmspace_pmap(lp->lwp_vmspace))
2951                 iscurrentpmap = 1;
2952         else
2953                 iscurrentpmap = 0;
2954
2955         pmap_inval_init(&info);
2956         crit_enter();
2957         for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2958                 if (pv->pv_va >= eva || pv->pv_va < sva) {
2959                         npv = TAILQ_NEXT(pv, pv_plist);
2960                         continue;
2961                 }
2962
2963                 KKASSERT(pmap == pv->pv_pmap);
2964
2965                 if (iscurrentpmap)
2966                         pte = vtopte(pv->pv_va);
2967                 else
2968                         pte = pmap_pte_quick(pmap, pv->pv_va);
2969                 if (pmap->pm_active)
2970                         pmap_inval_add(&info, pmap, pv->pv_va);
2971
2972                 /*
2973                  * We cannot remove wired pages from a process' mapping
2974                  * at this time
2975                  */
2976                 if (*pte & PG_W) {
2977                         npv = TAILQ_NEXT(pv, pv_plist);
2978                         continue;
2979                 }
2980                 tpte = pte_load_clear(pte);
2981
2982                 m = PHYS_TO_VM_PAGE(tpte);
2983
2984                 KASSERT(m < &vm_page_array[vm_page_array_size],
2985                         ("pmap_remove_pages: bad tpte %x", tpte));
2986
2987                 KKASSERT(pmap->pm_stats.resident_count > 0);
2988                 --pmap->pm_stats.resident_count;
2989
2990                 /*
2991                  * Update the vm_page_t clean and reference bits.
2992                  */
2993                 if (tpte & PG_M) {
2994                         vm_page_dirty(m);
2995                 }
2996
2997                 npv = TAILQ_NEXT(pv, pv_plist);
2998                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2999                 save_generation = ++pmap->pm_generation;
3000
3001                 m->md.pv_list_count--;
3002                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3003                 if (TAILQ_EMPTY(&m->md.pv_list))
3004                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
3005
3006                 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info);
3007                 free_pv_entry(pv);
3008
3009                 /*
3010                  * Restart the scan if we blocked during the unuse or free
3011                  * calls and other removals were made.
3012                  */
3013                 if (save_generation != pmap->pm_generation) {
3014                         kprintf("Warning: pmap_remove_pages race-A avoided\n");
3015                         pv = TAILQ_FIRST(&pmap->pm_pvlist);
3016                 }
3017         }
3018         pmap_inval_flush(&info);
3019         crit_exit();
3020 }
3021
3022 /*
3023  * pmap_testbit tests bits in pte's
3024  * note that the testbit/clearbit routines are inline,
3025  * and a lot of things compile-time evaluate.
3026  */
3027 static boolean_t
3028 pmap_testbit(vm_page_t m, int bit)
3029 {
3030         pv_entry_t pv;
3031         pt_entry_t *pte;
3032
3033         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3034                 return FALSE;
3035
3036         if (TAILQ_FIRST(&m->md.pv_list) == NULL)
3037                 return FALSE;
3038
3039         crit_enter();
3040
3041         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3042                 /*
3043                  * if the bit being tested is the modified bit, then
3044                  * mark clean_map and ptes as never
3045                  * modified.
3046                  */
3047                 if (bit & (PG_A|PG_M)) {
3048                         if (!pmap_track_modified(pv->pv_va))
3049                                 continue;
3050                 }
3051
3052 #if defined(PMAP_DIAGNOSTIC)
3053                 if (!pv->pv_pmap) {
3054                         kprintf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
3055                         continue;
3056                 }
3057 #endif
3058                 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3059                 if (*pte & bit) {
3060                         crit_exit();
3061                         return TRUE;
3062                 }
3063         }
3064         crit_exit();
3065         return (FALSE);
3066 }
3067
3068 /*
3069  * this routine is used to modify bits in ptes
3070  */
3071 static __inline void
3072 pmap_clearbit(vm_page_t m, int bit)
3073 {
3074         struct pmap_inval_info info;
3075         pv_entry_t pv;
3076         pt_entry_t *pte;
3077         pt_entry_t pbits;
3078
3079         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3080                 return;
3081
3082         pmap_inval_init(&info);
3083         crit_enter();
3084
3085         /*
3086          * Loop over all current mappings setting/clearing as appropos If
3087          * setting RO do we need to clear the VAC?
3088          */
3089         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3090                 /*
3091                  * don't write protect pager mappings
3092                  */
3093                 if (bit == PG_RW) {
3094                         if (!pmap_track_modified(pv->pv_va))
3095                                 continue;
3096                 }
3097
3098 #if defined(PMAP_DIAGNOSTIC)
3099                 if (!pv->pv_pmap) {
3100                         kprintf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
3101                         continue;
3102                 }
3103 #endif
3104
3105                 /*
3106                  * Careful here.  We can use a locked bus instruction to
3107                  * clear PG_A or PG_M safely but we need to synchronize
3108                  * with the target cpus when we mess with PG_RW.
3109                  *
3110                  * We do not have to force synchronization when clearing
3111                  * PG_M even for PTEs generated via virtual memory maps,
3112                  * because the virtual kernel will invalidate the pmap
3113                  * entry when/if it needs to resynchronize the Modify bit.
3114                  */
3115                 if (bit & PG_RW)
3116                         pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
3117                 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3118 again:
3119                 pbits = *pte;
3120                 if (pbits & bit) {
3121                         if (bit == PG_RW) {
3122                                 if (pbits & PG_M) {
3123                                         vm_page_dirty(m);
3124                                         atomic_clear_int(pte, PG_M|PG_RW);
3125                                 } else {
3126                                         /*
3127                                          * The cpu may be trying to set PG_M
3128                                          * simultaniously with our clearing
3129                                          * of PG_RW.
3130                                          */
3131                                         if (!atomic_cmpset_int(pte, pbits,
3132                                                                pbits & ~PG_RW))
3133                                                 goto again;
3134                                 }
3135                         } else if (bit == PG_M) {
3136                                 /*
3137                                  * We could also clear PG_RW here to force
3138                                  * a fault on write to redetect PG_M for
3139                                  * virtual kernels, but it isn't necessary
3140                                  * since virtual kernels invalidate the pte
3141                                  * when they clear the VPTE_M bit in their
3142                                  * virtual page tables.
3143                                  */
3144                                 atomic_clear_int(pte, PG_M);
3145                         } else {
3146                                 atomic_clear_int(pte, bit);
3147                         }
3148                 }
3149         }
3150         pmap_inval_flush(&info);
3151         crit_exit();
3152 }
3153
3154 /*
3155  *      pmap_page_protect:
3156  *
3157  *      Lower the permission for all mappings to a given page.
3158  */
3159 void
3160 pmap_page_protect(vm_page_t m, vm_prot_t prot)
3161 {
3162         if ((prot & VM_PROT_WRITE) == 0) {
3163                 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
3164                         pmap_clearbit(m, PG_RW);
3165                         vm_page_flag_clear(m, PG_WRITEABLE);
3166                 } else {
3167                         pmap_remove_all(m);
3168                 }
3169         }
3170 }
3171
3172 vm_paddr_t
3173 pmap_phys_address(vm_pindex_t ppn)
3174 {
3175         return (amd64_ptob(ppn));
3176 }
3177
3178 /*
3179  *      pmap_ts_referenced:
3180  *
3181  *      Return a count of reference bits for a page, clearing those bits.
3182  *      It is not necessary for every reference bit to be cleared, but it
3183  *      is necessary that 0 only be returned when there are truly no
3184  *      reference bits set.
3185  *
3186  *      XXX: The exact number of bits to check and clear is a matter that
3187  *      should be tested and standardized at some point in the future for
3188  *      optimal aging of shared pages.
3189  */
3190 int
3191 pmap_ts_referenced(vm_page_t m)
3192 {
3193         pv_entry_t pv, pvf, pvn;
3194         pt_entry_t *pte;
3195         int rtval = 0;
3196
3197         if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
3198                 return (rtval);
3199
3200         crit_enter();
3201
3202         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3203
3204                 pvf = pv;
3205
3206                 do {
3207                         pvn = TAILQ_NEXT(pv, pv_list);
3208
3209                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3210
3211                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3212
3213                         if (!pmap_track_modified(pv->pv_va))
3214                                 continue;
3215
3216                         pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
3217
3218                         if (pte && (*pte & PG_A)) {
3219 #ifdef SMP
3220                                 atomic_clear_int(pte, PG_A);
3221 #else
3222                                 atomic_clear_int_nonlocked(pte, PG_A);
3223 #endif
3224                                 rtval++;
3225                                 if (rtval > 4) {
3226                                         break;
3227                                 }
3228                         }
3229                 } while ((pv = pvn) != NULL && pv != pvf);
3230         }
3231         crit_exit();
3232
3233         return (rtval);
3234 }
3235
3236 /*
3237  *      pmap_is_modified:
3238  *
3239  *      Return whether or not the specified physical page was modified
3240  *      in any physical maps.
3241  */
3242 boolean_t
3243 pmap_is_modified(vm_page_t m)
3244 {
3245         return pmap_testbit(m, PG_M);
3246 }
3247
3248 /*
3249  *      Clear the modify bits on the specified physical page.
3250  */
3251 void
3252 pmap_clear_modify(vm_page_t m)
3253 {
3254         pmap_clearbit(m, PG_M);
3255 }
3256
3257 /*
3258  *      pmap_clear_reference:
3259  *
3260  *      Clear the reference bit on the specified physical page.
3261  */
3262 void
3263 pmap_clear_reference(vm_page_t m)
3264 {
3265         pmap_clearbit(m, PG_A);
3266 }
3267
3268 /*
3269  * Miscellaneous support routines follow
3270  */
3271
3272 static void
3273 i386_protection_init(void)
3274 {
3275         int *kp, prot;
3276
3277         kp = protection_codes;
3278         for (prot = 0; prot < 8; prot++) {
3279                 switch (prot) {
3280                 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
3281                         /*
3282                          * Read access is also 0. There isn't any execute bit,
3283                          * so just make it readable.
3284                          */
3285                 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
3286                 case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
3287                 case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
3288                         *kp++ = 0;
3289                         break;
3290                 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
3291                 case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
3292                 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
3293                 case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
3294                         *kp++ = PG_RW;
3295                         break;
3296                 }
3297         }
3298 }
3299
3300 /*
3301  * Map a set of physical memory pages into the kernel virtual
3302  * address space. Return a pointer to where it is mapped. This
3303  * routine is intended to be used for mapping device memory,
3304  * NOT real memory.
3305  *
3306  * NOTE: we can't use pgeflag unless we invalidate the pages one at
3307  * a time.
3308  */
3309 void *
3310 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
3311 {
3312         vm_offset_t va, tmpva, offset;
3313         pt_entry_t *pte;
3314
3315         offset = pa & PAGE_MASK;
3316         size = roundup(offset + size, PAGE_SIZE);
3317
3318         va = kmem_alloc_nofault(&kernel_map, size);
3319         if (!va)
3320                 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3321
3322         pa = pa & PG_FRAME;
3323         for (tmpva = va; size > 0;) {
3324                 pte = vtopte(tmpva);
3325                 *pte = pa | PG_RW | PG_V; /* | pgeflag; */
3326                 size -= PAGE_SIZE;
3327                 tmpva += PAGE_SIZE;
3328                 pa += PAGE_SIZE;
3329         }
3330         cpu_invltlb();
3331         smp_invltlb();
3332
3333         return ((void *)(va + offset));
3334 }
3335
3336 void
3337 pmap_unmapdev(vm_offset_t va, vm_size_t size)
3338 {
3339         vm_offset_t base, offset;
3340
3341         base = va & PG_FRAME;
3342         offset = va & PAGE_MASK;
3343         size = roundup(offset + size, PAGE_SIZE);
3344         pmap_qremove(va, size >> PAGE_SHIFT);
3345         kmem_free(&kernel_map, base, size);
3346 }
3347
3348 /*
3349  * perform the pmap work for mincore
3350  */
3351 int
3352 pmap_mincore(pmap_t pmap, vm_offset_t addr)
3353 {
3354         pt_entry_t *ptep, pte;
3355         vm_page_t m;
3356         int val = 0;
3357
3358         ptep = pmap_pte(pmap, addr);
3359         if (ptep == 0) {
3360                 return 0;
3361         }
3362
3363         if ((pte = *ptep) != 0) {
3364                 vm_offset_t pa;
3365
3366                 val = MINCORE_INCORE;
3367                 if ((pte & PG_MANAGED) == 0)
3368                         return val;
3369
3370                 pa = pte & PG_FRAME;
3371
3372                 m = PHYS_TO_VM_PAGE(pa);
3373
3374                 /*
3375                  * Modified by us
3376                  */
3377                 if (pte & PG_M)
3378                         val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3379                 /*
3380                  * Modified by someone
3381                  */
3382                 else if (m->dirty || pmap_is_modified(m))
3383                         val |= MINCORE_MODIFIED_OTHER;
3384                 /*
3385                  * Referenced by us
3386                  */
3387                 if (pte & PG_A)
3388                         val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3389
3390                 /*
3391                  * Referenced by someone
3392                  */
3393                 else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
3394                         val |= MINCORE_REFERENCED_OTHER;
3395                         vm_page_flag_set(m, PG_REFERENCED);
3396                 }
3397         }
3398         return val;
3399 }
3400
3401 /*
3402  * Replace p->p_vmspace with a new one.  If adjrefs is non-zero the new
3403  * vmspace will be ref'd and the old one will be deref'd.
3404  *
3405  * The vmspace for all lwps associated with the process will be adjusted
3406  * and cr3 will be reloaded if any lwp is the current lwp.
3407  */
3408 void
3409 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
3410 {
3411         struct vmspace *oldvm;
3412         struct lwp *lp;
3413
3414         crit_enter();
3415         oldvm = p->p_vmspace;
3416         if (oldvm != newvm) {
3417                 p->p_vmspace = newvm;
3418                 KKASSERT(p->p_nthreads == 1);
3419                 lp = RB_ROOT(&p->p_lwp_tree);
3420                 pmap_setlwpvm(lp, newvm);
3421                 if (adjrefs) {
3422                         sysref_get(&newvm->vm_sysref);
3423                         sysref_put(&oldvm->vm_sysref);
3424                 }
3425         }
3426         crit_exit();
3427 }
3428
3429 /*
3430  * Set the vmspace for a LWP.  The vmspace is almost universally set the
3431  * same as the process vmspace, but virtual kernels need to swap out contexts
3432  * on a per-lwp basis.
3433  */
3434 void
3435 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
3436 {
3437         struct vmspace *oldvm;
3438         struct pmap *pmap;
3439
3440         crit_enter();
3441         oldvm = lp->lwp_vmspace;
3442
3443         if (oldvm != newvm) {
3444                 lp->lwp_vmspace = newvm;
3445                 if (curthread->td_lwp == lp) {
3446                         pmap = vmspace_pmap(newvm);
3447 #if defined(SMP)
3448                         atomic_set_int(&pmap->pm_active, 1 << mycpu->gd_cpuid);
3449 #else
3450                         pmap->pm_active |= 1;
3451 #endif
3452 #if defined(SWTCH_OPTIM_STATS)
3453                         tlb_flush_count++;
3454 #endif
3455                         curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pdir);
3456                         curthread->td_pcb->pcb_cr3 |= PG_RW | PG_U | PG_V;
3457                         *link_pdpe = curthread->td_pcb->pcb_cr3 | PG_RW | PG_U | PG_V;
3458                         load_cr3(common_lvl4_phys);
3459                         pmap = vmspace_pmap(oldvm);
3460 #if defined(SMP)
3461                         atomic_clear_int(&pmap->pm_active,
3462                                           1 << mycpu->gd_cpuid);
3463 #else
3464                         pmap->pm_active &= ~1;
3465 #endif
3466                 }
3467         }
3468         crit_exit();
3469 }
3470
3471 vm_offset_t
3472 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3473 {
3474
3475         if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3476                 return addr;
3477         }
3478
3479         addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
3480         return addr;
3481 }
3482
3483
3484 #if defined(DEBUG)
3485
3486 static void     pads (pmap_t pm);
3487 void            pmap_pvdump (vm_paddr_t pa);
3488
3489 /* print address space of pmap*/
3490 static void
3491 pads(pmap_t pm)
3492 {
3493         vm_offset_t va;
3494         unsigned i, j;
3495         pt_entry_t *ptep;
3496
3497         if (pm == &kernel_pmap)
3498                 return;
3499         crit_enter();
3500         for (i = 0; i < NPDEPG; i++) {
3501                 if (pm->pm_pdir[i]) {
3502                         for (j = 0; j < NPTEPG; j++) {
3503                                 va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3504                                 if (pm == &kernel_pmap && va < KERNBASE)
3505                                         continue;
3506                                 if (pm != &kernel_pmap && va > UPT_MAX_ADDRESS)
3507                                         continue;
3508                                 ptep = pmap_pte_quick(pm, va);
3509                                 if (pmap_pte_v(ptep))
3510                                         kprintf("%lx:%lx ", va, *ptep);
3511                         };
3512                 }
3513         }
3514         crit_exit();
3515
3516 }
3517
3518 void
3519 pmap_pvdump(vm_paddr_t pa)
3520 {
3521         pv_entry_t pv;
3522         vm_page_t m;
3523
3524         kprintf("pa %08llx", (long long)pa);
3525         m = PHYS_TO_VM_PAGE(pa);
3526         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3527 #ifdef used_to_be
3528                 kprintf(" -> pmap %p, va %x, flags %x",
3529                     (void *)pv->pv_pmap, pv->pv_va, pv->pv_flags);
3530 #endif
3531                 kprintf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3532                 pads(pv->pv_pmap);
3533         }
3534         kprintf(" ");
3535 }
3536 #endif