/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2004-2006 Matthew Dillon
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
 * $DragonFly: src/sys/platform/vkernel/platform/pmap.c,v 1.30 2008/06/06 13:19:25 swildner Exp $
 */

/*
 * NOTE: PMAP_INVAL_ADD: In pc32 this function is called prior to adjusting
 * the PTE in the page table, because a cpu synchronization might be required.
 * The actual invalidation is delayed until the following call or flush.  In
 * the VKERNEL build this function is called prior to adjusting the PTE and
 * invalidates the table synchronously (not delayed), and is not SMP safe.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vkernel.h>
#include <sys/thread.h>
#include <sys/vmspace.h>

#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_zone.h>
#include <vm/vm_pageout.h>

#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/pmap_inval.h>
#include <machine/globaldata.h>

#include <sys/sysref2.h>
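/*
 * Global and static data for the vkernel pmap module: the kernel pmap
 * itself, the pv_entry allocation zone, and the list of active pmaps.
 */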
struct pmap kernel_pmap;

static struct vm_zone pvzone;
static struct vm_object pvzone_obj;
static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
static int pv_entry_count;
static int pv_entry_max;
static int pv_entry_high_water;
static int pmap_pagedaemon_waken;
static boolean_t pmap_initialized = FALSE;
static int protection_codes[8];
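/*
 * protection_codes[] maps VM_PROT_READ/WRITE/EXECUTE combinations to VPTE
 * permission bits; it is consulted via the pte_prot() macro below and is
 * presumably initialized by i386_protection_init().
 */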
static void	i386_protection_init(void);
static void	pmap_remove_all(vm_page_t m);
static int	pmap_release_free_page(struct pmap *pmap, vm_page_t p);

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#define pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))

#define pte_prot(m, p) \
	(protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
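/*
 * Set up the pv_list for every page in vm_page_array and bootstrap the
 * "PV ENTRY" zone from a KVA allocation sized to one pv_entry per
 * physical page.
 */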
	struct pv_entry *pvinit;

	for (i = 0; i < vm_page_array_size; i++) {
		m = &vm_page_array[i];
		TAILQ_INIT(&m->md.pv_list);
		m->md.pv_list_count = 0;
	}

	i = vm_page_array_size;
	pvinit = (struct pv_entry *)kmem_alloc(&kernel_map, i * sizeof(*pvinit));
	zbootinit(&pvzone, "PV ENTRY", sizeof(*pvinit), pvinit, i);
	pmap_initialized = TRUE;
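/*
 * Size the pv_entry zone: scale with maxproc and the physical page count,
 * subject to the vm.pmap.shpgperproc and vm.pmap.pv_entries tunables.
 */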
	int shpgperproc = PMAP_SHPGPERPROC;

	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);
	zinitna(&pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
/*
 * Bootstrap the kernel_pmap so it can be used with pmap_enter().
 *
 * NOTE! pm_pdir for the kernel pmap is offset so VA's translate
 * directly into PTD indexes (PTA is also offset for the same reason).
 * This is necessary because, for now, KVA is not mapped at address 0.
 *
 * Page table pages are not managed like they are in normal pmaps, so
 * no pteobj is needed.
 */
	vm_pindex_t i = (vm_offset_t)KernelPTD >> PAGE_SHIFT;

	kernel_pmap.pm_pdir = KernelPTD - (KvaStart >> SEG_SHIFT);
	kernel_pmap.pm_pdirpte = KernelPTA[i];
	kernel_pmap.pm_count = 1;
	kernel_pmap.pm_active = (cpumask_t)-1;
	TAILQ_INIT(&kernel_pmap.pm_pvlist);
	i386_protection_init();

/*
 * Initialize pmap0/vmspace0.  Since process 0 never enters user mode we
 * just dummy it up so it works well enough for fork().
 *
 * In DragonFly, process pmaps may only be used to manipulate user address
 * space, never kernel address space.
 */
pmap_pinit0(struct pmap *pmap)
/************************************************************************
 *		Procedures to manage whole physical maps		*
 ************************************************************************
 *
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
pmap_pinit(struct pmap *pmap)
{
	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pdir == NULL) {
		pmap->pm_pdir =
		    (pd_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
	}

	/*
	 * allocate object for the pte array and page directory
	 */
	npages = VPTE_PAGETABLE_SIZE +
		 (VM_MAX_USER_ADDRESS / PAGE_SIZE) * sizeof(vpte_t);
	npages = (npages + PAGE_MASK) / PAGE_SIZE;

	if (pmap->pm_pteobj == NULL)
		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, npages);
	pmap->pm_pdindex = npages - 1;

	/*
	 * allocate the page directory page
	 */
	ptdpg = vm_page_grab(pmap->pm_pteobj, pmap->pm_pdindex,
			     VM_ALLOC_NORMAL | VM_ALLOC_RETRY);

	ptdpg->wire_count = 1;
	++vmstats.v_wire_count;

	/* not usually mapped */
	vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY);
	ptdpg->valid = VM_PAGE_BITS_ALL;

	pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
	pmap->pm_pdirpte = KernelPTA[(vm_offset_t)pmap->pm_pdir >> PAGE_SHIFT];
	if ((ptdpg->flags & PG_ZERO) == 0)
		bzero(pmap->pm_pdir, PAGE_SIZE);

	pmap->pm_ptphint = NULL;
	pmap->pm_cpucachemask = 0;
	TAILQ_INIT(&pmap->pm_pvlist);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	pmap->pm_stats.resident_count = 1;
/*
 * Clean up a pmap structure so it can be physically freed
 */
pmap_puninit(pmap_t pmap)
{
	kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pdir, PAGE_SIZE);
	pmap->pm_pdir = NULL;

	if (pmap->pm_pteobj) {
		vm_object_deallocate(pmap->pm_pteobj);
		pmap->pm_pteobj = NULL;
	}

/*
 * Wire in kernel global address entries.  To avoid a race condition
 * between pmap initialization and pmap_growkernel, this procedure
 * adds the pmap to the master list (which growkernel scans to update),
 * then copies the template.
 *
 * In a virtual kernel there are no kernel global address entries.
 */
pmap_pinit2(struct pmap *pmap)
{
	TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
/*
 * Release all resources held by the given physical map.
 *
 * Should only be called if the map contains no valid mappings.
 */
static int pmap_release_callback(struct vm_page *p, void *data);

pmap_release(struct pmap *pmap)
{
	struct mdglobaldata *gd = mdcpu;
	vm_object_t object = pmap->pm_pteobj;
	struct rb_vm_page_scan_info info;

	KKASSERT(pmap != &kernel_pmap);

#if defined(DIAGNOSTIC)
	if (object->ref_count != 1)
		panic("pmap_release: pteobj reference count != 1");
#endif

	/*
	 * Once we destroy the page table, the mapping becomes invalid.
	 * Don't waste time doing a madvise to invalidate the mapping, just
	 * set cpucachemask to 0.
	 */
	if (pmap->pm_pdir == gd->gd_PT1pdir) {
		gd->gd_PT1pdir = NULL;
		/* madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL); */
	}
	if (pmap->pm_pdir == gd->gd_PT2pdir) {
		gd->gd_PT2pdir = NULL;
		/* madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL); */
	}
	if (pmap->pm_pdir == gd->gd_PT3pdir) {
		gd->gd_PT3pdir = NULL;
		/* madvise(gd->gd_PT3map, SEG_SIZE, MADV_INVAL); */
	}

	info.object = object;

	TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);

	do {
		info.limit = object->generation;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					pmap_release_callback, &info);
		if (info.error == 0 && info.mpte) {
			if (!pmap_release_free_page(pmap, info.mpte))
				info.error = 1;
		}
	} while (info.error);

	/*
	 * Leave the KVA reservation for pm_pdir cached for later reuse.
	 */
	pmap->pm_pdirpte = 0;
	pmap->pm_cpucachemask = 0;
/*
 * Callback to release a page table page backing a directory
 */
pmap_release_callback(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	if (p->pindex == info->pmap->pm_pdindex) {
		info->mpte = p;
		return(0);
	}
	if (!pmap_release_free_page(info->pmap, p)) {
		info->error = 1;
		return(-1);
	}
	if (info->object->generation != info->limit) {
		info->error = 1;
		return(-1);
	}
	return(0);
}
/*
 * Retire the given physical map from service.  Should only be called if
 * the map contains no valid mappings.
 */
pmap_destroy(pmap_t pmap)
{
	count = --pmap->pm_count;
	if (count == 0)
		panic("destroying a pmap is not yet implemented");
}

/*
 * Add a reference to the specified pmap.
 */
pmap_reference(pmap_t pmap)
/************************************************************************
 *			VMSPACE MANAGEMENT				*
 ************************************************************************
 *
 * The VMSPACE management we do in our virtual kernel must be reflected
 * in the real kernel.  This is accomplished by making vmspace system
 * calls to the real kernel.
 */
cpu_vmspace_alloc(struct vmspace *vm)
{
#define LAST_EXTENT	(VM_MAX_USER_ADDRESS - 0x80000000)
	if (vmspace_create(&vm->vm_pmap, 0, NULL) < 0)
		panic("vmspace_create() failed");

	rp = vmspace_mmap(&vm->vm_pmap, (void *)0x00000000, 0x40000000,
			  PROT_READ|PROT_WRITE,
			  MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
			  MemImageFd, 0);
	if (rp == MAP_FAILED)
		panic("vmspace_mmap: failed1");
	vmspace_mcontrol(&vm->vm_pmap, (void *)0x00000000, 0x40000000,
			 MADV_NOSYNC, 0);
	rp = vmspace_mmap(&vm->vm_pmap, (void *)0x40000000, 0x40000000,
			  PROT_READ|PROT_WRITE,
			  MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
			  MemImageFd, 0x40000000);
	if (rp == MAP_FAILED)
		panic("vmspace_mmap: failed2");
	vmspace_mcontrol(&vm->vm_pmap, (void *)0x40000000, 0x40000000,
			 MADV_NOSYNC, 0);
	rp = vmspace_mmap(&vm->vm_pmap, (void *)0x80000000, LAST_EXTENT,
			  PROT_READ|PROT_WRITE,
			  MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
			  MemImageFd, 0x80000000);
	vmspace_mcontrol(&vm->vm_pmap, (void *)0x80000000, LAST_EXTENT,
			 MADV_NOSYNC, 0);
	if (rp == MAP_FAILED)
		panic("vmspace_mmap: failed3");

	r = vmspace_mcontrol(&vm->vm_pmap, (void *)0x00000000, 0x40000000,
			     MADV_SETMAP, vmspace_pmap(vm)->pm_pdirpte);
	if (r < 0)
		panic("vmspace_mcontrol: failed1");
	r = vmspace_mcontrol(&vm->vm_pmap, (void *)0x40000000, 0x40000000,
			     MADV_SETMAP, vmspace_pmap(vm)->pm_pdirpte);
	if (r < 0)
		panic("vmspace_mcontrol: failed2");
	r = vmspace_mcontrol(&vm->vm_pmap, (void *)0x80000000, LAST_EXTENT,
			     MADV_SETMAP, vmspace_pmap(vm)->pm_pdirpte);
	if (r < 0)
		panic("vmspace_mcontrol: failed3");
}
cpu_vmspace_free(struct vmspace *vm)
{
	if (vmspace_destroy(&vm->vm_pmap) < 0)
		panic("vmspace_destroy() failed");
}
/************************************************************************
 *	    Procedures which operate directly on the kernel PMAP	*
 ************************************************************************/

/*
 * This maps the requested page table and gives us access to it.
 *
 * This routine can be called from a potentially preempting interrupt
 * thread or from a normal thread.
 */
get_ptbase(struct pmap *pmap, vm_offset_t va)
{
	struct mdglobaldata *gd = mdcpu;

	if (pmap == &kernel_pmap) {
		KKASSERT(va >= KvaStart && va < KvaEnd);
		return(KernelPTA + (va >> PAGE_SHIFT));
	} else if (pmap->pm_pdir == gd->gd_PT1pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT1pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask, gd->mi.gd_cpumask);
		}
		return(gd->gd_PT1map + (va >> PAGE_SHIFT));
	} else if (pmap->pm_pdir == gd->gd_PT2pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT2pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask, gd->mi.gd_cpumask);
		}
		return(gd->gd_PT2map + (va >> PAGE_SHIFT));
	}

	/*
	 * If we aren't running from a potentially preempting interrupt,
	 * load a new page table directory into the page table cache
	 */
	if (gd->mi.gd_intr_nesting_level == 0 &&
	    (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0) {
		/*
		 * Choose one or the other and map the page table
		 * in the KVA space reserved for it.
		 */
		if ((gd->gd_PTflip = 1 - gd->gd_PTflip) == 0) {
			gd->gd_PT1pdir = pmap->pm_pdir;
			*gd->gd_PT1pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask,
				       gd->mi.gd_cpumask);
			return(gd->gd_PT1map + (va >> PAGE_SHIFT));
		} else {
			gd->gd_PT2pdir = pmap->pm_pdir;
			*gd->gd_PT2pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask,
				       gd->mi.gd_cpumask);
			return(gd->gd_PT2map + (va >> PAGE_SHIFT));
		}
	}

	/*
	 * If we are running from a preempting interrupt use a private
	 * map.  The caller must be in a critical section.
	 */
	KKASSERT(IN_CRITICAL_SECT(curthread));
	if (pmap->pm_pdir == gd->gd_PT3pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT3pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT3map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask,
				       gd->mi.gd_cpumask);
		}
	} else {
		gd->gd_PT3pdir = pmap->pm_pdir;
		*gd->gd_PT3pde = pmap->pm_pdirpte;
		madvise(gd->gd_PT3map, SEG_SIZE, MADV_INVAL);
		atomic_set_int(&pmap->pm_cpucachemask,
			       gd->mi.gd_cpumask);
	}
	return(gd->gd_PT3map + (va >> PAGE_SHIFT));
}
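/*
 * get_ptbase1() and get_ptbase2() below are specialized variants of
 * get_ptbase() that pin the requested page table into the PT1 and PT2
 * cache slots respectively, so that pmap_copy() can map a source and a
 * destination pmap at the same time.  They may not be used from a
 * preempting interrupt thread (see the KKASSERTs in their bodies).
 */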
get_ptbase1(struct pmap *pmap, vm_offset_t va)
{
	struct mdglobaldata *gd = mdcpu;

	if (pmap == &kernel_pmap) {
		KKASSERT(va >= KvaStart && va < KvaEnd);
		return(KernelPTA + (va >> PAGE_SHIFT));
	} else if (pmap->pm_pdir == gd->gd_PT1pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT1pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask, gd->mi.gd_cpumask);
		}
		return(gd->gd_PT1map + (va >> PAGE_SHIFT));
	}
	KKASSERT(gd->mi.gd_intr_nesting_level == 0 &&
		 (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0);
	gd->gd_PT1pdir = pmap->pm_pdir;
	*gd->gd_PT1pde = pmap->pm_pdirpte;
	madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
	return(gd->gd_PT1map + (va >> PAGE_SHIFT));
}
get_ptbase2(struct pmap *pmap, vm_offset_t va)
{
	struct mdglobaldata *gd = mdcpu;

	if (pmap == &kernel_pmap) {
		KKASSERT(va >= KvaStart && va < KvaEnd);
		return(KernelPTA + (va >> PAGE_SHIFT));
	} else if (pmap->pm_pdir == gd->gd_PT2pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT2pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask, gd->mi.gd_cpumask);
		}
		return(gd->gd_PT2map + (va >> PAGE_SHIFT));
	}
	KKASSERT(gd->mi.gd_intr_nesting_level == 0 &&
		 (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0);
	gd->gd_PT2pdir = pmap->pm_pdir;
	*gd->gd_PT2pde = pmap->pm_pdirpte;
	madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
	return(gd->gd_PT2map + (va >> PAGE_SHIFT));
}
/*
 * Return a pointer to the page table entry for the specified va in the
 * specified pmap.  NULL is returned if there is no valid page table page
 */
static __inline vpte_t *
pmap_pte(struct pmap *pmap, vm_offset_t va)
{
	ptep = &pmap->pm_pdir[va >> SEG_SHIFT];
	return (get_ptbase(pmap, va));
/*
 * Enter a mapping into kernel_pmap.  Mappings created in this fashion
 * are not managed.  Mappings must be immediately accessible on all cpus.
 *
 * Call pmap_inval_pte() to invalidate the virtual pte and clean out the
 * real pmap and handle related races before storing the new vpte.
 */
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	KKASSERT(va >= KvaStart && va < KvaEnd);
	npte = (vpte_t)pa | VPTE_R | VPTE_W | VPTE_V;
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	pmap_inval_pte(ptep, &kernel_pmap, va);

/*
 * Synchronize a kvm mapping originally made for the private use on
 * some other cpu so it can be used on all cpus.
 *
 * XXX add MADV_RESYNC to improve performance.
 */
pmap_kenter_sync(vm_offset_t va)
{
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}

/*
 * Synchronize a kvm mapping originally made for the private use on
 * some other cpu so it can be used on our cpu.  Turns out to be the
 * same madvise() call, because we have to sync the real pmaps anyway.
 *
 * XXX add MADV_RESYNC to improve performance.
 */
pmap_kenter_sync_quick(vm_offset_t va)
{
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}

/*
 * Make a previously read-only kernel mapping R+W (not implemented by
 * virtual kernels).
 */
pmap_kmodify_rw(vm_offset_t va)
{
	*pmap_kpte(va) |= VPTE_R | VPTE_W;
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}

/*
 * Make a kernel mapping non-cacheable (not applicable to virtual kernels)
 */
pmap_kmodify_nc(vm_offset_t va)
{
	*pmap_kpte(va) |= VPTE_N;
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}

/*
 * Map a contiguous range of physical memory to a KVM
 */
pmap_map(vm_offset_t virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	while (start < end) {
		pmap_kenter(virt, start);
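/*
 * pmap_kpte() returns a pointer to the kernel page table entry (in the
 * KernelPTA array) for a kernel virtual address; the address must lie
 * within [KvaStart, KvaEnd).
 */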
pmap_kpte(vm_offset_t va)
{
	KKASSERT(va >= KvaStart && va < KvaEnd);
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	return(ptep);
}

/*
 * Enter an unmanaged KVA mapping for the private use of the current
 * cpu only.  pmap_kenter_sync() may be called to make the mapping usable
 *
 * It is illegal for the mapping to be accessed by other cpus unless
 * pmap_kenter_sync*() is called.
 */
pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
{
	KKASSERT(va >= KvaStart && va < KvaEnd);
	npte = (vpte_t)pa | VPTE_R | VPTE_W | VPTE_V;
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	pmap_inval_pte_quick(ptep, &kernel_pmap, va);

/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 */
pmap_kenter_temporary(vm_paddr_t pa, int i)
{
	pmap_kenter(crashdumpmap + (i * PAGE_SIZE), pa);
	return ((void *)crashdumpmap);
}

/*
 * Remove an unmanaged mapping created with pmap_kenter*().
 */
pmap_kremove(vm_offset_t va)
{
	KKASSERT(va >= KvaStart && va < KvaEnd);
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	pmap_inval_pte(ptep, &kernel_pmap, va);

/*
 * Remove an unmanaged mapping created with pmap_kenter*() but synchronize
 * only with this cpu.
 *
 * Unfortunately because we optimize new entries by testing VPTE_V later
 * on, we actually still have to synchronize with all the cpus.  XXX maybe
 * store a junk value and test against 0 in the other places instead?
 */
pmap_kremove_quick(vm_offset_t va)
{
	KKASSERT(va >= KvaStart && va < KvaEnd);
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	pmap_inval_pte(ptep, &kernel_pmap, va); /* NOT _quick */

/*
 * Extract the physical address from the kernel_pmap that is associated
 * with the specified virtual address.
 */
pmap_kextract(vm_offset_t va)
{
	KKASSERT(va >= KvaStart && va < KvaEnd);
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	pa = (vm_paddr_t)(*ptep & VPTE_FRAME) | (va & PAGE_MASK);
	return(pa);
}
/*
 * Map a set of unmanaged VM pages into KVM.
 */
pmap_qenter(vm_offset_t va, struct vm_page **m, int count)
{
	KKASSERT(va >= KvaStart && va + count * PAGE_SIZE < KvaEnd);

	ptep = KernelPTA + (va >> PAGE_SHIFT);
	pmap_inval_pte(ptep, &kernel_pmap, va);
	*ptep = (vpte_t)(*m)->phys_addr | VPTE_R | VPTE_W | VPTE_V;

/*
 * Map a set of VM pages to kernel virtual memory.  If a mapping changes
 * clear the supplied mask.  The caller handles any SMP interactions.
 * The mask is used to provide the caller with hints on what SMP interactions
 * might be needed.
 */
pmap_qenter2(vm_offset_t va, struct vm_page **m, int count, cpumask_t *mask)
{
	cpumask_t cmask = mycpu->gd_cpumask;

	KKASSERT(va >= KvaStart && va + count * PAGE_SIZE < KvaEnd);

	ptep = KernelPTA + (va >> PAGE_SHIFT);
	npte = (vpte_t)(*m)->phys_addr | VPTE_R | VPTE_W | VPTE_V;
	if (*ptep != npte) {
		pmap_inval_pte_quick(ptep, &kernel_pmap, va);
	} else if ((*mask & cmask) == 0) {
		pmap_kenter_sync_quick(va);
	}

/*
 * Undo the effects of pmap_qenter*().
 */
pmap_qremove(vm_offset_t va, int count)
{
	KKASSERT(va >= KvaStart && va + count * PAGE_SIZE < KvaEnd);

	ptep = KernelPTA + (va >> PAGE_SHIFT);
	pmap_inval_pte(ptep, &kernel_pmap, va);
/************************************************************************
 *	    Misc support glue called by machine independent code	*
 ************************************************************************
 *
 * These routines are called by machine independent code to operate on
 * certain machine-dependent aspects of processes, threads, and pmaps.
 */

/*
 * Initialize MD portions of the thread structure.
 */
pmap_init_thread(thread_t td)
{
	/* enforce pcb placement */
	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	td->td_savefpu = &td->td_pcb->pcb_save;
	td->td_sp = (char *)td->td_pcb - 16;
}

/*
 * This routine directly affects the fork perf for a process.
 */
pmap_init_proc(struct proc *p)

/*
 * Destroy the UPAGES for a process that has exited and disassociate
 * the process from its thread.
 */
pmap_dispose_proc(struct proc *p)
{
	KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p));
}

/*
 * We pre-allocate all page table pages for kernel virtual memory so
 * this routine will only be called if KVM has been exhausted.
 */
pmap_growkernel(vm_offset_t addr)
{
	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);

	if (addr > virtual_end - SEG_SIZE)
		panic("KVM exhausted");
	kernel_vm_end = addr;
}

/*
 * The modification bit is not tracked for any pages in this range.  XXX
 * such pages in this map should always use pmap_k*() functions and not
 * be managed anyhow.
 *
 * XXX User and kernel address spaces are independent for virtual kernels,
 * this function only applies to the kernel pmap.
 */
pmap_track_modified(pmap_t pmap, vm_offset_t va)
{
	if (pmap != &kernel_pmap)
		return 1;
	if ((va < clean_sva) || (va >= clean_eva))
		return 1;
	else
		return 0;
}
/************************************************************************
 *	    Procedures supporting managed page table pages		*
 ************************************************************************
 *
 * These procedures are used to track managed page table pages.  These pages
 * use the page table page's vm_page_t to track PTEs in the page.  The
 * page table pages themselves are arranged in a VM object, pmap->pm_pteobj.
 *
 * This allows the system to throw away page table pages for user processes
 * at will and reinstantiate them on demand.
 */

/*
 * This routine works like vm_page_lookup() but also blocks as long as the
 * page is busy.  This routine does not busy the page it returns.
 *
 * Unless the caller is managing objects whose pages are in a known state,
 * the call should be made with a critical section held so the page's object
 * association remains valid on return.
 */
pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	m = vm_page_lookup(object, pindex);
	if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))

/*
 * This routine unholds page table pages, and if the hold count
 * drops to zero, then it decrements the wire count.
 *
 * We must recheck that this is the last hold reference after busy-sleeping
 * on the page.
 */
_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
{
	while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
		;
	KASSERT(m->queue == PQ_NONE,
		("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m));

	if (m->hold_count == 1) {
		/*
		 * Unmap the page table page.
		 */
		KKASSERT(pmap->pm_pdir[m->pindex] != 0);
		pmap_inval_pde(&pmap->pm_pdir[m->pindex], pmap,
			       (vm_offset_t)m->pindex << SEG_SHIFT);
		KKASSERT(pmap->pm_stats.resident_count > 0);
		--pmap->pm_stats.resident_count;

		if (pmap->pm_ptphint == m)
			pmap->pm_ptphint = NULL;

		/*
		 * This was our last hold, the page had better be unwired
		 * after we decrement wire_count.
		 *
		 * FUTURE NOTE: shared page directory page could result in
		 * multiple wire counts.
		 */
		KKASSERT(m->wire_count == 0);
		--vmstats.v_wire_count;
		vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
		vm_page_free_zero(m);
		return 1;
	}
	KKASSERT(m->hold_count > 1);
	return 0;
}

pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
{
	KKASSERT(m->hold_count > 0);
	if (m->hold_count > 1) {
		vm_page_unhold(m);
		return 0;
	} else {
		return _pmap_unwire_pte_hold(pmap, m);
	}
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
{
	/*
	 * page table pages in the kernel_pmap are not managed.
	 */
	if (pmap == &kernel_pmap)
		return(0);

	ptepindex = (va >> PDRSHIFT);
	if (pmap->pm_ptphint &&
	    (pmap->pm_ptphint->pindex == ptepindex)) {
		mpte = pmap->pm_ptphint;
	} else {
		mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
		pmap->pm_ptphint = mpte;
	}
	return pmap_unwire_pte_hold(pmap, mpte);
}
/*
 * Attempt to release and free the vm_page backing a page directory page
 * in a pmap.  Returns 1 on success, 0 on failure (if the procedure had
 * to block).
 */
pmap_release_free_page(struct pmap *pmap, vm_page_t p)
{
	vpte_t *pde = pmap->pm_pdir;

	/*
	 * This code optimizes the case of freeing non-busy
	 * page-table pages.  Those pages are zero now, and
	 * might as well be placed directly into the zero queue.
	 */
	if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
		return 0;

	KKASSERT(pmap->pm_stats.resident_count > 0);
	--pmap->pm_stats.resident_count;

	if (p->hold_count) {
		panic("pmap_release: freeing held page table page");
	}

	/*
	 * Page directory pages need to have the kernel stuff cleared, so
	 * they can go into the zero queue also.
	 *
	 * In virtual kernels there is no 'kernel stuff'.  For the moment
	 * I just make sure the whole thing has been zero'd even though
	 * it should already be completely zero'd.
	 *
	 * pmaps for vkernels do not self-map because they do not share
	 * their address space with the vkernel.  Clearing of pde[] thus
	 * only applies to page table pages and not to the page directory
	 * page.
	 */
	if (p->pindex == pmap->pm_pdindex) {
		bzero(pde, VPTE_PAGETABLE_SIZE);
		pmap_kremove((vm_offset_t)pmap->pm_pdir);
	} else {
		KKASSERT(pde[p->pindex] != 0);
		pmap_inval_pde(&pde[p->pindex], pmap,
			       (vm_offset_t)p->pindex << SEG_SHIFT);
	}

	/*
	 * Clear the matching hint
	 */
	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
		pmap->pm_ptphint = NULL;

	/*
	 * And throw the page away.  The page is completely zero'd out so
	 * optimize the free call.
	 */
	vmstats.v_wire_count--;
	vm_page_free_zero(p);
	return 1;
}
/*
 * This routine is called if the page table page is not mapped in the page
 * table directory.
 *
 * The routine is broken up into two parts for readability.
 *
 * It must return a held mpte and map the page directory page as required.
 * Because vm_page_grab() can block, we must re-check pm_pdir[ptepindex]
 */
_pmap_allocpte(pmap_t pmap, unsigned ptepindex)
{
	/*
	 * Find or fabricate a new pagetable page.  A busied page will be
	 * returned.  This call may block.
	 */
	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
			 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);

	KASSERT(m->queue == PQ_NONE,
		("_pmap_allocpte: %p->queue != PQ_NONE", m));

	/*
	 * Increment the hold count for the page we will be returning to
	 * the caller.
	 */
	m->hold_count++;

	/*
	 * It is possible that someone else got in and mapped by the page
	 * directory page while we were blocked, if so just unbusy and
	 * return the held page.
	 */
	if ((ptepa = pmap->pm_pdir[ptepindex]) != 0) {
		Debugger("PTEPA RACE");
		KKASSERT((ptepa & VPTE_FRAME) == VM_PAGE_TO_PHYS(m));
		vm_page_wakeup(m);
		return(m);
	}

	if (m->wire_count == 0)
		vmstats.v_wire_count++;
	m->wire_count++;

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */
	++pmap->pm_stats.resident_count;

	ptepa = VM_PAGE_TO_PHYS(m);
	pmap->pm_pdir[ptepindex] = (vpte_t)ptepa | VPTE_R | VPTE_W | VPTE_V |
				   VPTE_A | VPTE_M;

	/*
	 * We are likely about to access this page table page, so set the
	 * page table hint to reduce overhead.
	 */
	pmap->pm_ptphint = m;

	/*
	 * Try to use the new mapping, but if we cannot, then
	 * do it with the routine that maps the page explicitly.
	 */
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(ptepa);

	m->valid = VM_PAGE_BITS_ALL;
	vm_page_flag_clear(m, PG_ZERO);
	vm_page_flag_set(m, PG_MAPPED);

	return (m);
}

/*
 * Determine the page table page required to access the VA in the pmap
 * and allocate it if necessary.  Return a held vm_page_t for the page.
 *
 * Only used with user pmaps.
 */
pmap_allocpte(pmap_t pmap, vm_offset_t va)
{
	/*
	 * Calculate pagetable page index
	 */
	ptepindex = va >> PDRSHIFT;

	/*
	 * Get the page directory entry
	 */
	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];

	/*
	 * This supports switching from a 4MB page to a
	 * normal 4K page.
	 */
	if (ptepa & VPTE_PS) {
		KKASSERT(pmap->pm_pdir[ptepindex] != 0);
		pmap_inval_pde(&pmap->pm_pdir[ptepindex], pmap,
			       (vm_offset_t)ptepindex << SEG_SHIFT);
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (ptepa) {
		/*
		 * In order to get the page table page, try the
		 * hint first.
		 */
		if (pmap->pm_ptphint &&
		    (pmap->pm_ptphint->pindex == ptepindex)) {
			m = pmap->pm_ptphint;
		} else {
			m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
			pmap->pm_ptphint = m;
		}
		m->hold_count++;
		return m;
	}
	/*
	 * Here if the pte page isn't mapped, or if it has been deallocated.
	 */
	return _pmap_allocpte(pmap, ptepindex);
}
/************************************************************************
 *			Managed pages in pmaps				*
 ************************************************************************
 *
 * All pages entered into user pmaps and some pages entered into the kernel
 * pmap are managed, meaning that pmap_protect() and other related management
 * functions work on these pages.
 */

/*
 * free the pv_entry back to the free list.  This function may be
 * called from an interrupt.
 */
static __inline void
free_pv_entry(pv_entry_t pv)
{
	pv_entry_count--;
	zfree(&pvzone, pv);
}

/*
 * get a new pv_entry, allocating a block from the system
 * when needed.  This function may be called from an interrupt.
 */
get_pv_entry(void)
{
	if (pv_entry_high_water &&
	    (pv_entry_count > pv_entry_high_water) &&
	    (pmap_pagedaemon_waken == 0)) {
		pmap_pagedaemon_waken = 1;
		wakeup(&vm_pages_needed);
	}
	return zalloc(&pvzone);
}

/*
 * This routine is very drastic, but can save the system
 * in a pinch.
 */
pmap_collect(void)
{
	static int warningdone = 0;

	if (pmap_pagedaemon_waken == 0)
		return;

	if (warningdone < 5) {
		kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
		warningdone++;
	}

	for (i = 0; i < vm_page_array_size; i++) {
		m = &vm_page_array[i];
		if (m->wire_count || m->hold_count || m->busy ||
		    (m->flags & PG_BUSY))
			continue;
		pmap_remove_all(m);
	}
	pmap_pagedaemon_waken = 0;
}
/*
 * If it is the first entry on the list, it is actually
 * in the header and we must copy the following entry up
 * to the header.  Otherwise we must search the list for
 * the entry.  In either case we free the now unused entry.
 */
pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va)
{
	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
			if (pmap == pv->pv_pmap && va == pv->pv_va)
				break;
		}
	} else {
		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
			if (va == pv->pv_va)
				break;
		}
	}

	/*
	 * Note that pv_ptem is NULL if the page table page itself is not
	 * managed, even if the page being removed IS managed.
	 */
	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	m->md.pv_list_count--;
	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
	if (TAILQ_EMPTY(&m->md.pv_list))
		vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
	++pmap->pm_generation;
	rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem);
	free_pv_entry(pv);
	return rtval;
}

/*
 * Create a pv entry for page at pa for (pmap, va).  If the page table page
 * holding the VA is managed, mpte will be non-NULL.
 */
pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
{
	pv = get_pv_entry();
	pv->pv_va = va;
	pv->pv_pmap = pmap;
	pv->pv_ptem = mpte;

	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	m->md.pv_list_count++;
}
/*
 * pmap_remove_pte: do the things to unmap a page in a process
 */
pmap_remove_pte(struct pmap *pmap, vpte_t *ptq, vm_offset_t va)
{
	oldpte = pmap_inval_loadandclear(ptq, pmap, va);
	if (oldpte & VPTE_WIRED)
		--pmap->pm_stats.wired_count;
	KKASSERT(pmap->pm_stats.wired_count >= 0);

	/*
	 * Machines that don't support invlpg, also don't support
	 * VPTE_G.  XXX VPTE_G is disabled for SMP so don't worry about
	 */
	if (oldpte & VPTE_G)
		madvise((void *)va, PAGE_SIZE, MADV_INVAL);

	KKASSERT(pmap->pm_stats.resident_count > 0);
	--pmap->pm_stats.resident_count;
	if (oldpte & VPTE_MANAGED) {
		m = PHYS_TO_VM_PAGE(oldpte);
		if (oldpte & VPTE_M) {
#if defined(PMAP_DIAGNOSTIC)
			if (pmap_nw_modified((pt_entry_t) oldpte)) {
				kprintf(
	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
				    va, oldpte);
			}
#endif
			if (pmap_track_modified(pmap, va))
				vm_page_dirty(m);
		}
		if (oldpte & VPTE_A)
			vm_page_flag_set(m, PG_REFERENCED);
		return pmap_remove_entry(pmap, m, va);
	} else {
		return pmap_unuse_pt(pmap, va, NULL);
	}
}

/*
 * Remove a single page from a process address space.
 *
 * This function may not be called from an interrupt if the pmap is
 * not kernel_pmap.
 */
pmap_remove_page(struct pmap *pmap, vm_offset_t va)
{
	/*
	 * if there is no pte for this address, just skip it!!!  Otherwise
	 * get a local va for mappings for this pmap and remove the entry.
	 */
	if (*pmap_pde(pmap, va) != 0) {
		ptq = get_ptbase(pmap, va);
		pmap_remove_pte(pmap, ptq, va);
	}
}
/*
 * Remove the given range of addresses from the specified map.
 *
 * It is assumed that the start and end are properly
 * rounded to the page size.
 *
 * This function may not be called from an interrupt if the pmap is
 * not kernel_pmap.
 */
pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t ptpaddr;
	vm_pindex_t sindex, eindex;

	KKASSERT(pmap->pm_stats.resident_count >= 0);
	if (pmap->pm_stats.resident_count == 0)
		return;

	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
	 * code.
	 */
	if (((sva + PAGE_SIZE) == eva) &&
	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & VPTE_PS) == 0)) {
		pmap_remove_page(pmap, sva);
		return;
	}

	/*
	 * Get a local virtual address for the mappings that are being
	 * removed.
	 *
	 * XXX this is really messy because the kernel pmap is not relative
	 * to address 0
	 */
	sindex = (sva >> PAGE_SHIFT);
	eindex = (eva >> PAGE_SHIFT);

	for (; sindex < eindex; sindex = pdnxt) {
		/*
		 * Calculate index for next page table.
		 */
		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
		if (pmap->pm_stats.resident_count == 0)
			break;

		pdirindex = sindex / NPDEPG;
		if (((ptpaddr = pmap->pm_pdir[pdirindex]) & VPTE_PS) != 0) {
			KKASSERT(pmap->pm_pdir[pdirindex] != 0);
			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
			pmap_inval_pde(&pmap->pm_pdir[pdirindex], pmap,
				       (vm_offset_t)pdirindex << SEG_SHIFT);
			continue;
		}

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (pdnxt > eindex)
			pdnxt = eindex;

		/*
		 * NOTE: pmap_remove_pte() can block.
		 */
		for (; sindex != pdnxt; sindex++) {
			ptbase = get_ptbase(pmap, sindex << PAGE_SHIFT);
			va = i386_ptob(sindex);
			if (pmap_remove_pte(pmap, ptbase, va))
				break;
		}
	}
}
/*
 * Removes this physical page from all physical maps in which it resides.
 * Reflects back modify bits to the pager.
 *
 * This routine may not be called from an interrupt.
 */
pmap_remove_all(vm_page_t m)
{
#if defined(PMAP_DIAGNOSTIC)
	/*
	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
	 * pages!
	 */
	if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%08llx", (long long)VM_PAGE_TO_PHYS(m));
	}
#endif

	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0);
		--pv->pv_pmap->pm_stats.resident_count;

		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
		KKASSERT(pte != NULL);

		tpte = pmap_inval_loadandclear(pte, pv->pv_pmap, pv->pv_va);
		if (tpte & VPTE_WIRED)
			--pv->pv_pmap->pm_stats.wired_count;
		KKASSERT(pv->pv_pmap->pm_stats.wired_count >= 0);

		if (tpte & VPTE_A)
			vm_page_flag_set(m, PG_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (tpte & VPTE_M) {
#if defined(PMAP_DIAGNOSTIC)
			if (pmap_nw_modified((pt_entry_t) tpte)) {
				kprintf(
	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
				    pv->pv_va, tpte);
			}
#endif
			if (pmap_track_modified(pv->pv_pmap, pv->pv_va))
				vm_page_dirty(m);
		}
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
		++pv->pv_pmap->pm_generation;
		m->md.pv_list_count--;
		if (TAILQ_EMPTY(&m->md.pv_list))
			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
		free_pv_entry(pv);
	}
	KKASSERT((m->flags & (PG_MAPPED | PG_WRITEABLE)) == 0);
}
/*
 * Set the physical protection on the specified range of this map
 * as requested.
 *
 * This function may not be called from an interrupt if the map is
 * not the kernel_pmap.
 */
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	vm_offset_t pdnxt, ptpaddr;
	vm_pindex_t sindex, eindex;

	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}
	if (prot & VM_PROT_WRITE)
		return;

	ptbase = get_ptbase(pmap, sva);

	sindex = (sva >> PAGE_SHIFT);
	eindex = (eva >> PAGE_SHIFT);

	for (; sindex < eindex; sindex = pdnxt) {
		pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));

		pdirindex = sindex / NPDEPG;

		/*
		 * Clear the modified and writable bits for a 4m page.
		 * Throw away the modified bit (?)
		 */
		if (((ptpaddr = pmap->pm_pdir[pdirindex]) & VPTE_PS) != 0) {
			pmap_clean_pde(&pmap->pm_pdir[pdirindex], pmap,
				       (vm_offset_t)pdirindex << SEG_SHIFT);
			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
			continue;
		}

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		if (pdnxt > eindex) {
			pdnxt = eindex;
		}

		for (; sindex != pdnxt; sindex++) {
			/*
			 * Clean managed pages and also check the accessed
			 * bit.  Just remove write perms for unmanaged
			 * pages.  Be careful of races, turning off write
			 * access will force a fault rather than setting
			 * the modified bit at an unexpected time.
			 */
			ptep = &ptbase[sindex - sbase];
			if (*ptep & VPTE_MANAGED) {
				pbits = pmap_clean_pte(ptep, pmap,
						       i386_ptob(sindex));
				if (pbits & VPTE_A) {
					m = PHYS_TO_VM_PAGE(pbits);
					vm_page_flag_set(m, PG_REFERENCED);
					atomic_clear_int(ptep, VPTE_A);
				}
				if (pbits & VPTE_M) {
					if (pmap_track_modified(pmap, i386_ptob(sindex))) {
						m = PHYS_TO_VM_PAGE(pbits);
						vm_page_dirty(m);
					}
				}
			} else {
				pbits = pmap_setro_pte(ptep, pmap,
						       i386_ptob(sindex));
			}
		}
	}
}
/*
 * Enter a managed page into a pmap.  If the page is not wired related pmap
 * data can be destroyed at any time for later demand-operation.
 *
 * Insert the vm_page (m) at virtual address (v) in (pmap), with the
 * specified protection, and wire the mapping if requested.
 *
 * NOTE: This routine may not lazy-evaluate or lose information.  The
 * page must actually be inserted into the given map NOW.
 *
 * NOTE: When entering a page at a KVA address, the pmap must be the
 * kernel_pmap.
 */
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
	   boolean_t wired)
{
	vm_offset_t origpte, newpte;

	/*
	 * Get the page table page.  The kernel_pmap's page table pages
	 * are preallocated and have no associated vm_page_t.
	 */
	if (pmap == &kernel_pmap)
		mpte = NULL;
	else
		mpte = pmap_allocpte(pmap, va);

	pte = pmap_pte(pmap, va);

	/*
	 * Page Directory table entry not valid, we need a new PT page
	 * and pmap_allocpte() didn't give us one.  Oops!
	 */
	if (pte == NULL) {
		panic("pmap_enter: invalid page directory pmap=%p, va=0x%p\n",
		      pmap, (void *)va);
	}

	/*
	 * Deal with races on the original mapping (though don't worry
	 * about VPTE_A races) by cleaning it.  This will force a fault
	 * if an attempt is made to write to the page.
	 */
	pa = VM_PAGE_TO_PHYS(m) & VPTE_FRAME;
	origpte = pmap_clean_pte(pte, pmap, va);
	opa = origpte & VPTE_FRAME;

	if (origpte & VPTE_PS)
		panic("pmap_enter: attempted pmap_enter on 4MB page");

	/*
	 * Mapping has not changed, must be protection or wiring change.
	 */
	if (origpte && (opa == pa)) {
		/*
		 * Wiring change, just update stats.  We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them.  Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if (wired && ((origpte & VPTE_WIRED) == 0))
			++pmap->pm_stats.wired_count;
		else if (!wired && (origpte & VPTE_WIRED))
			--pmap->pm_stats.wired_count;
		KKASSERT(pmap->pm_stats.wired_count >= 0);

		/*
		 * Remove the extra pte reference.  Note that we cannot
		 * optimize the RO->RW case because we have adjusted the
		 * wiring count above and may need to adjust the wiring
		 * bits below.
		 */

		/*
		 * We might be turning off write access to the page,
		 * so we go ahead and sense modify status.
		 */
		if (origpte & VPTE_MANAGED) {
			if ((origpte & VPTE_M) &&
			    pmap_track_modified(pmap, va)) {
				om = PHYS_TO_VM_PAGE(opa);
				vm_page_dirty(om);
			}
			KKASSERT(m->flags & PG_MAPPED);
		}
	}

	/*
	 * Mapping has changed, invalidate old range and fall through to
	 * handle validating new mapping.
	 */
	err = pmap_remove_pte(pmap, pte, va);
	if (err)
		panic("pmap_enter: pte vanished, va: 0x%x", va);

	/*
	 * Enter on the PV list if part of our managed memory.  Note that we
	 * raise IPL while manipulating pv_table since pmap_enter can be
	 * called at interrupt time.
	 */
	if (pmap_initialized &&
	    (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
		pmap_insert_entry(pmap, va, mpte, m);
		pa |= VPTE_MANAGED;
		vm_page_flag_set(m, PG_MAPPED);
	}

	/*
	 * Increment counters
	 */
	++pmap->pm_stats.resident_count;
	if (wired)
		pmap->pm_stats.wired_count++;

	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	newpte = (vm_offset_t) (pa | pte_prot(pmap, prot) | VPTE_V);
	if (wired)
		newpte |= VPTE_WIRED;
	if (pmap != &kernel_pmap)
		newpte |= VPTE_U;

	/*
	 * If the mapping or permission bits are different from the
	 * (now cleaned) original pte, an update is needed.  We've
	 * already downgraded or invalidated the page so all we have
	 * to do now is update the bits.
	 *
	 * XXX should we synchronize RO->RW changes to avoid another
	 * fault?
	 */
	if ((origpte & ~(VPTE_W|VPTE_M|VPTE_A)) != newpte) {
		*pte = newpte | VPTE_A;
		if (newpte & VPTE_W)
			vm_page_flag_set(m, PG_WRITEABLE);
	}
	KKASSERT((newpte & VPTE_MANAGED) == 0 || m->flags & PG_MAPPED);
}
/*
 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
 *
 * Currently this routine may only be used on user pmaps, not kernel_pmap.
 */
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	KKASSERT(pmap != &kernel_pmap);
	KKASSERT(va >= VM_MIN_USER_ADDRESS && va < VM_MAX_USER_ADDRESS);

	/*
	 * Calculate pagetable page (mpte), allocating it if necessary.
	 *
	 * A held page table page (mpte), or NULL, is passed onto the
	 * section following.
	 */
	ptepindex = va >> PDRSHIFT;

	do {
		/*
		 * Get the page directory entry
		 */
		ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];

		/*
		 * If the page table page is mapped, we just increment
		 * the hold count, and activate it.
		 */
		if (ptepa) {
			if (ptepa & VPTE_PS)
				panic("pmap_enter_quick: unexpected mapping into 4MB page");
			if (pmap->pm_ptphint &&
			    (pmap->pm_ptphint->pindex == ptepindex)) {
				mpte = pmap->pm_ptphint;
			} else {
				mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
				pmap->pm_ptphint = mpte;
			}
			if (mpte)
				mpte->hold_count++;
		} else {
			mpte = _pmap_allocpte(pmap, ptepindex);
		}
	} while (mpte == NULL);

	/*
	 * Ok, now that the page table page has been validated, get the pte.
	 * If the pte is already mapped undo mpte's hold_count and
	 * return.
	 */
	pte = pmap_pte(pmap, va);
	if (*pte) {
		pmap_unwire_pte_hold(pmap, mpte);
		return;
	}

	/*
	 * Enter on the PV list if part of our managed memory.  Note that we
	 * raise IPL while manipulating pv_table since pmap_enter can be
	 * called at interrupt time.
	 */
	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
		pmap_insert_entry(pmap, va, mpte, m);
		vm_page_flag_set(m, PG_MAPPED);
	}

	/*
	 * Increment counters
	 */
	++pmap->pm_stats.resident_count;

	pa = VM_PAGE_TO_PHYS(m);

	/*
	 * Now validate mapping with RO protection
	 */
	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
		*pte = (vpte_t)pa | VPTE_V | VPTE_U;
	else
		*pte = (vpte_t)pa | VPTE_V | VPTE_U | VPTE_MANAGED;
	/*pmap_inval_add(&info, pmap, va); shouldn't be needed 0->valid */
	/*pmap_inval_flush(&info); don't need for vkernel */
}
/*
 * Extract the physical address for the translation at the specified
 * virtual address in the pmap.
 */
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	if (pmap && (pte = pmap->pm_pdir[va >> SEG_SHIFT]) != 0) {
		if (pte & VPTE_PS) {
			rtval = pte & ~((vpte_t)(1 << SEG_SHIFT) - 1);
			rtval |= va & SEG_MASK;
		} else {
			pte = *get_ptbase(pmap, va);
			rtval = (pte & VPTE_FRAME) | (va & PAGE_MASK);
		}
	}
	return(rtval);
}
#define MAX_INIT_PT (96)

/*
 * This routine preloads the ptes for a given object into the specified pmap.
 * This eliminates the blast of soft faults on process startup and
 * immediately after an mmap.
 */
static int pmap_object_init_pt_callback(vm_page_t p, void *data);

pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
		    vm_object_t object, vm_pindex_t pindex,
		    vm_size_t size, int limit)
{
	struct rb_vm_page_scan_info info;

	/*
	 * We can't preinit if read access isn't set or there is no pmap
	 * or object.
	 */
	if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL)
		return;

	/*
	 * We can't preinit if the pmap is not the current pmap
	 */
	lp = curthread->td_lwp;
	if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
		return;

	psize = size >> PAGE_SHIFT;

	if ((object->type != OBJT_VNODE) ||
	    ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
	     (object->resident_page_count > MAX_INIT_PT))) {
		return;
	}

	if (psize + pindex > object->size) {
		if (object->size < pindex)
			return;
		psize = object->size - pindex;
	}

	/*
	 * Use a red-black scan to traverse the requested range and load
	 * any valid pages found into the pmap.
	 *
	 * We cannot safely scan the object's memq unless we are in a
	 * critical section since interrupts can remove pages from objects.
	 */
	info.start_pindex = pindex;
	info.end_pindex = pindex + psize - 1;

	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
				pmap_object_init_pt_callback, &info);
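/*
 * Per-page callback for the red-black tree scan above: deactivate pages
 * found in PQ_CACHE and enter each fully-valid, non-busy page into the
 * pmap with pmap_enter_quick().
 */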
pmap_object_init_pt_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	vm_pindex_t rel_index;

	/*
	 * don't allow an madvise to blow away our really
	 * free pages allocating pv entries.
	 */
	if ((info->limit & MAP_PREFAULT_MADVISE) &&
	    vmstats.v_free_count < vmstats.v_free_reserved) {
		return(-1);
	}
	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
	    (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
		if ((p->queue - p->pc) == PQ_CACHE)
			vm_page_deactivate(p);
		rel_index = p->pindex - info->start_pindex;
		pmap_enter_quick(info->pmap,
				 info->addr + i386_ptob(rel_index), p);
	}
	return(0);
}

/*
 * pmap_prefault provides a quick way of clustering pagefaults into a
 * process's address space.  It is a "cousin" of pmap_object_init_pt,
 * except it runs at page fault time instead of mmap time.
 */
#define PAGEORDER_SIZE	(PFBAK+PFFOR)
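/*
 * pmap_prefault_pageorder[] lists the byte offsets, relative to the
 * faulting address, that pmap_prefault() probes when clustering: nearest
 * first, alternating before and after the fault address.
 */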
static int pmap_prefault_pageorder[] = {
	-PAGE_SIZE, PAGE_SIZE,
	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
	-4 * PAGE_SIZE, 4 * PAGE_SIZE
};

pmap_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry)
{
	/*
	 * We do not currently prefault mappings that use virtual page
	 * tables.  We do not prefault foreign pmaps.
	 */
	if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
		return;
	lp = curthread->td_lwp;
	if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace))
		return;

	object = entry->object.vm_object;

	starta = addra - PFBAK * PAGE_SIZE;
	if (starta < entry->start)
		starta = entry->start;
	else if (starta > addra)
		starta = 0;

	/*
	 * critical section protection is required to maintain the
	 * page/object association, interrupts can free pages and remove
	 * them from their objects.
	 */
	for (i = 0; i < PAGEORDER_SIZE; i++) {
		vm_object_t lobject;

		addr = addra + pmap_prefault_pageorder[i];
		if (addr > addra + (PFFOR * PAGE_SIZE))
			addr = 0;

		if (addr < starta || addr >= entry->end)
			continue;

		/*
		 * Make sure the page table page already exists
		 */
		if ((*pmap_pde(pmap, addr)) == 0)
			continue;

		/*
		 * Get a pointer to the pte and make sure that no valid page
		 * has been mapped.
		 */
		pte = get_ptbase(pmap, addr);
		if (*pte)
			continue;

		/*
		 * Get the page to be mapped
		 */
		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
		lobject = object;

		for (m = vm_page_lookup(lobject, pindex);
		    (!m && (lobject->type == OBJT_DEFAULT) &&
		     (lobject->backing_object));
		    lobject = lobject->backing_object
		) {
			if (lobject->backing_object_offset & PAGE_MASK)
				break;
			pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
			m = vm_page_lookup(lobject->backing_object, pindex);
		}

		/*
		 * give-up when a page is not in memory
		 */
		if (m == NULL)
			break;

		/*
		 * If everything meets the requirements for pmap_enter_quick(),
		 * then enter the page.
		 */
		if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
		    (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
			if ((m->queue - m->pc) == PQ_CACHE) {
				vm_page_deactivate(m);
			}
			pmap_enter_quick(pmap, addr, m);
		}
	}
}
/*
 *	Routine:	pmap_change_wiring
 *	Function:	Change the wiring attribute for a map/virtual-address
 *			pair.
 *	In/out conditions:
 *		The mapping must already exist in the pmap.
 */
pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
{
	pte = get_ptbase(pmap, va);

	if (wired && (*pte & VPTE_WIRED) == 0)
		++pmap->pm_stats.wired_count;
	else if (!wired && (*pte & VPTE_WIRED))
		--pmap->pm_stats.wired_count;
	KKASSERT(pmap->pm_stats.wired_count >= 0);

	/*
	 * Wiring is not a hardware characteristic so there is no need to
	 * invalidate TLB.  However, in an SMP environment we must use
	 * a locked bus cycle to update the pte (if we are not using
	 * the pmap_inval_*() API that is)... it's ok to do this for simple
	 * wiring changes.
	 */
	if (wired)
		atomic_set_int(pte, VPTE_WIRED);
	else
		atomic_clear_int(pte, VPTE_WIRED);
}
/*
 * Copy the range specified by src_addr/len
 * from the source map to the range dst_addr/len
 * in the destination map.
 *
 * This routine is only advisory and need not do anything.
 */
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
	  vm_size_t len, vm_offset_t src_addr)
{
	vm_offset_t end_addr = src_addr + len;

	/*
	 * XXX BUGGY.  Among other things srcmpte is assumed to remain
	 * valid through blocking calls, and that's just not going to
	 * work.
	 */
	if (dst_addr != src_addr)
		return;
	if (dst_pmap->pm_pdir == NULL)
		return;
	if (src_pmap->pm_pdir == NULL)
		return;

	src_frame = get_ptbase1(src_pmap, src_addr);
	dst_frame = get_ptbase2(dst_pmap, src_addr);

	/*
	 * critical section protection is required to maintain the page/object
	 * association, interrupts can free pages and remove them from
	 * their objects.
	 */
	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
		vpte_t *src_pte, *dst_pte;
		vm_page_t dstmpte, srcmpte;
		vm_offset_t srcptepaddr;

		if (addr >= VM_MAX_USER_ADDRESS)
			panic("pmap_copy: invalid to pmap_copy page tables\n");

		/*
		 * Don't let optional prefaulting of pages make us go
		 * way below the low water mark of free pages or way
		 * above high water mark of used pv entries.
		 */
		if (vmstats.v_free_count < vmstats.v_free_reserved ||
		    pv_entry_count > pv_entry_high_water)
			break;

		pdnxt = ((addr + PAGE_SIZE*NPTEPG) & ~(PAGE_SIZE*NPTEPG - 1));
		ptepindex = addr >> PDRSHIFT;

		srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
		if (srcptepaddr == 0)
			continue;

		if (srcptepaddr & VPTE_PS) {
			if (dst_pmap->pm_pdir[ptepindex] == 0) {
				dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
				dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
			}
			continue;
		}

		srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
		if ((srcmpte == NULL) || (srcmpte->hold_count == 0) ||
		    (srcmpte->flags & PG_BUSY)) {
			continue;
		}

		if (pdnxt > end_addr)
			pdnxt = end_addr;

		src_pte = src_frame + ((addr - src_addr) >> PAGE_SHIFT);
		dst_pte = dst_frame + ((addr - src_addr) >> PAGE_SHIFT);
		while (addr < pdnxt) {
			/*
			 * we only virtual copy managed pages
			 */
			if ((ptetemp & VPTE_MANAGED) != 0) {
				/*
				 * We have to check after allocpte for the
				 * pte still being around...  allocpte can
				 * block.
				 *
				 * pmap_allocpte can block, unfortunately
				 * we have to reload the tables.
				 */
				dstmpte = pmap_allocpte(dst_pmap, addr);
				src_frame = get_ptbase1(src_pmap, src_addr);
				dst_frame = get_ptbase2(dst_pmap, src_addr);

				if ((*dst_pte == 0) && (ptetemp = *src_pte) &&
				    (ptetemp & VPTE_MANAGED) != 0) {
					/*
					 * Clear the modified and accessed
					 * (referenced) bits during the copy.
					 *
					 * We do not have to clear the write
					 * bit to force a fault-on-modify
					 * because the real kernel's target
					 * pmap is empty and will fault anyway.
					 */
					m = PHYS_TO_VM_PAGE(ptetemp);
					*dst_pte = ptetemp & ~(VPTE_M | VPTE_A);
					++dst_pmap->pm_stats.resident_count;
					pmap_insert_entry(dst_pmap, addr,
							  dstmpte, m);
					KKASSERT(m->flags & PG_MAPPED);
				} else {
					pmap_unwire_pte_hold(dst_pmap, dstmpte);
				}
				if (dstmpte->hold_count >= srcmpte->hold_count)
/*
 * Zero the specified PA by mapping the page into KVM and clearing its
 * contents.
 *
 * This function may be called from an interrupt and no locking is
 * required.
 */
pmap_zero_page(vm_paddr_t phys)
{
	struct mdglobaldata *gd = mdcpu;

	if (*gd->gd_CMAP3)
		panic("pmap_zero_page: CMAP3 busy");
	*gd->gd_CMAP3 = VPTE_V | VPTE_R | VPTE_W | (phys & VPTE_FRAME) | VPTE_A | VPTE_M;
	madvise(gd->gd_CADDR3, PAGE_SIZE, MADV_INVAL);

	bzero(gd->gd_CADDR3, PAGE_SIZE);

/*
 * pmap_page_assertzero:
 *
 *	Assert that a page is empty, panic if it isn't.
 */
pmap_page_assertzero(vm_paddr_t phys)
{
	struct mdglobaldata *gd = mdcpu;

	if (*gd->gd_CMAP3)
		panic("pmap_zero_page: CMAP3 busy");
	*gd->gd_CMAP3 = VPTE_V | VPTE_R | VPTE_W |
			(phys & VPTE_FRAME) | VPTE_A | VPTE_M;
	madvise(gd->gd_CADDR3, PAGE_SIZE, MADV_INVAL);
	for (i = 0; i < PAGE_SIZE; i += 4) {
		if (*(int *)((char *)gd->gd_CADDR3 + i) != 0) {
			panic("pmap_page_assertzero() @ %p not zero!\n",
			      (void *)gd->gd_CADDR3);
		}
	}
}

/*
 * Zero part of a physical page by mapping it into memory and clearing
 * its contents with bzero.
 *
 * off and size may not cover an area beyond a single hardware page.
 */
pmap_zero_page_area(vm_paddr_t phys, int off, int size)
{
	struct mdglobaldata *gd = mdcpu;

	if (*gd->gd_CMAP3)
		panic("pmap_zero_page: CMAP3 busy");
	*gd->gd_CMAP3 = VPTE_V | VPTE_R | VPTE_W |
			(phys & VPTE_FRAME) | VPTE_A | VPTE_M;
	madvise(gd->gd_CADDR3, PAGE_SIZE, MADV_INVAL);

	bzero((char *)gd->gd_CADDR3 + off, size);

/*
 * Copy the physical page from the source PA to the target PA.
 * This function may be called from an interrupt.  No locking
 * is required.
 */
void
pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
{
	struct mdglobaldata *gd = mdcpu;

	crit_enter();
	if (*(int *) gd->gd_CMAP1)
		panic("pmap_copy_page: CMAP1 busy");
	if (*(int *) gd->gd_CMAP2)
		panic("pmap_copy_page: CMAP2 busy");

	*(int *) gd->gd_CMAP1 = VPTE_V | VPTE_R | (src & PG_FRAME) | VPTE_A;
	*(int *) gd->gd_CMAP2 = VPTE_V | VPTE_R | VPTE_W |
				(dst & VPTE_FRAME) | VPTE_A | VPTE_M;
	madvise(gd->gd_CADDR1, PAGE_SIZE, MADV_INVAL);
	madvise(gd->gd_CADDR2, PAGE_SIZE, MADV_INVAL);

	bcopy(gd->gd_CADDR1, gd->gd_CADDR2, PAGE_SIZE);

	*(int *) gd->gd_CMAP1 = 0;
	*(int *) gd->gd_CMAP2 = 0;
	crit_exit();
}

/*
 * pmap_copy_page_frag:
 *
 *	Copy the physical page from the source PA to the target PA.
 *	This function may be called from an interrupt.  No locking
 *	is required.
 */
void
pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
{
	struct mdglobaldata *gd = mdcpu;

	crit_enter();
	if (*(int *) gd->gd_CMAP1)
		panic("pmap_copy_page_frag: CMAP1 busy");
	if (*(int *) gd->gd_CMAP2)
		panic("pmap_copy_page_frag: CMAP2 busy");

	*(int *) gd->gd_CMAP1 = VPTE_V | (src & VPTE_FRAME) | VPTE_A;
	*(int *) gd->gd_CMAP2 = VPTE_V | VPTE_R | VPTE_W |
				(dst & VPTE_FRAME) | VPTE_A | VPTE_M;
	madvise(gd->gd_CADDR1, PAGE_SIZE, MADV_INVAL);
	madvise(gd->gd_CADDR2, PAGE_SIZE, MADV_INVAL);

	bcopy((char *)gd->gd_CADDR1 + (src & PAGE_MASK),
	      (char *)gd->gd_CADDR2 + (dst & PAGE_MASK),
	      bytes);

	*(int *) gd->gd_CMAP1 = 0;
	*(int *) gd->gd_CMAP2 = 0;
	crit_exit();
}

/*
 * Returns true if the pmap's pv is one of the first
 * 16 pvs linked to from this page.  This count may
 * be changed upwards or downwards in the future; it
 * is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 */
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	pv_entry_t pv;
	int loops = 0;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return FALSE;

	crit_enter();

	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		if (pv->pv_pmap == pmap) {
			crit_exit();
			return TRUE;
		}
		loops++;
		if (loops >= 16)
			break;
	}
	crit_exit();
	return (FALSE);
}

/*
 * Remove all pages from specified address space
 * this aids process exit speeds.  Also, this code
 * is special cased for current process only, but
 * can have the more generic (and slightly slower)
 * mode enabled.  This is much faster than pmap_remove
 * in the case of running down an entire address space.
 */
void
pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct lwp *lp;
	vpte_t *pte, tpte;
	pv_entry_t pv, npv;
	vm_page_t m;
	int32_t save_generation;
	int iscurrentpmap;

	lp = curthread->td_lwp;
	if (lp && pmap == vmspace_pmap(lp->lwp_vmspace))
		iscurrentpmap = 1;
	else
		iscurrentpmap = 0;

	crit_enter();
	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
		if (pv->pv_va >= eva || pv->pv_va < sva) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}

		KKASSERT(pmap == pv->pv_pmap);

		pte = pmap_pte(pmap, pv->pv_va);

		/*
		 * We cannot remove wired pages from a process' mapping
		 * at this time
		 */
		if (*pte & VPTE_WIRED) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}
		tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va);

		m = PHYS_TO_VM_PAGE(tpte);

		KASSERT(m < &vm_page_array[vm_page_array_size],
			("pmap_remove_pages: bad tpte %x", tpte));

		KKASSERT(pmap->pm_stats.resident_count > 0);
		--pmap->pm_stats.resident_count;

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (tpte & VPTE_M) {
			vm_page_dirty(m);
		}

		npv = TAILQ_NEXT(pv, pv_plist);
		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
		save_generation = ++pmap->pm_generation;

		m->md.pv_list_count--;
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);

		pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
		free_pv_entry(pv);

		/*
		 * Restart the scan if we blocked during the unuse or free
		 * calls and other removals were made.
		 */
		if (save_generation != pmap->pm_generation) {
			kprintf("Warning: pmap_remove_pages race-A avoided\n");
			pv = TAILQ_FIRST(&pmap->pm_pvlist);
		}
	}
	crit_exit();
}
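
/*
 * Illustration only (userland sketch, not part of this pmap): the
 * pm_generation test above is a general pattern - bump a counter on every
 * list mutation, snapshot it before any call that may block, and restart
 * the scan if the counter moved while we were away.  All names below are
 * hypothetical.
 *
 *	#include <stdio.h>
 *
 *	struct node { struct node *next; int key; };
 *
 *	static int list_generation;	// bumped by every insert/remove
 *
 *	static void
 *	may_block(void)
 *	{
 *		// stand-in for a call that can sleep and let another
 *		// thread modify the list (and bump list_generation)
 *	}
 *
 *	static void
 *	scan(struct node *head)
 *	{
 *		struct node *n = head;
 *
 *		while (n != NULL) {
 *			int save_generation = list_generation;
 *
 *			may_block();
 *			if (save_generation != list_generation) {
 *				n = head;	// list changed: restart
 *				continue;
 *			}
 *			printf("%d\n", n->key);
 *			n = n->next;
 *		}
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		struct node b = { NULL, 2 };
 *		struct node a = { &b, 1 };
 *
 *		scan(&a);		// prints 1 then 2
 *		return (0);
 *	}
 */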

/*
 * pmap_testbit tests bits in active mappings of a VM page.
 */
static boolean_t
pmap_testbit(vm_page_t m, int bit)
{
	pv_entry_t pv;
	vpte_t *pte;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return FALSE;

	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
		return FALSE;

	crit_enter();

	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		/*
		 * if the bit being tested is the modified bit, then
		 * mark clean_map and ptes as never
		 * modified.
		 */
		if (bit & (VPTE_A|VPTE_M)) {
			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (pv->pv_pmap == NULL) {
			kprintf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
			continue;
		}
#endif
		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
		if (*pte & bit) {
			crit_exit();
			return TRUE;
		}
	}
	crit_exit();
	return (FALSE);
}

/*
 * This routine is used to clear bits in ptes.  Certain bits require special
 * handling, in particular (on virtual kernels) the VPTE_M (modify) bit.
 *
 * This routine is only called with certain VPTE_* bit combinations.
 */
static __inline void
pmap_clearbit(vm_page_t m, int bit)
{
	pv_entry_t pv;
	vpte_t *pte;
	vpte_t pbits;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return;

	crit_enter();

	/*
	 * Loop over all current mappings setting/clearing as appropos If
	 * setting RO do we need to clear the VAC?
	 */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		/*
		 * don't write protect pager mappings
		 */
		if (bit == VPTE_W) {
			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (pv->pv_pmap == NULL) {
			kprintf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
			continue;
		}
#endif

		/*
		 * Careful here.  We can use a locked bus instruction to
		 * clear VPTE_A or VPTE_M safely but we need to synchronize
		 * with the target cpus when we mess with VPTE_W.
		 *
		 * On virtual kernels we must force a new fault-on-write
		 * in the real kernel if we clear the Modify bit ourselves,
		 * otherwise the real kernel will not get a new fault and
		 * will never set our Modify bit again.
		 */
		pte = pmap_pte(pv->pv_pmap, pv->pv_va);

		if (bit == VPTE_W) {
			/*
			 * We must also clear VPTE_M when clearing
			 * VPTE_W
			 */
			pbits = pmap_clean_pte(pte, pv->pv_pmap,
					       pv->pv_va);
			if (pbits & VPTE_M)
				vm_page_dirty(m);
		} else if (bit == VPTE_M) {
			/*
			 * We do not have to make the page read-only
			 * when clearing the Modify bit.  The real
			 * kernel will make the real PTE read-only
			 * or otherwise detect the write and set
			 * our VPTE_M again simply by us invalidating
			 * the real kernel VA for the pmap (as we did
			 * above).  This allows the real kernel to
			 * handle the write fault without forwarding
			 * the fault to us.
			 */
			atomic_clear_int(pte, VPTE_M);
		} else if ((bit & (VPTE_W|VPTE_M)) == (VPTE_W|VPTE_M)) {
			/*
			 * We've been asked to clear W & M, I guess
			 * the caller doesn't want us to update
			 * the dirty status of the VM page.
			 */
			pmap_clean_pte(pte, pv->pv_pmap, pv->pv_va);
		} else {
			/*
			 * We've been asked to clear bits that do
			 * not interact with hardware.
			 */
			atomic_clear_int(pte, bit);
		}
	}
	crit_exit();
}
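
/*
 * Illustration only (userland sketch, not part of this pmap):
 * atomic_clear_int(pte, bit) above is a locked and-not on the pte word.
 * The portable C11 equivalent of that operation looks like this (the bit
 * values are made up for the example):
 *
 *	#include <assert.h>
 *	#include <stdatomic.h>
 *	#include <stdint.h>
 *
 *	int
 *	main(void)
 *	{
 *		_Atomic uint32_t pte = 0x61;	// "valid" plus two status bits
 *		uint32_t bit = 0x60;		// the two status bits to clear
 *
 *		// fetch-and with the complement clears exactly those bits
 *		atomic_fetch_and(&pte, ~bit);
 *		assert(atomic_load(&pte) == 0x01);
 *		return (0);
 *	}
 */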

/*
 * pmap_page_protect:
 *
 *	Lower the permission for all mappings to a given page.
 */
void
pmap_page_protect(vm_page_t m, vm_prot_t prot)
{
	if ((prot & VM_PROT_WRITE) == 0) {
		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
			pmap_clearbit(m, VPTE_W);
			vm_page_flag_clear(m, PG_WRITEABLE);
		} else {
			pmap_remove_all(m);
		}
	}
}

vm_paddr_t
pmap_phys_address(int ppn)
{
	return (i386_ptob(ppn));
}

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	XXX: The exact number of bits to check and clear is a matter that
 *	should be tested and standardized at some point in the future for
 *	optimal aging of shared pages.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	pv_entry_t pv, pvf, pvn;
	vpte_t *pte;
	int rtval = 0;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return (rtval);

	crit_enter();

	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pvf = pv;
		do {
			pvn = TAILQ_NEXT(pv, pv_list);

			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);

			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);

			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
				continue;

			pte = pmap_pte(pv->pv_pmap, pv->pv_va);

			if (pte && (*pte & VPTE_A)) {
#ifdef SMP
				atomic_clear_int(pte, VPTE_A);
#else
				atomic_clear_int_nonlocked(pte, VPTE_A);
#endif
				rtval++;
				if (rtval > 4) {
					break;
				}
			}
		} while ((pv = pvn) != NULL && pv != pvf);
	}
	crit_exit();

	return (rtval);
}
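
/*
 * Illustration only (userland sketch, not part of this pmap): the loop
 * above rotates each pv to the tail of the list as it is examined, so
 * successive calls age different mappings first.  The same move-to-tail
 * idiom with <sys/queue.h> works in userland:
 *
 *	#include <sys/queue.h>
 *	#include <stdio.h>
 *
 *	struct ent { TAILQ_ENTRY(ent) link; int id; };
 *	TAILQ_HEAD(entlist, ent);
 *
 *	int
 *	main(void)
 *	{
 *		struct entlist list = TAILQ_HEAD_INITIALIZER(list);
 *		struct ent a = { .id = 1 }, b = { .id = 2 }, *e;
 *
 *		TAILQ_INSERT_TAIL(&list, &a, link);
 *		TAILQ_INSERT_TAIL(&list, &b, link);
 *
 *		// examine the head entry, then rotate it to the tail
 *		e = TAILQ_FIRST(&list);
 *		TAILQ_REMOVE(&list, e, link);
 *		TAILQ_INSERT_TAIL(&list, e, link);
 *
 *		TAILQ_FOREACH(e, &list, link)
 *			printf("%d\n", e->id);	// prints 2 then 1
 *		return (0);
 *	}
 */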

/*
 * Return whether or not the specified physical page was modified
 * in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
	return pmap_testbit(m, VPTE_M);
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	pmap_clearbit(m, VPTE_M);
}

/*
 * pmap_clear_reference:
 *
 *	Clear the reference bit on the specified physical page.
 */
void
pmap_clear_reference(vm_page_t m)
{
	pmap_clearbit(m, VPTE_A);
}

/*
 * Miscellaneous support routines follow
 */
static void
i386_protection_init(void)
{
	int *kp, prot;

	kp = protection_codes;
	for (prot = 0; prot < 8; prot++) {
		if (prot & VM_PROT_READ)
			*kp |= VPTE_R;
		if (prot & VM_PROT_WRITE)
			*kp |= VPTE_W;
		if (prot & VM_PROT_EXECUTE)
			*kp |= VPTE_X;
		++kp;
	}
}
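
/*
 * Illustration only (userland sketch, not part of this pmap): the loop
 * above just precomputes an 8-entry table indexed by the read/write/execute
 * bit combination, so a later lookup is a single array reference.  The
 * P_* and E_* values below are hypothetical.
 *
 *	#include <stdio.h>
 *
 *	#define P_READ	1
 *	#define P_WRITE	2
 *	#define P_EXEC	4
 *
 *	#define E_R	0x2	// hypothetical pte "readable" bit
 *	#define E_W	0x4	// hypothetical pte "writable" bit
 *	#define E_X	0x8	// hypothetical pte "executable" bit
 *
 *	static int codes[8];
 *
 *	int
 *	main(void)
 *	{
 *		int prot;
 *
 *		for (prot = 0; prot < 8; prot++) {
 *			if (prot & P_READ)
 *				codes[prot] |= E_R;
 *			if (prot & P_WRITE)
 *				codes[prot] |= E_W;
 *			if (prot & P_EXEC)
 *				codes[prot] |= E_X;
 *		}
 *		printf("%#x\n", codes[P_READ | P_WRITE]);	// prints 0x6
 *		return (0);
 *	}
 */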

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space. Return a pointer to where it is mapped. This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 *
 * NOTE: we can't use pgeflag unless we invalidate the pages one at
 * a time.
 */
void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{
	vm_offset_t va, tmpva, offset;
	vpte_t *pte;

	offset = pa & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);

	va = kmem_alloc_nofault(&kernel_map, size);
	if (!va)
		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");

	pa = pa & VPTE_FRAME;
	for (tmpva = va; size > 0;) {
		pte = KernelPTA + (tmpva >> PAGE_SHIFT);
		*pte = pa | VPTE_R | VPTE_W | VPTE_V; /* | pgeflag; */
		size -= PAGE_SIZE;
		tmpva += PAGE_SIZE;
		pa += PAGE_SIZE;
	}
	cpu_invltlb();
	smp_invltlb();

	return ((void *)(va + offset));
}
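
/*
 * Illustration only (userland sketch): the offset/size arithmetic above
 * keeps the caller's byte offset within the first page.  With PAGE_SIZE
 * 4096, mapping pa 0x12345678 with size 0x100 yields offset 0x678, a
 * one-page rounded mapping, and a returned pointer of va + 0x678.
 *
 *	#include <assert.h>
 *
 *	#define PGSZ		4096u
 *	#define RNDUP(x)	(((x) + PGSZ - 1) / PGSZ * PGSZ)
 *
 *	int
 *	main(void)
 *	{
 *		unsigned pa = 0x12345678u, size = 0x100u;
 *		unsigned offset = pa % PGSZ;
 *
 *		assert(offset == 0x678u);
 *		assert(RNDUP(offset + size) == PGSZ);	// one page suffices
 *		return (0);
 *	}
 */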

void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
	vm_offset_t base, offset;

	base = va & VPTE_FRAME;
	offset = va & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);
	pmap_qremove(va, size >> PAGE_SHIFT);
	kmem_free(&kernel_map, base, size);
}

/*
 * perform the pmap work for mincore
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr)
{
	vpte_t *ptep, pte;
	vm_page_t m;
	int val = 0;

	ptep = pmap_pte(pmap, addr);
	if (ptep == NULL)
		return 0;

	if ((pte = *ptep) != 0) {
		vm_paddr_t pa;

		val = MINCORE_INCORE;
		if ((pte & VPTE_MANAGED) == 0)
			return val;

		pa = pte & VPTE_FRAME;

		m = PHYS_TO_VM_PAGE(pa);

		/*
		 * Modified by us
		 */
		if (pte & VPTE_M)
			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
		/*
		 * Modified by someone
		 */
		else if (m->dirty || pmap_is_modified(m))
			val |= MINCORE_MODIFIED_OTHER;
		/*
		 * Accessed by us
		 */
		if (pte & VPTE_A)
			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
		/*
		 * Referenced by someone
		 */
		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
			val |= MINCORE_REFERENCED_OTHER;
			vm_page_flag_set(m, PG_REFERENCED);
		}
	}
	return val;
}
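
/*
 * Illustration only (userland sketch): the MINCORE_* bits built up above
 * are what mincore(2) ultimately reports back to userland, one status
 * byte per page of the queried range.
 *
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		size_t pgsz = (size_t)getpagesize();
 *		char *buf;
 *		char vec;
 *
 *		buf = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
 *			   MAP_ANON | MAP_PRIVATE, -1, 0);
 *		if (buf == MAP_FAILED)
 *			return (1);
 *		memset(buf, 1, pgsz);	// touch the page so it is resident
 *		if (mincore(buf, pgsz, &vec) == 0)
 *			printf("incore=%d\n", (vec & MINCORE_INCORE) != 0);
 *		munmap(buf, pgsz);
 *		return (0);
 *	}
 */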

void
pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
{
	struct vmspace *oldvm;
	struct lwp *lp;

	oldvm = p->p_vmspace;
	crit_enter();
	if (oldvm != newvm) {
		p->p_vmspace = newvm;
		KKASSERT(p->p_nthreads == 1);
		lp = RB_ROOT(&p->p_lwp_tree);
		pmap_setlwpvm(lp, newvm);
		if (adjrefs) {
			sysref_get(&newvm->vm_sysref);
			sysref_put(&oldvm->vm_sysref);
		}
	}
	crit_exit();
}

void
pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
{
	struct vmspace *oldvm;
	struct pmap *pmap;

	crit_enter();
	oldvm = lp->lwp_vmspace;

	if (oldvm != newvm) {
		lp->lwp_vmspace = newvm;
		if (curthread->td_lwp == lp) {
			pmap = vmspace_pmap(newvm);
#if defined(SMP)
			atomic_set_int(&pmap->pm_active, 1 << mycpu->gd_cpuid);
#else
			pmap->pm_active |= 1;
#endif
#if defined(SWTCH_OPTIM_STATS)
			tlb_flush_count++;
#endif
			pmap = vmspace_pmap(oldvm);
#if defined(SMP)
			atomic_clear_int(&pmap->pm_active,
					 1 << mycpu->gd_cpuid);
#else
			pmap->pm_active &= ~1;
#endif
		}
	}
	crit_exit();
}

vm_offset_t
pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
{
	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
		return addr;
	}

	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return addr;
}

static void	pads (pmap_t pm);
void		pmap_pvdump (vm_paddr_t pa);

/* print address space of pmap*/
static void
pads(pmap_t pm)
{
	vm_offset_t va;
	vpte_t *ptep;
	int i, j;

	if (pm == &kernel_pmap)
		return;
	for (i = 0; i < 1024; i++) {
		if (pm->pm_pdir[i]) {
			for (j = 0; j < 1024; j++) {
				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
				if (pm == &kernel_pmap && va < KERNBASE)
					continue;
				if (pm != &kernel_pmap && va > UPT_MAX_ADDRESS)
					continue;
				ptep = pmap_pte(pm, va);
				if (ptep && (*ptep & VPTE_V)) {
					kprintf("%p:%x ",
						(void *)va, (unsigned)*ptep);
				}
			}
		}
	}
}

void
pmap_pvdump(vm_paddr_t pa)
{
	pv_entry_t pv;
	vm_page_t m;

	kprintf("pa %08llx", (long long)pa);
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
#ifdef used_to_be
		kprintf(" -> pmap %p, va %x, flags %x",
			(void *)pv->pv_pmap, pv->pv_va, pv->pv_flags);
#endif
		kprintf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);