/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2004-2006 Matthew Dillon
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from:   @(#)pmap.c	7.7 (Berkeley)	5/12/91
 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
 */
/*
 * NOTE: PMAP_INVAL_ADD: In pc32 this function is called prior to adjusting
 * the PTE in the page table, because a cpu synchronization might be required.
 * The actual invalidation is delayed until the following call or flush.  In
 * the VKERNEL build this function is called prior to adjusting the PTE and
 * invalidates the table synchronously (not delayed), and is not SMP safe.
 */
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vkernel.h>
#include <sys/thread.h>
#include <sys/vmspace.h>

#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_zone.h>
#include <vm/vm_pageout.h>

#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/pmap_inval.h>
#include <machine/globaldata.h>

#include <sys/sysref2.h>
struct pmap kernel_pmap;

static struct vm_zone pvzone;
static struct vm_object pvzone_obj;
static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
static int pv_entry_count;
static int pv_entry_max;
static int pv_entry_high_water;
static int pmap_pagedaemon_waken;
static boolean_t pmap_initialized = FALSE;
static int protection_codes[8];
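
/*
 * Quick summary of the globals declared above (descriptive only):
 *
 * - kernel_pmap describes the virtual kernel's own KVA mappings.
 * - pvzone/pvzone_obj back the pv_entry allocator; every managed mapping
 *   of a vm_page consumes one pv_entry.
 * - pv_entry_max and pv_entry_high_water bound pv_entry usage, and
 *   pmap_pagedaemon_waken lets the allocator wake the pagedaemon when the
 *   high water mark is crossed.
 * - protection_codes[] translates VM_PROT_* combinations into VPTE bits
 *   and is filled in by i386_protection_init().
 */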
static void	i386_protection_init(void);
static void	pmap_remove_all(vm_page_t m);
static int	pmap_release_free_page(struct pmap *pmap, vm_page_t p);
#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#define pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))

#define pte_prot(m, p) \
	(protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
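
/*
 * Illustrative use of pte_prot() (not part of the original file): the macro
 * simply indexes protection_codes[] with the low VM_PROT_* bits, so a
 * read/write request becomes whatever VPTE bits i386_protection_init()
 * stored for that combination, e.g.
 *
 *	vpte_t bits = pte_prot(pmap, VM_PROT_READ | VM_PROT_WRITE);
 *	npte = (vpte_t)pa | bits | VPTE_V;
 */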
/*
 * Initialize the pmap module: set up the per-page pv lists and bootstrap
 * the pv_entry zone.
 */
void
pmap_init(void)
{
	int i;
	vm_page_t m;
	struct pv_entry *pvinit;

	for (i = 0; i < vm_page_array_size; i++) {
		m = &vm_page_array[i];
		TAILQ_INIT(&m->md.pv_list);
		m->md.pv_list_count = 0;
	}

	i = vm_page_array_size;

	pvinit = (struct pv_entry *)kmem_alloc(&kernel_map, i*sizeof(*pvinit));
	zbootinit(&pvzone, "PV ENTRY", sizeof(*pvinit), pvinit, i);

	pmap_initialized = TRUE;
}
/*
 * Initialize the address space (zone) for the pv_entries.  Set a high
 * water mark so that the system can recover from excessive numbers of
 * pv entries.
 */
void
pmap_init2(void)
{
	int shpgperproc = PMAP_SHPGPERPROC;

	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);
	zinitna(&pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
}
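
/*
 * The sizing above can be influenced through the tunables fetched with
 * TUNABLE_INT_FETCH(); for example (illustrative only, the exact mechanism
 * depends on how the vkernel's environment is populated):
 *
 *	vm.pmap.shpgperproc=300
 *	vm.pmap.pv_entries=2000000
 *
 * pv_entry_high_water is deliberately 90% of pv_entry_max so get_pv_entry()
 * can wake the pagedaemon before the zone is actually exhausted.
 */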
/*
 * Bootstrap the kernel_pmap so it can be used with pmap_enter().
 *
 * NOTE! pm_pdir for the kernel pmap is offset so VA's translate
 * directly into PTD indexes (PTA is also offset for the same reason).
 * This is necessary because, for now, KVA is not mapped at address 0.
 *
 * Page table pages are not managed like they are in normal pmaps, so
 * no pteobj is needed.
 */
void
pmap_bootstrap(void)
{
	vm_pindex_t i = (vm_offset_t)KernelPTD >> PAGE_SHIFT;

	kernel_pmap.pm_pdir = KernelPTD - (KvaStart >> SEG_SHIFT);
	kernel_pmap.pm_pdirpte = KernelPTA[i];
	kernel_pmap.pm_count = 1;
	kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
	TAILQ_INIT(&kernel_pmap.pm_pvlist);
	i386_protection_init();
}
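
/*
 * Illustrative consequence of the bias applied above: for a kernel virtual
 * address the page directory can be indexed directly,
 *
 *	vpte_t pde = kernel_pmap.pm_pdir[va >> SEG_SHIFT];
 *
 * without first subtracting KvaStart, because pm_pdir was offset by
 * (KvaStart >> SEG_SHIFT) when it was derived from KernelPTD.
 */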
/*
 * Initialize pmap0/vmspace0.  Since process 0 never enters user mode we
 * just dummy it up so it works well enough for fork().
 *
 * In DragonFly, process pmaps may only be used to manipulate user address
 * space, never kernel address space.
 */
void
pmap_pinit0(struct pmap *pmap)
{
	pmap_pinit(pmap);
}
/************************************************************************
 *		Procedures to manage whole physical maps		*
 ************************************************************************
 *
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
void
pmap_pinit(struct pmap *pmap)
{
	vm_page_t ptdpg;
	int npages;

	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pdir == NULL) {
		pmap->pm_pdir =
			(vpte_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
	}

	/*
	 * allocate object for the pte array and page directory
	 */
	npages = VPTE_PAGETABLE_SIZE +
		 (VM_MAX_USER_ADDRESS / PAGE_SIZE) * sizeof(vpte_t);
	npages = (npages + PAGE_MASK) / PAGE_SIZE;

	if (pmap->pm_pteobj == NULL)
		pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, npages);
	pmap->pm_pdindex = npages - 1;

	/*
	 * allocate the page directory page
	 */
	ptdpg = vm_page_grab(pmap->pm_pteobj, pmap->pm_pdindex,
			     VM_ALLOC_NORMAL | VM_ALLOC_RETRY);

	ptdpg->wire_count = 1;
	++vmstats.v_wire_count;

	/* not usually mapped */
	vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY);
	ptdpg->valid = VM_PAGE_BITS_ALL;

	pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
	pmap->pm_pdirpte = KernelPTA[(vm_offset_t)pmap->pm_pdir >> PAGE_SHIFT];
	if ((ptdpg->flags & PG_ZERO) == 0)
		bzero(pmap->pm_pdir, PAGE_SIZE);

	pmap->pm_ptphint = NULL;
	pmap->pm_cpucachemask = 0;
	TAILQ_INIT(&pmap->pm_pvlist);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	pmap->pm_stats.resident_count = 1;
}
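
/*
 * Sketch of the pm_pteobj layout after pmap_pinit() (descriptive only):
 *
 *	pindex 0 .. npages-2		page table pages, created on demand
 *	pindex pm_pdindex (npages-1)	the page directory page
 *
 * The directory page is wired and mapped at pm_pdir via pmap_kenter(), and
 * pm_pdirpte caches the kernel pte that maps it so the real kernel can be
 * pointed at it with MADV_SETMAP (see cpu_vmspace_alloc() below).
 */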
/*
 * Clean up a pmap structure so it can be physically freed.
 */
void
pmap_puninit(pmap_t pmap)
{
	lwkt_gettoken(&vm_token);
	if (pmap->pm_pdir) {
		kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pdir, PAGE_SIZE);
		pmap->pm_pdir = NULL;
	}
	if (pmap->pm_pteobj) {
		vm_object_deallocate(pmap->pm_pteobj);
		pmap->pm_pteobj = NULL;
	}
	lwkt_reltoken(&vm_token);
}
/*
 * Wire in kernel global address entries.  To avoid a race condition
 * between pmap initialization and pmap_growkernel, this procedure
 * adds the pmap to the master list (which growkernel scans to update),
 * then copies the template.
 *
 * In a virtual kernel there are no kernel global address entries.
 */
void
pmap_pinit2(struct pmap *pmap)
{
	lwkt_gettoken(&vm_token);
	TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
	lwkt_reltoken(&vm_token);
}
/*
 * Release all resources held by the given physical map.
 *
 * Should only be called if the map contains no valid mappings.
 */
static int pmap_release_callback(struct vm_page *p, void *data);

void
pmap_release(struct pmap *pmap)
{
	struct mdglobaldata *gd = mdcpu;
	vm_object_t object = pmap->pm_pteobj;
	struct rb_vm_page_scan_info info;

	KKASSERT(pmap != &kernel_pmap);

#if defined(DIAGNOSTIC)
	if (object->ref_count != 1)
		panic("pmap_release: pteobj reference count != 1");
#endif
	/*
	 * Once we destroy the page table, the mapping becomes invalid.
	 * Don't waste time doing a madvise to invalidate the mapping, just
	 * set cpucachemask to 0.
	 */
	if (pmap->pm_pdir == gd->gd_PT1pdir) {
		gd->gd_PT1pdir = NULL;
		/* madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL); */
	}
	if (pmap->pm_pdir == gd->gd_PT2pdir) {
		gd->gd_PT2pdir = NULL;
		/* madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL); */
	}
	if (pmap->pm_pdir == gd->gd_PT3pdir) {
		gd->gd_PT3pdir = NULL;
		/* madvise(gd->gd_PT3map, SEG_SIZE, MADV_INVAL); */
	}

	info.pmap = pmap;
	info.object = object;

	lwkt_gettoken(&vm_token);
	TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);

	do {
		info.error = 0;
		info.mpte = NULL;
		info.limit = object->generation;

		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					pmap_release_callback, &info);
		if (info.error == 0 && info.mpte) {
			if (!pmap_release_free_page(pmap, info.mpte))
				info.error = 1;
		}
	} while (info.error);

	/*
	 * Leave the KVA reservation for pm_pdir cached for later reuse.
	 */
	pmap->pm_pdirpte = 0;
	pmap->pm_cpucachemask = 0;
	lwkt_reltoken(&vm_token);
}
/*
 * Callback to release a page table page backing a directory.
 */
static int
pmap_release_callback(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	if (p->pindex == info->pmap->pm_pdindex) {
		info->mpte = p;
		return(0);
	}
	if (!pmap_release_free_page(info->pmap, p)) {
		info->error = 1;
		return(-1);
	}
	if (info->object->generation != info->limit) {
		info->error = 1;
		return(-1);
	}
	return(0);
}
/*
 * Retire the given physical map from service.  Should only be called if
 * the map contains no valid mappings.
 */
void
pmap_destroy(pmap_t pmap)
{
	lwkt_gettoken(&vm_token);
	if (--pmap->pm_count == 0) {
		panic("destroying a pmap is not yet implemented");
	}
	lwkt_reltoken(&vm_token);
}

/*
 * Add a reference to the specified pmap.
 */
void
pmap_reference(pmap_t pmap)
{
	lwkt_gettoken(&vm_token);
	++pmap->pm_count;
	lwkt_reltoken(&vm_token);
}
/************************************************************************
 *			VMSPACE MANAGEMENT				*
 ************************************************************************
 *
 * The VMSPACE management we do in our virtual kernel must be reflected
 * in the real kernel.  This is accomplished by making vmspace system
 * calls to the real kernel.
 */
void
cpu_vmspace_alloc(struct vmspace *vm)
{
	int r;
	void *rp;

#define LAST_EXTENT	(VM_MAX_USER_ADDRESS - 0x80000000)

	if (vmspace_create(&vm->vm_pmap, 0, NULL) < 0)
		panic("vmspace_create() failed");

	rp = vmspace_mmap(&vm->vm_pmap, (void *)0x00000000, 0x40000000,
			  PROT_READ|PROT_WRITE,
			  MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
			  MemImageFd, 0x00000000);
	if (rp == MAP_FAILED)
		panic("vmspace_mmap: failed1");
	vmspace_mcontrol(&vm->vm_pmap, (void *)0x00000000, 0x40000000,
			 MADV_NOSYNC, 0);
	rp = vmspace_mmap(&vm->vm_pmap, (void *)0x40000000, 0x40000000,
			  PROT_READ|PROT_WRITE,
			  MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
			  MemImageFd, 0x40000000);
	if (rp == MAP_FAILED)
		panic("vmspace_mmap: failed2");
	vmspace_mcontrol(&vm->vm_pmap, (void *)0x40000000, 0x40000000,
			 MADV_NOSYNC, 0);
	rp = vmspace_mmap(&vm->vm_pmap, (void *)0x80000000, LAST_EXTENT,
			  PROT_READ|PROT_WRITE,
			  MAP_FILE|MAP_SHARED|MAP_VPAGETABLE|MAP_FIXED,
			  MemImageFd, 0x80000000);
	vmspace_mcontrol(&vm->vm_pmap, (void *)0x80000000, LAST_EXTENT,
			 MADV_NOSYNC, 0);
	if (rp == MAP_FAILED)
		panic("vmspace_mmap: failed3");

	r = vmspace_mcontrol(&vm->vm_pmap, (void *)0x00000000, 0x40000000,
			     MADV_SETMAP, vmspace_pmap(vm)->pm_pdirpte);
	if (r < 0)
		panic("vmspace_mcontrol: failed1");
	r = vmspace_mcontrol(&vm->vm_pmap, (void *)0x40000000, 0x40000000,
			     MADV_SETMAP, vmspace_pmap(vm)->pm_pdirpte);
	if (r < 0)
		panic("vmspace_mcontrol: failed2");
	r = vmspace_mcontrol(&vm->vm_pmap, (void *)0x80000000, LAST_EXTENT,
			     MADV_SETMAP, vmspace_pmap(vm)->pm_pdirpte);
	if (r < 0)
		panic("vmspace_mcontrol: failed3");
}

void
cpu_vmspace_free(struct vmspace *vm)
{
	if (vmspace_destroy(&vm->vm_pmap) < 0)
		panic("vmspace_destroy() failed");
}
/************************************************************************
 *	    Procedures which operate directly on the kernel PMAP	*
 ************************************************************************/

/*
 * This maps the requested page table and gives us access to it.
 *
 * This routine can be called from a potentially preempting interrupt
 * thread or from a normal thread.
 */
static vpte_t *
get_ptbase(struct pmap *pmap, vm_offset_t va)
{
	struct mdglobaldata *gd = mdcpu;

	if (pmap == &kernel_pmap) {
		KKASSERT(va >= KvaStart && va < KvaEnd);
		return(KernelPTA + (va >> PAGE_SHIFT));
	} else if (pmap->pm_pdir == gd->gd_PT1pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT1pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask, gd->mi.gd_cpumask);
		}
		return(gd->gd_PT1map + (va >> PAGE_SHIFT));
	} else if (pmap->pm_pdir == gd->gd_PT2pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT2pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask, gd->mi.gd_cpumask);
		}
		return(gd->gd_PT2map + (va >> PAGE_SHIFT));
	}

	/*
	 * If we aren't running from a potentially preempting interrupt,
	 * load a new page table directory into the page table cache
	 */
	if (gd->mi.gd_intr_nesting_level == 0 &&
	    (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0) {
		/*
		 * Choose one or the other and map the page table
		 * in the KVA space reserved for it.
		 */
		if ((gd->gd_PTflip = 1 - gd->gd_PTflip) == 0) {
			gd->gd_PT1pdir = pmap->pm_pdir;
			*gd->gd_PT1pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask,
				       gd->mi.gd_cpumask);
			return(gd->gd_PT1map + (va >> PAGE_SHIFT));
		} else {
			gd->gd_PT2pdir = pmap->pm_pdir;
			*gd->gd_PT2pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask,
				       gd->mi.gd_cpumask);
			return(gd->gd_PT2map + (va >> PAGE_SHIFT));
		}
	}

	/*
	 * If we are running from a preempting interrupt use a private
	 * map.  The caller must be in a critical section.
	 */
	KKASSERT(IN_CRITICAL_SECT(curthread));
	if (pmap->pm_pdir == gd->gd_PT3pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT3pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT3map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask,
				       gd->mi.gd_cpumask);
		}
	} else {
		gd->gd_PT3pdir = pmap->pm_pdir;
		*gd->gd_PT3pde = pmap->pm_pdirpte;
		madvise(gd->gd_PT3map, SEG_SIZE, MADV_INVAL);
		atomic_set_int(&pmap->pm_cpucachemask,
			       gd->mi.gd_cpumask);
	}
	return(gd->gd_PT3map + (va >> PAGE_SHIFT));
}
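
/*
 * In short, each cpu keeps three KVA windows for foreign page tables:
 * PT1 and PT2 are recycled round-robin by normal threads (gd_PTflip selects
 * the victim), while PT3 is reserved for preempting interrupt threads,
 * which must already be in a critical section.  pm_cpucachemask remembers
 * which cpus currently have a valid window for this pmap so the
 * madvise(MADV_INVAL) resync can be skipped on repeat lookups.
 */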
static vpte_t *
get_ptbase1(struct pmap *pmap, vm_offset_t va)
{
	struct mdglobaldata *gd = mdcpu;

	if (pmap == &kernel_pmap) {
		KKASSERT(va >= KvaStart && va < KvaEnd);
		return(KernelPTA + (va >> PAGE_SHIFT));
	} else if (pmap->pm_pdir == gd->gd_PT1pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT1pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask, gd->mi.gd_cpumask);
		}
		return(gd->gd_PT1map + (va >> PAGE_SHIFT));
	}
	KKASSERT(gd->mi.gd_intr_nesting_level == 0 &&
		 (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0);
	gd->gd_PT1pdir = pmap->pm_pdir;
	*gd->gd_PT1pde = pmap->pm_pdirpte;
	madvise(gd->gd_PT1map, SEG_SIZE, MADV_INVAL);
	return(gd->gd_PT1map + (va >> PAGE_SHIFT));
}

static vpte_t *
get_ptbase2(struct pmap *pmap, vm_offset_t va)
{
	struct mdglobaldata *gd = mdcpu;

	if (pmap == &kernel_pmap) {
		KKASSERT(va >= KvaStart && va < KvaEnd);
		return(KernelPTA + (va >> PAGE_SHIFT));
	} else if (pmap->pm_pdir == gd->gd_PT2pdir) {
		if ((pmap->pm_cpucachemask & gd->mi.gd_cpumask) == 0) {
			*gd->gd_PT2pde = pmap->pm_pdirpte;
			madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
			atomic_set_int(&pmap->pm_cpucachemask, gd->mi.gd_cpumask);
		}
		return(gd->gd_PT2map + (va >> PAGE_SHIFT));
	}
	KKASSERT(gd->mi.gd_intr_nesting_level == 0 &&
		 (gd->mi.gd_curthread->td_flags & TDF_INTTHREAD) == 0);
	gd->gd_PT2pdir = pmap->pm_pdir;
	*gd->gd_PT2pde = pmap->pm_pdirpte;
	madvise(gd->gd_PT2map, SEG_SIZE, MADV_INVAL);
	return(gd->gd_PT2map + (va >> PAGE_SHIFT));
}
/*
 * Return a pointer to the page table entry for the specified va in the
 * specified pmap.  NULL is returned if there is no valid page table page.
 */
static __inline vpte_t *
pmap_pte(struct pmap *pmap, vm_offset_t va)
{
	vpte_t *ptep;

	ptep = &pmap->pm_pdir[va >> SEG_SHIFT];
	if (*ptep & VPTE_PS)
		return(ptep);
	if (*ptep == 0)
		return(NULL);
	return (get_ptbase(pmap, va));
}

/*
 * Enter a mapping into kernel_pmap.  Mappings created in this fashion
 * are not managed.  Mappings must be immediately accessible on all cpus.
 *
 * Call pmap_inval_pte() to invalidate the virtual pte and clean out the
 * real pmap and handle related races before storing the new vpte.
 */
void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	vpte_t *ptep;
	vpte_t npte;

	KKASSERT(va >= KvaStart && va < KvaEnd);
	npte = (vpte_t)pa | VPTE_R | VPTE_W | VPTE_V;
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	if (*ptep & VPTE_V)
		pmap_inval_pte(ptep, &kernel_pmap, va);
	*ptep = npte;
}
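
/*
 * Example (illustrative only): wiring a freshly allocated page into KVA.
 * "some_kva" stands for any page-aligned address in [KvaStart, KvaEnd):
 *
 *	pmap_kenter(some_kva, VM_PAGE_TO_PHYS(m));
 *	... use the mapping on any cpu ...
 *	pmap_kremove(some_kva);
 */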
/*
 * Synchronize a kvm mapping originally made for the private use on
 * some other cpu so it can be used on all cpus.
 *
 * XXX add MADV_RESYNC to improve performance.
 */
void
pmap_kenter_sync(vm_offset_t va)
{
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}

/*
 * Synchronize a kvm mapping originally made for the private use on
 * some other cpu so it can be used on our cpu.  Turns out to be the
 * same madvise() call, because we have to sync the real pmaps anyway.
 *
 * XXX add MADV_RESYNC to improve performance.
 */
void
pmap_kenter_sync_quick(vm_offset_t va)
{
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}

/*
 * Make a previously read-only kernel mapping R+W (not implemented by
 * virtual kernels).
 */
void
pmap_kmodify_rw(vm_offset_t va)
{
	*pmap_kpte(va) |= VPTE_R | VPTE_W;
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}

/*
 * Make a kernel mapping non-cacheable (not applicable to virtual kernels).
 */
void
pmap_kmodify_nc(vm_offset_t va)
{
	*pmap_kpte(va) |= VPTE_N;
	madvise((void *)va, PAGE_SIZE, MADV_INVAL);
}
/*
 * Map a contiguous range of physical memory to a KVM.
 */
vm_offset_t
pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t sva, virt;

	sva = virt = *virtp;
	while (start < end) {
		pmap_kenter(virt, start);
		virt += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	*virtp = virt;
	return (sva);
}

/*
 * Return a pointer to the kernel pte for the given KVA.
 */
vpte_t *
pmap_kpte(vm_offset_t va)
{
	vpte_t *ptep;

	KKASSERT(va >= KvaStart && va < KvaEnd);
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	return(ptep);
}
/*
 * Enter an unmanaged KVA mapping for the private use of the current
 * cpu only.  pmap_kenter_sync() may be called to make the mapping usable
 * on other cpus.
 *
 * It is illegal for the mapping to be accessed by other cpus unless
 * pmap_kenter_sync*() is called.
 */
void
pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
{
	vpte_t *ptep;
	vpte_t npte;

	KKASSERT(va >= KvaStart && va < KvaEnd);

	npte = (vpte_t)pa | VPTE_R | VPTE_W | VPTE_V;
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	if (*ptep & VPTE_V)
		pmap_inval_pte_quick(ptep, &kernel_pmap, va);
	*ptep = npte;
}

/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, int i)
{
	pmap_kenter(crashdumpmap + (i * PAGE_SIZE), pa);
	return ((void *)crashdumpmap);
}
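
/*
 * Example (illustrative only): a panic dump routine can walk physical
 * memory through the crashdumpmap window one page at a time.  The
 * write_dump_block() consumer named below is hypothetical, not part of
 * this file:
 *
 *	for (n = 0; n < npages; ++n) {
 *		void *va = pmap_kenter_temporary(pa + n * PAGE_SIZE, 0);
 *		write_dump_block(va);
 *	}
 */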
/*
 * Remove an unmanaged mapping created with pmap_kenter*().
 */
void
pmap_kremove(vm_offset_t va)
{
	vpte_t *ptep;

	KKASSERT(va >= KvaStart && va < KvaEnd);

	ptep = KernelPTA + (va >> PAGE_SHIFT);
	if (*ptep & VPTE_V)
		pmap_inval_pte(ptep, &kernel_pmap, va);
	*ptep = 0;
}

/*
 * Remove an unmanaged mapping created with pmap_kenter*() but synchronize
 * only with this cpu.
 *
 * Unfortunately because we optimize new entries by testing VPTE_V later
 * on, we actually still have to synchronize with all the cpus.  XXX maybe
 * store a junk value and test against 0 in the other places instead?
 */
void
pmap_kremove_quick(vm_offset_t va)
{
	vpte_t *ptep;

	KKASSERT(va >= KvaStart && va < KvaEnd);

	ptep = KernelPTA + (va >> PAGE_SHIFT);
	if (*ptep & VPTE_V)
		pmap_inval_pte(ptep, &kernel_pmap, va); /* NOT _quick */
	*ptep = 0;
}
/*
 * Extract the physical address from the kernel_pmap that is associated
 * with the specified virtual address.
 */
vm_paddr_t
pmap_kextract(vm_offset_t va)
{
	vpte_t *ptep;
	vm_paddr_t pa;

	KKASSERT(va >= KvaStart && va < KvaEnd);

	ptep = KernelPTA + (va >> PAGE_SHIFT);
	pa = (vm_paddr_t)(*ptep & VPTE_FRAME) | (va & PAGE_MASK);
	return(pa);
}

/*
 * Map a set of unmanaged VM pages into KVM.
 */
void
pmap_qenter(vm_offset_t va, struct vm_page **m, int count)
{
	KKASSERT(va >= KvaStart && va + count * PAGE_SIZE < KvaEnd);
	while (count) {
		vpte_t *ptep;

		ptep = KernelPTA + (va >> PAGE_SHIFT);
		if (*ptep & VPTE_V)
			pmap_inval_pte(ptep, &kernel_pmap, va);
		*ptep = (vpte_t)(*m)->phys_addr | VPTE_R | VPTE_W | VPTE_V;
		--count;
		++m;
		va += PAGE_SIZE;
	}
}

/*
 * Undo the effects of pmap_qenter*().
 */
void
pmap_qremove(vm_offset_t va, int count)
{
	KKASSERT(va >= KvaStart && va + count * PAGE_SIZE < KvaEnd);
	while (count) {
		vpte_t *ptep;

		ptep = KernelPTA + (va >> PAGE_SHIFT);
		if (*ptep & VPTE_V)
			pmap_inval_pte(ptep, &kernel_pmap, va);
		*ptep = 0;
		--count;
		va += PAGE_SIZE;
	}
}
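
/*
 * Example (illustrative only): temporarily mapping a list of pages for a
 * buffer.  "kva" and "pages[]" are placeholders supplied by the caller:
 *
 *	pmap_qenter(kva, pages, npages);
 *	bcopy((void *)kva, dest, npages * PAGE_SIZE);
 *	pmap_qremove(kva, npages);
 */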
/************************************************************************
 *	  Misc support glue called by machine independent code		*
 ************************************************************************
 *
 * These routines are called by machine independent code to operate on
 * certain machine-dependent aspects of processes, threads, and pmaps.
 */

/*
 * Initialize MD portions of the thread structure.
 */
void
pmap_init_thread(thread_t td)
{
	/* enforce pcb placement */
	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	td->td_savefpu = &td->td_pcb->pcb_save;
	td->td_sp = (char *)td->td_pcb - 16;
}

/*
 * This routine directly affects the fork perf for a process.
 */
void
pmap_init_proc(struct proc *p)
{
}

/*
 * Destroy the UPAGES for a process that has exited and disassociate
 * the process from its thread.
 */
void
pmap_dispose_proc(struct proc *p)
{
	KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p));
}
/*
 * We pre-allocate all page table pages for kernel virtual memory so
 * this routine will only be called if KVM has been exhausted.
 */
void
pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
{
	vm_offset_t addr;

	addr = (kend + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);

	lwkt_gettoken(&vm_token);
	if (addr > virtual_end - SEG_SIZE)
		panic("KVM exhausted");
	kernel_vm_end = addr;
	lwkt_reltoken(&vm_token);
}

/*
 * The modification bit is not tracked for any pages in this range.  XXX
 * such pages in these maps should always use pmap_k*() functions and not
 * be managed anyhow.
 *
 * XXX User and kernel address spaces are independent for virtual kernels,
 * this function only applies to the kernel pmap.
 */
static int
pmap_track_modified(pmap_t pmap, vm_offset_t va)
{
	if (pmap != &kernel_pmap)
		return 1;
	if ((va < clean_sva) || (va >= clean_eva))
		return 1;
	else
		return 0;
}
/************************************************************************
 *		Procedures supporting managed page table pages		*
 ************************************************************************
 *
 * These procedures are used to track managed page table pages.  These pages
 * use the page table page's vm_page_t to track PTEs in the page.  The
 * page table pages themselves are arranged in a VM object, pmap->pm_pteobj.
 *
 * This allows the system to throw away page table pages for user processes
 * at will and reinstantiate them on demand.
 */

/*
 * This routine works like vm_page_lookup() but also blocks as long as the
 * page is busy.  This routine does not busy the page it returns.
 *
 * Unless the caller is managing objects whose pages are in a known state,
 * the call should be made with a critical section held so the page's object
 * association remains valid on return.
 */
static vm_page_t
pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

retry:
	m = vm_page_lookup(object, pindex);
	if (m && vm_page_sleep_busy(m, FALSE, "pplookp"))
		goto retry;
	return(m);
}
/*
 * This routine unholds page table pages, and if the hold count
 * drops to zero, then it decrements the wire count.
 *
 * We must recheck that this is the last hold reference after busy-sleeping
 * on the page.
 */
static int
_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
{
	while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
		;
	KASSERT(m->queue == PQ_NONE,
		("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m));

	if (m->hold_count == 1) {
		/*
		 * Unmap the page table page.
		 */
		KKASSERT(pmap->pm_pdir[m->pindex] != 0);
		pmap_inval_pde(&pmap->pm_pdir[m->pindex], pmap,
			       (vm_offset_t)m->pindex << SEG_SHIFT);
		KKASSERT(pmap->pm_stats.resident_count > 0);
		--pmap->pm_stats.resident_count;

		if (pmap->pm_ptphint == m)
			pmap->pm_ptphint = NULL;

		/*
		 * This was our last hold, the page had better be unwired
		 * after we decrement wire_count.
		 *
		 * FUTURE NOTE: shared page directory page could result in
		 * multiple wire counts.
		 */
		vm_page_unhold(m);
		--m->wire_count;
		KKASSERT(m->wire_count == 0);
		--vmstats.v_wire_count;
		vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
		vm_page_free_zero(m);
		return 1;
	} else {
		KKASSERT(m->hold_count > 1);
		vm_page_unhold(m);
		return 0;
	}
}

static __inline int
pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
{
	KKASSERT(m->hold_count > 0);
	if (m->hold_count > 1) {
		vm_page_unhold(m);
		return 0;
	} else {
		return _pmap_unwire_pte_hold(pmap, m);
	}
}
/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
{
	unsigned ptepindex;

	if (mpte == NULL) {
		/*
		 * page table pages in the kernel_pmap are not managed.
		 */
		if (pmap == &kernel_pmap)
			return(0);
		ptepindex = (va >> PDRSHIFT);
		if (pmap->pm_ptphint &&
		    (pmap->pm_ptphint->pindex == ptepindex)) {
			mpte = pmap->pm_ptphint;
		} else {
			mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
			pmap->pm_ptphint = mpte;
		}
	}
	return pmap_unwire_pte_hold(pmap, mpte);
}
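
/*
 * To recap the page table page lifecycle implemented above: pmap_allocpte()
 * holds the page table page for every pte entered under it,
 * pmap_unuse_pt() drops that hold when a pte is removed, and once the hold
 * count falls to one _pmap_unwire_pte_hold() unmaps the page, unwires it
 * and frees it back to the zero'd page queue.
 */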
1049 * Attempt to release and free the vm_page backing a page directory page
1050 * in a pmap. Returns 1 on success, 0 on failure (if the procedure had
1054 pmap_release_free_page(struct pmap
*pmap
, vm_page_t p
)
1056 vpte_t
*pde
= pmap
->pm_pdir
;
1059 * This code optimizes the case of freeing non-busy
1060 * page-table pages. Those pages are zero now, and
1061 * might as well be placed directly into the zero queue.
1063 if (vm_page_sleep_busy(p
, FALSE
, "pmaprl"))
1067 KKASSERT(pmap
->pm_stats
.resident_count
> 0);
1068 --pmap
->pm_stats
.resident_count
;
1070 if (p
->hold_count
) {
1071 panic("pmap_release: freeing held page table page");
1074 * Page directory pages need to have the kernel stuff cleared, so
1075 * they can go into the zero queue also.
1077 * In virtual kernels there is no 'kernel stuff'. For the moment
1078 * I just make sure the whole thing has been zero'd even though
1079 * it should already be completely zero'd.
1081 * pmaps for vkernels do not self-map because they do not share
1082 * their address space with the vkernel. Clearing of pde[] thus
1083 * only applies to page table pages and not to the page directory
1086 if (p
->pindex
== pmap
->pm_pdindex
) {
1087 bzero(pde
, VPTE_PAGETABLE_SIZE
);
1088 pmap_kremove((vm_offset_t
)pmap
->pm_pdir
);
1090 KKASSERT(pde
[p
->pindex
] != 0);
1091 pmap_inval_pde(&pde
[p
->pindex
], pmap
,
1092 (vm_offset_t
)p
->pindex
<< SEG_SHIFT
);
1096 * Clear the matching hint
1098 if (pmap
->pm_ptphint
&& (pmap
->pm_ptphint
->pindex
== p
->pindex
))
1099 pmap
->pm_ptphint
= NULL
;
1102 * And throw the page away. The page is completely zero'd out so
1103 * optimize the free call.
1106 vmstats
.v_wire_count
--;
1107 vm_page_free_zero(p
);
1112 * This routine is called if the page table page is not mapped in the page
1115 * The routine is broken up into two parts for readability.
1117 * It must return a held mpte and map the page directory page as required.
1118 * Because vm_page_grab() can block, we must re-check pm_pdir[ptepindex]
1121 _pmap_allocpte(pmap_t pmap
, unsigned ptepindex
)
1127 * Find or fabricate a new pagetable page. A busied page will be
1128 * returned. This call may block.
1130 m
= vm_page_grab(pmap
->pm_pteobj
, ptepindex
,
1131 VM_ALLOC_NORMAL
| VM_ALLOC_ZERO
| VM_ALLOC_RETRY
);
1133 KASSERT(m
->queue
== PQ_NONE
,
1134 ("_pmap_allocpte: %p->queue != PQ_NONE", m
));
1137 * Increment the hold count for the page we will be returning to
1143 * It is possible that someone else got in and mapped by the page
1144 * directory page while we were blocked, if so just unbusy and
1145 * return the held page.
1147 if ((ptepa
= pmap
->pm_pdir
[ptepindex
]) != 0) {
1148 KKASSERT((ptepa
& VPTE_FRAME
) == VM_PAGE_TO_PHYS(m
));
1153 if (m
->wire_count
== 0)
1154 vmstats
.v_wire_count
++;
1158 * Map the pagetable page into the process address space, if
1159 * it isn't already there.
1161 ++pmap
->pm_stats
.resident_count
;
1163 ptepa
= VM_PAGE_TO_PHYS(m
);
1164 pmap
->pm_pdir
[ptepindex
] = (vpte_t
)ptepa
| VPTE_R
| VPTE_W
| VPTE_V
|
1168 * We are likely about to access this page table page, so set the
1169 * page table hint to reduce overhead.
1171 pmap
->pm_ptphint
= m
;
1174 * Try to use the new mapping, but if we cannot, then
1175 * do it with the routine that maps the page explicitly.
1177 if ((m
->flags
& PG_ZERO
) == 0)
1178 pmap_zero_page(ptepa
);
1180 m
->valid
= VM_PAGE_BITS_ALL
;
1181 vm_page_flag_clear(m
, PG_ZERO
);
1182 vm_page_flag_set(m
, PG_MAPPED
);
1189 * Determine the page table page required to access the VA in the pmap
1190 * and allocate it if necessary. Return a held vm_page_t for the page.
1192 * Only used with user pmaps.
1195 pmap_allocpte(pmap_t pmap
, vm_offset_t va
)
1202 * Calculate pagetable page index
1204 ptepindex
= va
>> PDRSHIFT
;
1207 * Get the page directory entry
1209 ptepa
= (vm_offset_t
) pmap
->pm_pdir
[ptepindex
];
1212 * This supports switching from a 4MB page to a
1215 if (ptepa
& VPTE_PS
) {
1216 KKASSERT(pmap
->pm_pdir
[ptepindex
] != 0);
1217 pmap_inval_pde(&pmap
->pm_pdir
[ptepindex
], pmap
,
1218 (vm_offset_t
)ptepindex
<< SEG_SHIFT
);
1223 * If the page table page is mapped, we just increment the
1224 * hold count, and activate it.
1228 * In order to get the page table page, try the
1231 if (pmap
->pm_ptphint
&&
1232 (pmap
->pm_ptphint
->pindex
== ptepindex
)) {
1233 m
= pmap
->pm_ptphint
;
1235 m
= pmap_page_lookup( pmap
->pm_pteobj
, ptepindex
);
1236 pmap
->pm_ptphint
= m
;
1242 * Here if the pte page isn't mapped, or if it has been deallocated.
1244 return _pmap_allocpte(pmap
, ptepindex
);
1247 /************************************************************************
1248 * Managed pages in pmaps *
1249 ************************************************************************
1251 * All pages entered into user pmaps and some pages entered into the kernel
1252 * pmap are managed, meaning that pmap_protect() and other related management
1253 * functions work on these pages.
1257 * free the pv_entry back to the free list. This function may be
1258 * called from an interrupt.
1260 static __inline
void
1261 free_pv_entry(pv_entry_t pv
)
1268 * get a new pv_entry, allocating a block from the system
1269 * when needed. This function may be called from an interrupt.
1275 if (pv_entry_high_water
&&
1276 (pv_entry_count
> pv_entry_high_water
) &&
1277 (pmap_pagedaemon_waken
== 0)) {
1278 pmap_pagedaemon_waken
= 1;
1279 wakeup (&vm_pages_needed
);
1281 return zalloc(&pvzone
);
1285 * This routine is very drastic, but can save the system
1295 static int warningdone
=0;
1297 if (pmap_pagedaemon_waken
== 0)
1299 lwkt_gettoken(&vm_token
);
1300 pmap_pagedaemon_waken
= 0;
1302 if (warningdone
< 5) {
1303 kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
1307 for(i
= 0; i
< vm_page_array_size
; i
++) {
1308 m
= &vm_page_array
[i
];
1309 if (m
->wire_count
|| m
->hold_count
|| m
->busy
||
1310 (m
->flags
& PG_BUSY
))
1314 lwkt_reltoken(&vm_token
);
1318 * If it is the first entry on the list, it is actually
1319 * in the header and we must copy the following entry up
1320 * to the header. Otherwise we must search the list for
1321 * the entry. In either case we free the now unused entry.
1324 pmap_remove_entry(struct pmap
*pmap
, vm_page_t m
, vm_offset_t va
)
1330 if (m
->md
.pv_list_count
< pmap
->pm_stats
.resident_count
) {
1331 TAILQ_FOREACH(pv
, &m
->md
.pv_list
, pv_list
) {
1332 if (pmap
== pv
->pv_pmap
&& va
== pv
->pv_va
)
1336 TAILQ_FOREACH(pv
, &pmap
->pm_pvlist
, pv_plist
) {
1337 if (va
== pv
->pv_va
)
1343 * Note that pv_ptem is NULL if the page table page itself is not
1344 * managed, even if the page being removed IS managed.
1348 TAILQ_REMOVE(&m
->md
.pv_list
, pv
, pv_list
);
1349 m
->md
.pv_list_count
--;
1350 m
->object
->agg_pv_list_count
--;
1351 TAILQ_REMOVE(&pmap
->pm_pvlist
, pv
, pv_plist
);
1352 if (TAILQ_EMPTY(&m
->md
.pv_list
))
1353 vm_page_flag_clear(m
, PG_MAPPED
| PG_WRITEABLE
);
1354 ++pmap
->pm_generation
;
1355 rtval
= pmap_unuse_pt(pmap
, va
, pv
->pv_ptem
);
1363 * Create a pv entry for page at pa for (pmap, va). If the page table page
1364 * holding the VA is managed, mpte will be non-NULL.
1367 pmap_insert_entry(pmap_t pmap
, vm_offset_t va
, vm_page_t mpte
, vm_page_t m
)
1372 pv
= get_pv_entry();
1377 TAILQ_INSERT_TAIL(&pmap
->pm_pvlist
, pv
, pv_plist
);
1378 TAILQ_INSERT_TAIL(&m
->md
.pv_list
, pv
, pv_list
);
1379 ++pmap
->pm_generation
;
1380 m
->md
.pv_list_count
++;
1381 m
->object
->agg_pv_list_count
++;
1387 * pmap_remove_pte: do the things to unmap a page in a process
1390 pmap_remove_pte(struct pmap
*pmap
, vpte_t
*ptq
, vm_offset_t va
)
1395 oldpte
= pmap_inval_loadandclear(ptq
, pmap
, va
);
1396 if (oldpte
& VPTE_WIRED
)
1397 --pmap
->pm_stats
.wired_count
;
1398 KKASSERT(pmap
->pm_stats
.wired_count
>= 0);
1402 * Machines that don't support invlpg, also don't support
1403 * VPTE_G. XXX VPTE_G is disabled for SMP so don't worry about
1406 if (oldpte
& VPTE_G
)
1407 madvise((void *)va
, PAGE_SIZE
, MADV_INVAL
);
1409 KKASSERT(pmap
->pm_stats
.resident_count
> 0);
1410 --pmap
->pm_stats
.resident_count
;
1411 if (oldpte
& VPTE_MANAGED
) {
1412 m
= PHYS_TO_VM_PAGE(oldpte
);
1413 if (oldpte
& VPTE_M
) {
1414 #if defined(PMAP_DIAGNOSTIC)
1415 if (pmap_nw_modified((pt_entry_t
) oldpte
)) {
1417 "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
1421 if (pmap_track_modified(pmap
, va
))
1424 if (oldpte
& VPTE_A
)
1425 vm_page_flag_set(m
, PG_REFERENCED
);
1426 return pmap_remove_entry(pmap
, m
, va
);
1428 return pmap_unuse_pt(pmap
, va
, NULL
);
1437 * Remove a single page from a process address space.
1439 * This function may not be called from an interrupt if the pmap is
1443 pmap_remove_page(struct pmap
*pmap
, vm_offset_t va
)
1448 * if there is no pte for this address, just skip it!!! Otherwise
1449 * get a local va for mappings for this pmap and remove the entry.
1451 if (*pmap_pde(pmap
, va
) != 0) {
1452 ptq
= get_ptbase(pmap
, va
);
1454 pmap_remove_pte(pmap
, ptq
, va
);
1460 * Remove the given range of addresses from the specified map.
1462 * It is assumed that the start and end are properly rounded to the
1465 * This function may not be called from an interrupt if the pmap is
1471 pmap_remove(struct pmap
*pmap
, vm_offset_t sva
, vm_offset_t eva
)
1475 vm_offset_t ptpaddr
;
1476 vm_pindex_t sindex
, eindex
;
1481 lwkt_gettoken(&vm_token
);
1482 KKASSERT(pmap
->pm_stats
.resident_count
>= 0);
1483 if (pmap
->pm_stats
.resident_count
== 0) {
1484 lwkt_reltoken(&vm_token
);
1489 * special handling of removing one page. a very
1490 * common operation and easy to short circuit some
1493 if (((sva
+ PAGE_SIZE
) == eva
) &&
1494 ((pmap
->pm_pdir
[(sva
>> PDRSHIFT
)] & VPTE_PS
) == 0)) {
1495 pmap_remove_page(pmap
, sva
);
1496 lwkt_reltoken(&vm_token
);
1501 * Get a local virtual address for the mappings that are being
1504 * XXX this is really messy because the kernel pmap is not relative
1507 sindex
= (sva
>> PAGE_SHIFT
);
1508 eindex
= (eva
>> PAGE_SHIFT
);
1510 for (; sindex
< eindex
; sindex
= pdnxt
) {
1514 * Calculate index for next page table.
1516 pdnxt
= ((sindex
+ NPTEPG
) & ~(NPTEPG
- 1));
1517 if (pmap
->pm_stats
.resident_count
== 0)
1520 pdirindex
= sindex
/ NPDEPG
;
1521 if (((ptpaddr
= pmap
->pm_pdir
[pdirindex
]) & VPTE_PS
) != 0) {
1522 KKASSERT(pmap
->pm_pdir
[pdirindex
] != 0);
1523 pmap
->pm_stats
.resident_count
-= NBPDR
/ PAGE_SIZE
;
1524 pmap_inval_pde(&pmap
->pm_pdir
[pdirindex
], pmap
,
1525 (vm_offset_t
)pdirindex
<< SEG_SHIFT
);
1530 * Weed out invalid mappings. Note: we assume that the page
1531 * directory table is always allocated, and in kernel virtual.
1537 * Limit our scan to either the end of the va represented
1538 * by the current page table page, or to the end of the
1539 * range being removed.
1545 * NOTE: pmap_remove_pte() can block.
1547 for (; sindex
!= pdnxt
; sindex
++) {
1550 ptbase
= get_ptbase(pmap
, sindex
<< PAGE_SHIFT
);
1553 va
= i386_ptob(sindex
);
1554 if (pmap_remove_pte(pmap
, ptbase
, va
))
1558 lwkt_reltoken(&vm_token
);
1562 * Removes this physical page from all physical maps in which it resides.
1563 * Reflects back modify bits to the pager.
1565 * This routine may not be called from an interrupt.
1570 pmap_remove_all(vm_page_t m
)
1575 #if defined(PMAP_DIAGNOSTIC)
1577 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
1580 if (!pmap_initialized
|| (m
->flags
& PG_FICTITIOUS
)) {
1581 panic("pmap_page_protect: illegal for unmanaged page, va: 0x%08llx", (long long)VM_PAGE_TO_PHYS(m
));
1586 lwkt_gettoken(&vm_token
);
1587 while ((pv
= TAILQ_FIRST(&m
->md
.pv_list
)) != NULL
) {
1588 KKASSERT(pv
->pv_pmap
->pm_stats
.resident_count
> 0);
1589 --pv
->pv_pmap
->pm_stats
.resident_count
;
1591 pte
= pmap_pte(pv
->pv_pmap
, pv
->pv_va
);
1592 KKASSERT(pte
!= NULL
);
1594 tpte
= pmap_inval_loadandclear(pte
, pv
->pv_pmap
, pv
->pv_va
);
1595 if (tpte
& VPTE_WIRED
)
1596 --pv
->pv_pmap
->pm_stats
.wired_count
;
1597 KKASSERT(pv
->pv_pmap
->pm_stats
.wired_count
>= 0);
1600 vm_page_flag_set(m
, PG_REFERENCED
);
1603 * Update the vm_page_t clean and reference bits.
1605 if (tpte
& VPTE_M
) {
1606 #if defined(PMAP_DIAGNOSTIC)
1607 if (pmap_nw_modified((pt_entry_t
) tpte
)) {
1609 "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
1613 if (pmap_track_modified(pv
->pv_pmap
, pv
->pv_va
))
1616 TAILQ_REMOVE(&m
->md
.pv_list
, pv
, pv_list
);
1617 TAILQ_REMOVE(&pv
->pv_pmap
->pm_pvlist
, pv
, pv_plist
);
1618 ++pv
->pv_pmap
->pm_generation
;
1619 m
->md
.pv_list_count
--;
1620 m
->object
->agg_pv_list_count
--;
1621 if (TAILQ_EMPTY(&m
->md
.pv_list
))
1622 vm_page_flag_clear(m
, PG_MAPPED
| PG_WRITEABLE
);
1623 pmap_unuse_pt(pv
->pv_pmap
, pv
->pv_va
, pv
->pv_ptem
);
1626 KKASSERT((m
->flags
& (PG_MAPPED
| PG_WRITEABLE
)) == 0);
1627 lwkt_reltoken(&vm_token
);
1632 * Set the physical protection on the specified range of this map
1635 * This function may not be called from an interrupt if the map is
1636 * not the kernel_pmap.
1641 pmap_protect(pmap_t pmap
, vm_offset_t sva
, vm_offset_t eva
, vm_prot_t prot
)
1645 vm_offset_t pdnxt
, ptpaddr
;
1646 vm_pindex_t sindex
, eindex
;
1652 if ((prot
& VM_PROT_READ
) == VM_PROT_NONE
) {
1653 pmap_remove(pmap
, sva
, eva
);
1657 if (prot
& VM_PROT_WRITE
)
1660 lwkt_gettoken(&vm_token
);
1661 ptbase
= get_ptbase(pmap
, sva
);
1663 sindex
= (sva
>> PAGE_SHIFT
);
1664 eindex
= (eva
>> PAGE_SHIFT
);
1667 for (; sindex
< eindex
; sindex
= pdnxt
) {
1671 pdnxt
= ((sindex
+ NPTEPG
) & ~(NPTEPG
- 1));
1673 pdirindex
= sindex
/ NPDEPG
;
1676 * Clear the modified and writable bits for a 4m page.
1677 * Throw away the modified bit (?)
1679 if (((ptpaddr
= pmap
->pm_pdir
[pdirindex
]) & VPTE_PS
) != 0) {
1680 pmap_clean_pde(&pmap
->pm_pdir
[pdirindex
], pmap
,
1681 (vm_offset_t
)pdirindex
<< SEG_SHIFT
);
1682 pmap
->pm_stats
.resident_count
-= NBPDR
/ PAGE_SIZE
;
1687 * Weed out invalid mappings. Note: we assume that the page
1688 * directory table is always allocated, and in kernel virtual.
1693 if (pdnxt
> eindex
) {
1697 for (; sindex
!= pdnxt
; sindex
++) {
1702 * Clean managed pages and also check the accessed
1703 * bit. Just remove write perms for unmanaged
1704 * pages. Be careful of races, turning off write
1705 * access will force a fault rather then setting
1706 * the modified bit at an unexpected time.
1708 ptep
= &ptbase
[sindex
- sbase
];
1709 if (*ptep
& VPTE_MANAGED
) {
1710 pbits
= pmap_clean_pte(ptep
, pmap
,
1713 if (pbits
& VPTE_A
) {
1714 m
= PHYS_TO_VM_PAGE(pbits
);
1715 vm_page_flag_set(m
, PG_REFERENCED
);
1716 atomic_clear_long(ptep
, VPTE_A
);
1718 if (pbits
& VPTE_M
) {
1719 if (pmap_track_modified(pmap
, i386_ptob(sindex
))) {
1721 m
= PHYS_TO_VM_PAGE(pbits
);
1726 pbits
= pmap_setro_pte(ptep
, pmap
,
1731 lwkt_reltoken(&vm_token
);
1735 * Enter a managed page into a pmap. If the page is not wired related pmap
1736 * data can be destroyed at any time for later demand-operation.
1738 * Insert the vm_page (m) at virtual address (v) in (pmap), with the
1739 * specified protection, and wire the mapping if requested.
1741 * NOTE: This routine may not lazy-evaluate or lose information. The
1742 * page must actually be inserted into the given map NOW.
1744 * NOTE: When entering a page at a KVA address, the pmap must be the
1750 pmap_enter(pmap_t pmap
, vm_offset_t va
, vm_page_t m
, vm_prot_t prot
,
1756 vpte_t origpte
, newpte
;
1764 lwkt_gettoken(&vm_token
);
1767 * Get the page table page. The kernel_pmap's page table pages
1768 * are preallocated and have no associated vm_page_t.
1770 if (pmap
== &kernel_pmap
)
1773 mpte
= pmap_allocpte(pmap
, va
);
1775 pte
= pmap_pte(pmap
, va
);
1778 * Page Directory table entry not valid, we need a new PT page
1779 * and pmap_allocpte() didn't give us one. Oops!
1782 panic("pmap_enter: invalid page directory pmap=%p, va=0x%p\n",
1787 * Deal with races on the original mapping (though don't worry
1788 * about VPTE_A races) by cleaning it. This will force a fault
1789 * if an attempt is made to write to the page.
1791 pa
= VM_PAGE_TO_PHYS(m
) & VPTE_FRAME
;
1792 origpte
= pmap_clean_pte(pte
, pmap
, va
);
1793 opa
= origpte
& VPTE_FRAME
;
1795 if (origpte
& VPTE_PS
)
1796 panic("pmap_enter: attempted pmap_enter on 4MB page");
1799 * Mapping has not changed, must be protection or wiring change.
1801 if (origpte
&& (opa
== pa
)) {
1803 * Wiring change, just update stats. We don't worry about
1804 * wiring PT pages as they remain resident as long as there
1805 * are valid mappings in them. Hence, if a user page is wired,
1806 * the PT page will be also.
1808 if (wired
&& ((origpte
& VPTE_WIRED
) == 0))
1809 ++pmap
->pm_stats
.wired_count
;
1810 else if (!wired
&& (origpte
& VPTE_WIRED
))
1811 --pmap
->pm_stats
.wired_count
;
1812 KKASSERT(pmap
->pm_stats
.wired_count
>= 0);
1815 * Remove the extra pte reference. Note that we cannot
1816 * optimize the RO->RW case because we have adjusted the
1817 * wiring count above and may need to adjust the wiring
1824 * We might be turning off write access to the page,
1825 * so we go ahead and sense modify status.
1827 if (origpte
& VPTE_MANAGED
) {
1828 if ((origpte
& VPTE_M
) &&
1829 pmap_track_modified(pmap
, va
)) {
1831 om
= PHYS_TO_VM_PAGE(opa
);
1835 KKASSERT(m
->flags
& PG_MAPPED
);
1840 * Mapping has changed, invalidate old range and fall through to
1841 * handle validating new mapping.
1845 err
= pmap_remove_pte(pmap
, pte
, va
);
1847 panic("pmap_enter: pte vanished, va: %p", (void *)va
);
1848 pte
= pmap_pte(pmap
, va
);
1849 origpte
= pmap_clean_pte(pte
, pmap
, va
);
1850 opa
= origpte
& VPTE_FRAME
;
1852 kprintf("pmap_enter: Warning, raced pmap %p va %p\n",
1858 * Enter on the PV list if part of our managed memory. Note that we
1859 * raise IPL while manipulating pv_table since pmap_enter can be
1860 * called at interrupt time.
1862 if (pmap_initialized
&&
1863 (m
->flags
& (PG_FICTITIOUS
|PG_UNMANAGED
)) == 0) {
1864 pmap_insert_entry(pmap
, va
, mpte
, m
);
1866 vm_page_flag_set(m
, PG_MAPPED
);
1870 * Increment counters
1872 ++pmap
->pm_stats
.resident_count
;
1874 pmap
->pm_stats
.wired_count
++;
1878 * Now validate mapping with desired protection/wiring.
1880 newpte
= (vm_offset_t
) (pa
| pte_prot(pmap
, prot
) | VPTE_V
);
1883 newpte
|= VPTE_WIRED
;
1884 if (pmap
!= &kernel_pmap
)
1888 * If the mapping or permission bits are different from the
1889 * (now cleaned) original pte, an update is needed. We've
1890 * already downgraded or invalidated the page so all we have
1891 * to do now is update the bits.
1893 * XXX should we synchronize RO->RW changes to avoid another
1896 if ((origpte
& ~(VPTE_W
|VPTE_M
|VPTE_A
)) != newpte
) {
1897 *pte
= newpte
| VPTE_A
;
1898 if (newpte
& VPTE_W
)
1899 vm_page_flag_set(m
, PG_WRITEABLE
);
1901 KKASSERT((newpte
& VPTE_MANAGED
) == 0 || m
->flags
& PG_MAPPED
);
1902 lwkt_reltoken(&vm_token
);
1906 * This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
1908 * Currently this routine may only be used on user pmaps, not kernel_pmap.
1911 pmap_enter_quick(pmap_t pmap
, vm_offset_t va
, vm_page_t m
)
1919 KKASSERT(pmap
!= &kernel_pmap
);
1921 KKASSERT(va
>= VM_MIN_USER_ADDRESS
&& va
< VM_MAX_USER_ADDRESS
);
1924 * Calculate pagetable page (mpte), allocating it if necessary.
1926 * A held page table page (mpte), or NULL, is passed onto the
1927 * section following.
1929 ptepindex
= va
>> PDRSHIFT
;
1931 lwkt_gettoken(&vm_token
);
1935 * Get the page directory entry
1937 ptepa
= (vm_offset_t
) pmap
->pm_pdir
[ptepindex
];
1940 * If the page table page is mapped, we just increment
1941 * the hold count, and activate it.
1944 if (ptepa
& VPTE_PS
)
1945 panic("pmap_enter_quick: unexpected mapping into 4MB page");
1946 if (pmap
->pm_ptphint
&&
1947 (pmap
->pm_ptphint
->pindex
== ptepindex
)) {
1948 mpte
= pmap
->pm_ptphint
;
1950 mpte
= pmap_page_lookup( pmap
->pm_pteobj
, ptepindex
);
1951 pmap
->pm_ptphint
= mpte
;
1956 mpte
= _pmap_allocpte(pmap
, ptepindex
);
1958 } while (mpte
== NULL
);
1961 * Ok, now that the page table page has been validated, get the pte.
1962 * If the pte is already mapped undo mpte's hold_count and
1965 pte
= pmap_pte(pmap
, va
);
1967 pmap_unwire_pte_hold(pmap
, mpte
);
1968 lwkt_reltoken(&vm_token
);
1973 * Enter on the PV list if part of our managed memory. Note that we
1974 * raise IPL while manipulating pv_table since pmap_enter can be
1975 * called at interrupt time.
1977 if ((m
->flags
& (PG_FICTITIOUS
|PG_UNMANAGED
)) == 0) {
1978 pmap_insert_entry(pmap
, va
, mpte
, m
);
1979 vm_page_flag_set(m
, PG_MAPPED
);
1983 * Increment counters
1985 ++pmap
->pm_stats
.resident_count
;
1987 pa
= VM_PAGE_TO_PHYS(m
);
1990 * Now validate mapping with RO protection
1992 if (m
->flags
& (PG_FICTITIOUS
|PG_UNMANAGED
))
1993 *pte
= (vpte_t
)pa
| VPTE_V
| VPTE_U
;
1995 *pte
= (vpte_t
)pa
| VPTE_V
| VPTE_U
| VPTE_MANAGED
;
1996 /*pmap_inval_add(&info, pmap, va); shouldn't be needed 0->valid */
1997 /*pmap_inval_flush(&info); don't need for vkernel */
1998 lwkt_reltoken(&vm_token
);
2002 * Extract the physical address for the translation at the specified
2003 * virtual address in the pmap.
2005 * The caller must hold vm_token if non-blocking operation is desired.
2009 pmap_extract(pmap_t pmap
, vm_offset_t va
)
2014 lwkt_gettoken(&vm_token
);
2015 if (pmap
&& (pte
= pmap
->pm_pdir
[va
>> SEG_SHIFT
]) != 0) {
2016 if (pte
& VPTE_PS
) {
2017 rtval
= pte
& ~((vpte_t
)(1 << SEG_SHIFT
) - 1);
2018 rtval
|= va
& SEG_MASK
;
2020 pte
= *get_ptbase(pmap
, va
);
2021 rtval
= (pte
& VPTE_FRAME
) | (va
& PAGE_MASK
);
2026 lwkt_reltoken(&vm_token
);
2030 #define MAX_INIT_PT (96)
2033 * This routine preloads the ptes for a given object into the specified pmap.
2034 * This eliminates the blast of soft faults on process startup and
2035 * immediately after an mmap.
2039 static int pmap_object_init_pt_callback(vm_page_t p
, void *data
);
2042 pmap_object_init_pt(pmap_t pmap
, vm_offset_t addr
, vm_prot_t prot
,
2043 vm_object_t object
, vm_pindex_t pindex
,
2044 vm_size_t size
, int limit
)
2046 struct rb_vm_page_scan_info info
;
2051 * We can't preinit if read access isn't set or there is no pmap
2054 if ((prot
& VM_PROT_READ
) == 0 || pmap
== NULL
|| object
== NULL
)
2058 * We can't preinit if the pmap is not the current pmap
2060 lp
= curthread
->td_lwp
;
2061 if (lp
== NULL
|| pmap
!= vmspace_pmap(lp
->lwp_vmspace
))
2064 psize
= size
>> PAGE_SHIFT
;
2066 if ((object
->type
!= OBJT_VNODE
) ||
2067 ((limit
& MAP_PREFAULT_PARTIAL
) && (psize
> MAX_INIT_PT
) &&
2068 (object
->resident_page_count
> MAX_INIT_PT
))) {
2072 if (psize
+ pindex
> object
->size
) {
2073 if (object
->size
< pindex
)
2075 psize
= object
->size
- pindex
;
2082 * Use a red-black scan to traverse the requested range and load
2083 * any valid pages found into the pmap.
2085 * We cannot safely scan the object's memq unless we are in a
2086 * critical section since interrupts can remove pages from objects.
2088 info
.start_pindex
= pindex
;
2089 info
.end_pindex
= pindex
+ psize
- 1;
2096 lwkt_gettoken(&vm_token
);
2097 vm_page_rb_tree_RB_SCAN(&object
->rb_memq
, rb_vm_page_scancmp
,
2098 pmap_object_init_pt_callback
, &info
);
2099 lwkt_reltoken(&vm_token
);
2104 * The caller must hold vm_token.
2108 pmap_object_init_pt_callback(vm_page_t p
, void *data
)
2110 struct rb_vm_page_scan_info
*info
= data
;
2111 vm_pindex_t rel_index
;
2113 * don't allow an madvise to blow away our really
2114 * free pages allocating pv entries.
2116 if ((info
->limit
& MAP_PREFAULT_MADVISE
) &&
2117 vmstats
.v_free_count
< vmstats
.v_free_reserved
) {
2120 if (((p
->valid
& VM_PAGE_BITS_ALL
) == VM_PAGE_BITS_ALL
) &&
2121 (p
->busy
== 0) && (p
->flags
& (PG_BUSY
| PG_FICTITIOUS
)) == 0) {
2122 if ((p
->queue
- p
->pc
) == PQ_CACHE
)
2123 vm_page_deactivate(p
);
2125 rel_index
= p
->pindex
- info
->start_pindex
;
2126 pmap_enter_quick(info
->pmap
,
2127 info
->addr
+ i386_ptob(rel_index
), p
);
2134 * Return TRUE if the pmap is in shape to trivially
2135 * pre-fault the specified address.
2137 * Returns FALSE if it would be non-trivial or if a
2138 * pte is already loaded into the slot.
2143 pmap_prefault_ok(pmap_t pmap
, vm_offset_t addr
)
2148 lwkt_gettoken(&vm_token
);
2149 if ((*pmap_pde(pmap
, addr
)) == 0) {
2152 pte
= get_ptbase(pmap
, addr
);
2153 ret
= (*pte
) ? 0 : 1;
2155 lwkt_reltoken(&vm_token
);
2160 * Change the wiring attribute for a map/virtual-address pair.
2161 * The mapping must already exist in the pmap.
2163 * No other requirements.
2166 pmap_change_wiring(pmap_t pmap
, vm_offset_t va
, boolean_t wired
)
2173 lwkt_gettoken(&vm_token
);
2174 pte
= get_ptbase(pmap
, va
);
2176 if (wired
&& (*pte
& VPTE_WIRED
) == 0)
2177 ++pmap
->pm_stats
.wired_count
;
2178 else if (!wired
&& (*pte
& VPTE_WIRED
))
2179 --pmap
->pm_stats
.wired_count
;
2180 KKASSERT(pmap
->pm_stats
.wired_count
>= 0);
2183 * Wiring is not a hardware characteristic so there is no need to
2184 * invalidate TLB. However, in an SMP environment we must use
2185 * a locked bus cycle to update the pte (if we are not using
2186 * the pmap_inval_*() API that is)... it's ok to do this for simple
2190 atomic_set_long(pte
, VPTE_WIRED
);
2192 atomic_clear_long(pte
, VPTE_WIRED
);
2193 lwkt_reltoken(&vm_token
);
2197 * Copy the range specified by src_addr/len
2198 * from the source map to the range dst_addr/len
2199 * in the destination map.
2201 * This routine is only advisory and need not do anything.
2204 pmap_copy(pmap_t dst_pmap
, pmap_t src_pmap
, vm_offset_t dst_addr
,
2205 vm_size_t len
, vm_offset_t src_addr
)
2208 vm_offset_t end_addr
= src_addr
+ len
;
2215 * XXX BUGGY. Amoung other things srcmpte is assumed to remain
2216 * valid through blocking calls, and that's just not going to
2223 if (dst_addr
!= src_addr
)
2225 if (dst_pmap
->pm_pdir
== NULL
)
2227 if (src_pmap
->pm_pdir
== NULL
)
2232 src_frame
= get_ptbase1(src_pmap
, src_addr
);
2233 dst_frame
= get_ptbase2(dst_pmap
, src_addr
);
2236 * critical section protection is required to maintain the page/object
2237 * association, interrupts can free pages and remove them from
2240 for (addr
= src_addr
; addr
< end_addr
; addr
= pdnxt
) {
2241 vpte_t
*src_pte
, *dst_pte
;
2242 vm_page_t dstmpte
, srcmpte
;
2243 vm_offset_t srcptepaddr
;
2246 if (addr
>= VM_MAX_USER_ADDRESS
)
2247 panic("pmap_copy: invalid to pmap_copy page tables\n");
2250 * Don't let optional prefaulting of pages make us go
2251 * way below the low water mark of free pages or way
2252 * above high water mark of used pv entries.
2254 if (vmstats
.v_free_count
< vmstats
.v_free_reserved
||
2255 pv_entry_count
> pv_entry_high_water
)
2258 pdnxt
= ((addr
+ PAGE_SIZE
*NPTEPG
) & ~(PAGE_SIZE
*NPTEPG
- 1));
2259 ptepindex
= addr
>> PDRSHIFT
;
2261 srcptepaddr
= (vm_offset_t
) src_pmap
->pm_pdir
[ptepindex
];
2262 if (srcptepaddr
== 0)
2265 if (srcptepaddr
& VPTE_PS
) {
2266 if (dst_pmap
->pm_pdir
[ptepindex
] == 0) {
2267 dst_pmap
->pm_pdir
[ptepindex
] = (vpte_t
)srcptepaddr
;
2268 dst_pmap
->pm_stats
.resident_count
+= NBPDR
/ PAGE_SIZE
;
2273 srcmpte
= vm_page_lookup(src_pmap
->pm_pteobj
, ptepindex
);
2274 if ((srcmpte
== NULL
) || (srcmpte
->hold_count
== 0) ||
2275 (srcmpte
->flags
& PG_BUSY
)) {
2279 if (pdnxt
> end_addr
)
2282 src_pte
= src_frame
+ ((addr
- src_addr
) >> PAGE_SHIFT
);
2283 dst_pte
= dst_frame
+ ((addr
- src_addr
) >> PAGE_SHIFT
);
2284 while (addr
< pdnxt
) {
2289 * we only virtual copy managed pages
2291 if ((ptetemp
& VPTE_MANAGED
) != 0) {
2293 * We have to check after allocpte for the
2294 * pte still being around... allocpte can
2297 * pmap_allocpte can block, unfortunately
2298 * we have to reload the tables.
2300 dstmpte
= pmap_allocpte(dst_pmap
, addr
);
2301 src_frame
= get_ptbase1(src_pmap
, src_addr
);
2302 dst_frame
= get_ptbase2(dst_pmap
, src_addr
);
2304 if ((*dst_pte
== 0) && (ptetemp
= *src_pte
) &&
2305 (ptetemp
& VPTE_MANAGED
) != 0) {
2307 * Clear the modified and accessed
2308 * (referenced) bits during the copy.
2310 * We do not have to clear the write
2311 * bit to force a fault-on-modify
2312 * because the real kernel's target
2313 * pmap is empty and will fault anyway.
2315 m
= PHYS_TO_VM_PAGE(ptetemp
);
2316 *dst_pte
= ptetemp
& ~(VPTE_M
| VPTE_A
);
2317 ++dst_pmap
->pm_stats
.resident_count
;
2318 pmap_insert_entry(dst_pmap
, addr
,
2320 KKASSERT(m
->flags
& PG_MAPPED
);
2322 pmap_unwire_pte_hold(dst_pmap
, dstmpte
);
2324 if (dstmpte
->hold_count
>= srcmpte
->hold_count
)
2338 * Zero the specified PA by mapping the page into KVM and clearing its
2341 * This function may be called from an interrupt and no locking is
2345 pmap_zero_page(vm_paddr_t phys
)
2347 struct mdglobaldata
*gd
= mdcpu
;
2351 panic("pmap_zero_page: CMAP3 busy");
2352 *gd
->gd_CMAP3
= VPTE_V
| VPTE_R
| VPTE_W
| (phys
& VPTE_FRAME
) | VPTE_A
| VPTE_M
;
2353 madvise(gd
->gd_CADDR3
, PAGE_SIZE
, MADV_INVAL
);
2355 bzero(gd
->gd_CADDR3
, PAGE_SIZE
);
2361 * pmap_page_assertzero:
2363 * Assert that a page is empty, panic if it isn't.
2366 pmap_page_assertzero(vm_paddr_t phys
)
2368 struct mdglobaldata
*gd
= mdcpu
;
2373 panic("pmap_zero_page: CMAP3 busy");
2374 *gd
->gd_CMAP3
= VPTE_V
| VPTE_R
| VPTE_W
|
2375 (phys
& VPTE_FRAME
) | VPTE_A
| VPTE_M
;
2376 madvise(gd
->gd_CADDR3
, PAGE_SIZE
, MADV_INVAL
);
2377 for (i
= 0; i
< PAGE_SIZE
; i
+= 4) {
2378 if (*(int *)((char *)gd
->gd_CADDR3
+ i
) != 0) {
2379 panic("pmap_page_assertzero() @ %p not zero!\n",
2380 (void *)gd
->gd_CADDR3
);
2390 * Zero part of a physical page by mapping it into memory and clearing
2391 * its contents with bzero.
2393 * off and size may not cover an area beyond a single hardware page.
2396 pmap_zero_page_area(vm_paddr_t phys
, int off
, int size
)
2398 struct mdglobaldata
*gd
= mdcpu
;
2402 panic("pmap_zero_page: CMAP3 busy");
2403 *gd
->gd_CMAP3
= VPTE_V
| VPTE_R
| VPTE_W
|
2404 (phys
& VPTE_FRAME
) | VPTE_A
| VPTE_M
;
2405 madvise(gd
->gd_CADDR3
, PAGE_SIZE
, MADV_INVAL
);
2407 bzero((char *)gd
->gd_CADDR3
+ off
, size
);
/*
 * Copy the physical page from the source PA to the target PA.
 * This function may be called from an interrupt.  No locking
 * is required.
 */
void
pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
{
	struct mdglobaldata *gd = mdcpu;

	if (*(int *) gd->gd_CMAP1)
		panic("pmap_copy_page: CMAP1 busy");
	if (*(int *) gd->gd_CMAP2)
		panic("pmap_copy_page: CMAP2 busy");

	*(int *) gd->gd_CMAP1 = VPTE_V | VPTE_R | (src & PG_FRAME) | VPTE_A;
	*(int *) gd->gd_CMAP2 = VPTE_V | VPTE_R | VPTE_W |
				(dst & VPTE_FRAME) | VPTE_A | VPTE_M;

	madvise(gd->gd_CADDR1, PAGE_SIZE, MADV_INVAL);
	madvise(gd->gd_CADDR2, PAGE_SIZE, MADV_INVAL);

	bcopy(gd->gd_CADDR1, gd->gd_CADDR2, PAGE_SIZE);

	*(int *) gd->gd_CMAP1 = 0;
	*(int *) gd->gd_CMAP2 = 0;
}
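
/*
 * Illustrative sketch (not part of the original file): copying the contents
 * of one vm_page_t to another, as a copy-on-write style handler might.  The
 * helper name "example_copy_vm_page" is hypothetical; the block is compiled
 * out so it cannot affect the build.
 */
#if 0
static void
example_copy_vm_page(vm_page_t src_m, vm_page_t dst_m)
{
	/* Both pages are addressed by physical address, not by KVA. */
	pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dst_m));
}
#endif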
/*
 * pmap_copy_page_frag:
 *
 *	Copy the physical page from the source PA to the target PA.
 *	This function may be called from an interrupt.  No locking
 *	is required.
 */
void
pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
{
	struct mdglobaldata *gd = mdcpu;

	if (*(int *) gd->gd_CMAP1)
		panic("pmap_copy_page_frag: CMAP1 busy");
	if (*(int *) gd->gd_CMAP2)
		panic("pmap_copy_page_frag: CMAP2 busy");

	*(int *) gd->gd_CMAP1 = VPTE_V | (src & VPTE_FRAME) | VPTE_A;
	*(int *) gd->gd_CMAP2 = VPTE_V | VPTE_R | VPTE_W |
				(dst & VPTE_FRAME) | VPTE_A | VPTE_M;

	madvise(gd->gd_CADDR1, PAGE_SIZE, MADV_INVAL);
	madvise(gd->gd_CADDR2, PAGE_SIZE, MADV_INVAL);

	bcopy((char *)gd->gd_CADDR1 + (src & PAGE_MASK),
	      (char *)gd->gd_CADDR2 + (dst & PAGE_MASK),
	      bytes);

	*(int *) gd->gd_CMAP1 = 0;
	*(int *) gd->gd_CMAP2 = 0;
}
/*
 * Returns true if the pmap's pv is one of the first
 * 16 pvs linked to from this page.  This count may
 * be changed upwards or downwards in the future; it
 * is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 */
boolean_t
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
	pv_entry_t pv;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return FALSE;

	lwkt_gettoken(&vm_token);

	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		if (pv->pv_pmap == pmap) {
			lwkt_reltoken(&vm_token);
			return TRUE;
		}
	}
	lwkt_reltoken(&vm_token);
	return FALSE;
}
/*
 * Remove all pages from specified address space
 * this aids process exit speeds.  Also, this code
 * is special cased for current process only, but
 * can have the more generic (and slightly slower)
 * mode enabled.  This is much faster than pmap_remove
 * in the case of running down an entire address space.
 */
void
pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vpte_t *pte, tpte;
	pv_entry_t pv, npv;
	vm_page_t m;
	int32_t save_generation;

	lwkt_gettoken(&vm_token);
	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
		if (pv->pv_va >= eva || pv->pv_va < sva) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}

		KKASSERT(pmap == pv->pv_pmap);

		pte = pmap_pte(pmap, pv->pv_va);

		/*
		 * We cannot remove wired pages from a process' mapping
		 * at this time
		 */
		if (*pte & VPTE_WIRED) {
			npv = TAILQ_NEXT(pv, pv_plist);
			continue;
		}
		tpte = pmap_inval_loadandclear(pte, pmap, pv->pv_va);

		m = PHYS_TO_VM_PAGE(tpte);

		KASSERT(m < &vm_page_array[vm_page_array_size],
			("pmap_remove_pages: bad tpte %lx", tpte));

		KKASSERT(pmap->pm_stats.resident_count > 0);
		--pmap->pm_stats.resident_count;

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (tpte & VPTE_M) {
			vm_page_dirty(m);
		}

		npv = TAILQ_NEXT(pv, pv_plist);
		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
		save_generation = ++pmap->pm_generation;

		m->md.pv_list_count--;
		m->object->agg_pv_list_count--;
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		if (TAILQ_FIRST(&m->md.pv_list) == NULL)
			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);

		pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
		free_pv_entry(pv);

		/*
		 * Restart the scan if we blocked during the unuse or free
		 * calls and other removals were made.
		 */
		if (save_generation != pmap->pm_generation) {
			kprintf("Warning: pmap_remove_pages race-A avoided\n");
			npv = TAILQ_FIRST(&pmap->pm_pvlist);
		}
	}
	lwkt_reltoken(&vm_token);
}
/*
 * pmap_testbit tests bits in active mappings of a VM page.
 *
 * The caller must hold vm_token
 */
static boolean_t
pmap_testbit(vm_page_t m, int bit)
{
	pv_entry_t pv;
	vpte_t *pte;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return FALSE;

	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
		return FALSE;

	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		/*
		 * if the bit being tested is the modified bit, then
		 * mark clean_map and ptes as never
		 * modified.
		 */
		if (bit & (VPTE_A|VPTE_M)) {
			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (pv->pv_pmap == NULL) {
			kprintf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
			continue;
		}
#endif
		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
		if (*pte & bit)
			return TRUE;
	}
	return FALSE;
}
/*
 * This routine is used to clear bits in ptes.  Certain bits require special
 * handling, in particular (on virtual kernels) the VPTE_M (modify) bit.
 *
 * This routine is only called with certain VPTE_* bit combinations.
 *
 * The caller must hold vm_token
 */
static __inline void
pmap_clearbit(vm_page_t m, int bit)
{
	pv_entry_t pv;
	vpte_t *pte;
	vpte_t pbits;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return;

	/*
	 * Loop over all current mappings setting/clearing as appropos If
	 * setting RO do we need to clear the VAC?
	 */
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		/*
		 * don't write protect pager mappings
		 */
		if (bit == VPTE_W) {
			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
				continue;
		}

#if defined(PMAP_DIAGNOSTIC)
		if (pv->pv_pmap == NULL) {
			kprintf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
			continue;
		}
#endif

		/*
		 * Careful here.  We can use a locked bus instruction to
		 * clear VPTE_A or VPTE_M safely but we need to synchronize
		 * with the target cpus when we mess with VPTE_W.
		 *
		 * On virtual kernels we must force a new fault-on-write
		 * in the real kernel if we clear the Modify bit ourselves,
		 * otherwise the real kernel will not get a new fault and
		 * will never set our Modify bit again.
		 */
		pte = pmap_pte(pv->pv_pmap, pv->pv_va);

		if (bit == VPTE_W) {
			/*
			 * We must also clear VPTE_M when clearing
			 * VPTE_W.
			 */
			pbits = pmap_clean_pte(pte, pv->pv_pmap,
					       pv->pv_va);
			if (pbits & VPTE_M)
				vm_page_dirty(m);
		} else if (bit == VPTE_M) {
			/*
			 * We do not have to make the page read-only
			 * when clearing the Modify bit.  The real
			 * kernel will make the real PTE read-only
			 * or otherwise detect the write and set
			 * our VPTE_M again simply by us invalidating
			 * the real kernel VA for the pmap (as we did
			 * above).  This allows the real kernel to
			 * handle the write fault without forwarding
			 * the fault to us.
			 */
			atomic_clear_long(pte, VPTE_M);
		} else if ((bit & (VPTE_W|VPTE_M)) == (VPTE_W|VPTE_M)) {
			/*
			 * We've been asked to clear W & M, I guess
			 * the caller doesn't want us to update
			 * the dirty status of the VM page.
			 */
			pmap_clean_pte(pte, pv->pv_pmap, pv->pv_va);
		} else {
			/*
			 * We've been asked to clear bits that do
			 * not interact with hardware.
			 */
			atomic_clear_long(pte, bit);
		}
	}
}
/*
 * Lower the permission for all mappings to a given page.
 */
void
pmap_page_protect(vm_page_t m, vm_prot_t prot)
{
	if ((prot & VM_PROT_WRITE) == 0) {
		lwkt_gettoken(&vm_token);
		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
			pmap_clearbit(m, VPTE_W);
			vm_page_flag_clear(m, PG_WRITEABLE);
		} else {
			pmap_remove_all(m);
		}
		lwkt_reltoken(&vm_token);
	}
}
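
/*
 * Illustrative sketch (not part of the original file): lowering permissions
 * on a page, e.g. before laundering it, either by revoking write access
 * while keeping read/execute mappings, or by removing all mappings.  The
 * helper name "example_protect_page" is hypothetical; the block is compiled
 * out so it cannot affect the build.
 */
#if 0
static void
example_protect_page(vm_page_t m)
{
	/* Downgrade every mapping of m to read-only. */
	pmap_page_protect(m, VM_PROT_READ);

	/* Remove every mapping of m entirely. */
	pmap_page_protect(m, VM_PROT_NONE);
}
#endif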
vm_paddr_t
pmap_phys_address(vm_pindex_t ppn)
{
	return (i386_ptob(ppn));
}
/*
 * Return a count of reference bits for a page, clearing those bits.
 * It is not necessary for every reference bit to be cleared, but it
 * is necessary that 0 only be returned when there are truly no
 * reference bits set.
 *
 * XXX: The exact number of bits to check and clear is a matter that
 * should be tested and standardized at some point in the future for
 * optimal aging of shared pages.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	pv_entry_t pv, pvf, pvn;
	vpte_t *pte;
	int rtval = 0;

	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
		return (rtval);

	lwkt_gettoken(&vm_token);

	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pvf = pv;
		do {
			pvn = TAILQ_NEXT(pv, pv_list);

			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);

			if (!pmap_track_modified(pv->pv_pmap, pv->pv_va))
				continue;

			pte = pmap_pte(pv->pv_pmap, pv->pv_va);

			if (pte && (*pte & VPTE_A)) {
#ifdef SMP
				atomic_clear_long(pte, VPTE_A);
#else
				atomic_clear_long_nonlocked(pte, VPTE_A);
#endif
				rtval++;
			}
		} while ((pv = pvn) != NULL && pv != pvf);
	}
	lwkt_reltoken(&vm_token);

	return (rtval);
}
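
/*
 * Illustrative sketch (not part of the original file): how a page-aging
 * scan might consult the reference bits.  A non-zero return means at least
 * one mapping referenced the page since the last scan; the call clears the
 * VPTE_A bits it samples as a side effect.  The helper name
 * "example_page_was_referenced" is hypothetical; the block is compiled out.
 */
#if 0
static int
example_page_was_referenced(vm_page_t m)
{
	return (pmap_ts_referenced(m) != 0);
}
#endif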
/*
 * Return whether or not the specified physical page was modified
 * in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{
	boolean_t res;

	lwkt_gettoken(&vm_token);
	res = pmap_testbit(m, VPTE_M);
	lwkt_reltoken(&vm_token);
	return (res);
}
/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	lwkt_gettoken(&vm_token);
	pmap_clearbit(m, VPTE_M);
	lwkt_reltoken(&vm_token);
}
/*
 * Clear the reference bit on the specified physical page.
 */
void
pmap_clear_reference(vm_page_t m)
{
	lwkt_gettoken(&vm_token);
	pmap_clearbit(m, VPTE_A);
	lwkt_reltoken(&vm_token);
}
/*
 * Miscellaneous support routines follow
 */
static void
i386_protection_init(void)
{
	int *kp, prot;

	kp = protection_codes;
	for (prot = 0; prot < 8; prot++) {
		if (prot & VM_PROT_READ)
			;
		if (prot & VM_PROT_WRITE)
			;
		if (prot & VM_PROT_EXECUTE)
			;
	}
}
/*
 * Map a set of physical memory pages into the kernel virtual
 * address space.  Return a pointer to where it is mapped.  This
 * routine is intended to be used for mapping device memory.
 *
 * NOTE: we can't use pgeflag unless we invalidate the pages one at
 * a time.
 */
void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{
	vm_offset_t va, tmpva, offset;
	vpte_t *pte;

	offset = pa & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);

	va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
	if (va == 0)
		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");

	pa = pa & VPTE_FRAME;
	for (tmpva = va; size > 0;) {
		pte = KernelPTA + (tmpva >> PAGE_SHIFT);
		*pte = pa | VPTE_R | VPTE_W | VPTE_V; /* | pgeflag; */
		size -= PAGE_SIZE;
		tmpva += PAGE_SIZE;
		pa += PAGE_SIZE;
	}

	return ((void *)(va + offset));
}
void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
	vm_offset_t base, offset;

	base = va & VPTE_FRAME;
	offset = va & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);
	pmap_qremove(va, size >> PAGE_SHIFT);
	kmem_free(&kernel_map, base, size);
}
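
/*
 * Illustrative sketch (not part of the original file): mapping a device's
 * physical register window into KVA, using it, and tearing the mapping
 * down again.  The helper name "example_map_regs" and its argument are
 * hypothetical; the block is compiled out so it cannot affect the build.
 */
#if 0
static void
example_map_regs(vm_paddr_t regpa)
{
	void *va;

	/* Map one page worth of device registers. */
	va = pmap_mapdev(regpa, PAGE_SIZE);

	/* ... access the registers through va ... */

	/* Release the KVA and the mapping. */
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
}
#endif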
/*
 * Perform the pmap work for mincore
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr)
{
	vpte_t *ptep, pte;
	vm_page_t m;
	int val = 0;

	lwkt_gettoken(&vm_token);

	ptep = pmap_pte(pmap, addr);
	if (ptep == NULL) {
		lwkt_reltoken(&vm_token);
		return 0;
	}

	if ((pte = *ptep) != 0) {
		vm_paddr_t pa;

		val = MINCORE_INCORE;
		if ((pte & VPTE_MANAGED) == 0)
			goto done;

		pa = pte & VPTE_FRAME;

		m = PHYS_TO_VM_PAGE(pa);

		if (pte & VPTE_M)
			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
		/*
		 * Modified by someone
		 */
		else if (m->dirty || pmap_is_modified(m))
			val |= MINCORE_MODIFIED_OTHER;

		if (pte & VPTE_A)
			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
		/*
		 * Referenced by someone
		 */
		else if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) {
			val |= MINCORE_REFERENCED_OTHER;
			vm_page_flag_set(m, PG_REFERENCED);
		}
	}
done:
	lwkt_reltoken(&vm_token);
	return val;
}
void
pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
{
	struct vmspace *oldvm;
	struct lwp *lp;

	oldvm = p->p_vmspace;
	if (oldvm != newvm) {
		p->p_vmspace = newvm;
		KKASSERT(p->p_nthreads == 1);
		lp = RB_ROOT(&p->p_lwp_tree);
		pmap_setlwpvm(lp, newvm);
		if (adjrefs) {
			sysref_get(&newvm->vm_sysref);
			sysref_put(&oldvm->vm_sysref);
		}
	}
}
void
pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
{
	struct vmspace *oldvm;
	struct pmap *pmap;

	oldvm = lp->lwp_vmspace;
	if (oldvm != newvm) {
		lp->lwp_vmspace = newvm;
		if (curthread->td_lwp == lp) {
			pmap = vmspace_pmap(newvm);
#if defined(SMP)
			atomic_set_int(&pmap->pm_active, mycpu->gd_cpumask);
#else
			pmap->pm_active |= 1;
#endif
#if defined(SWTCH_OPTIM_STATS)
#endif
			pmap = vmspace_pmap(oldvm);
#if defined(SMP)
			atomic_clear_int(&pmap->pm_active, mycpu->gd_cpumask);
#else
			pmap->pm_active &= ~1;
#endif
		}
	}
}
vm_offset_t
pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
{
	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
		return addr;
	}

	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return addr;
}
/*
 * Used by kmalloc/kfree, page already exists at va
 */
vm_page_t
pmap_kvtom(vm_offset_t va)
{
	vpte_t *ptep;

	KKASSERT(va >= KvaStart && va < KvaEnd);
	ptep = KernelPTA + (va >> PAGE_SHIFT);
	return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
}