/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/vmsystm.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/bootconf.h>
#include <sys/dumphdr.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>
#include <sys/panic.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count);

kmem_cache_t *htable_cache;
/*
 * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
 * is used in order to facilitate testing of the htable_steal() code.
 * By resetting htable_reserve_amount to a lower value, we can force
 * stealing to occur. The reserve amount is a guess to get us through boot.
 */
#define HTABLE_RESERVE_AMOUNT   (200)
uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
kmutex_t htable_reserve_mutex;
uint_t htable_reserve_cnt;
htable_t *htable_reserve_pool;
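
/*
 * Example (a hypothetical test setting, not a shipped default): since
 * htable_reserve_amount is a plain tunable, adding the following line to
 * /etc/system would shrink the reserve and force the stealing path to be
 * exercised soon after boot:
 *
 *      set htable_reserve_amount = 10
 */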
/*
 * Used to hand test htable_steal().
 */
ulong_t force_steal = 0;
ulong_t ptable_cnt = 0;
/*
 * This variable exists so that we can tune htable_steal_passes via
 * /etc/system. Any value works, but a power of two <= mmu.ptes_per_table
 * is best.
 */
uint_t htable_steal_passes = 8;
/*
 * mutex stuff for access to htable hash
 */
#define NUM_HTABLE_MUTEX 128
kmutex_t htable_mutex[NUM_HTABLE_MUTEX];

#define HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1))

#define HTABLE_ENTER(h) mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
#define HTABLE_EXIT(h)  mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
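
/*
 * Note that HTABLE_MUTEX_HASH() masks rather than taking a modulus, which
 * only works because NUM_HTABLE_MUTEX is a power of two. Distinct hash
 * buckets may fold onto the same mutex; that is harmless and merely a
 * minor source of contention.
 */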
/*
 * forward declarations
 */
static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr);
static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr);
static void htable_free(htable_t *ht);
static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index);
static void x86pte_release_pagetable(htable_t *ht);
static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old,
    x86pte_t new);
/*
 * A counter to track if we are stealing or reaping htables. When non-zero
 * htable_free() will directly free htables (either to the reserve or kmem)
 * instead of putting them in a hat's htable cache.
 */
uint32_t htable_dont_cache = 0;

/*
 * Track the number of active pagetables, so we can know how many to reap
 */
static uint32_t active_ptables = 0;
/*
 * Deal with hypervisor complications.
 */
void
xen_flush_va(caddr_t va)
{
    struct mmuext_op t;
    uint_t count;

    if (IN_XPV_PANIC()) {
        mmu_tlbflush_entry((caddr_t)va);
    } else {
        t.cmd = MMUEXT_INVLPG_LOCAL;
        t.arg1.linear_addr = (uintptr_t)va;
        if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
            panic("HYPERVISOR_mmuext_op() failed");
    }
}
void
xen_gflush_va(caddr_t va, cpuset_t cpus)
{
    struct mmuext_op t;
    uint_t count;

    if (IN_XPV_PANIC()) {
        mmu_tlbflush_entry((caddr_t)va);
        return;
    }

    t.cmd = MMUEXT_INVLPG_MULTI;
    t.arg1.linear_addr = (uintptr_t)va;
    /*LINTED: constant in conditional context*/
    set_xen_guest_handle(t.arg2.vcpumask, &cpus);
    if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
        panic("HYPERVISOR_mmuext_op() failed");
}
void
xen_flush_tlb()
{
    struct mmuext_op t;
    uint_t count;

    if (IN_XPV_PANIC()) {
        xpv_panic_reload_cr3();
    } else {
        t.cmd = MMUEXT_TLB_FLUSH_LOCAL;
        if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
            panic("HYPERVISOR_mmuext_op() failed");
    }
}
void
xen_gflush_tlb(cpuset_t cpus)
{
    struct mmuext_op t;
    uint_t count;

    ASSERT(!IN_XPV_PANIC());
    t.cmd = MMUEXT_TLB_FLUSH_MULTI;
    /*LINTED: constant in conditional context*/
    set_xen_guest_handle(t.arg2.vcpumask, &cpus);
    if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
        panic("HYPERVISOR_mmuext_op() failed");
}
/*
 * Install/Adjust a kpm mapping under the hypervisor.
 * Value of "how" should be:
 *      PT_WRITABLE | PT_VALID - regular kpm mapping
 *      PT_VALID - make mapping read-only
 *
 * returns 0 on success. non-zero for failure.
 */
int
xen_kpm_page(pfn_t pfn, uint_t how)
{
    paddr_t pa = mmu_ptob((paddr_t)pfn);
    x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD;

    if (kpm_vbase == NULL)
        return (0);

    pte |= pa_to_ma(pa) | how;
    return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa,
        pte, UVMF_INVLPG | UVMF_ALL));
}
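
/*
 * Typical usage, inferred from the callers later in this file (not a
 * separate contract): a page is made read-only in kpm before being handed
 * to the hypervisor as a pagetable, and writable again when reclaimed:
 *
 *      (void) xen_kpm_page(pfn, PT_VALID);                     read-only
 *      (void) xen_kpm_page(pfn, PT_VALID | PT_WRITABLE);       read-write
 */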
void
xen_pin(pfn_t pfn, level_t lvl)
{
    struct mmuext_op t;
    uint_t count;

    t.cmd = MMUEXT_PIN_L1_TABLE + lvl;
    t.arg1.mfn = pfn_to_mfn(pfn);
    if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
        panic("HYPERVISOR_mmuext_op() failed");
}
void
xen_unpin(pfn_t pfn)
{
    struct mmuext_op t;
    uint_t count;

    t.cmd = MMUEXT_UNPIN_TABLE;
    t.arg1.mfn = pfn_to_mfn(pfn);
    if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
        panic("HYPERVISOR_mmuext_op() failed");
}
static void
xen_map(uint64_t pte, caddr_t va)
{
    if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte,
        UVMF_INVLPG | UVMF_LOCAL))
        panic("HYPERVISOR_update_va_mapping() failed");
}
/*
 * Allocate a memory page for a hardware page table.
 *
 * A wrapper around page_get_physical(), with some extra checks.
 */
static pfn_t
ptable_alloc(uintptr_t seed)
{
    pfn_t pfn = PFN_INVALID;
    page_t *pp;

    /*
     * The first check is to see if there is memory in the system. If we
     * drop to throttlefree, then fail the ptable_alloc() and let the
     * stealing code kick in. Note that we have to do this test here,
     * since the test in page_create_throttle() would let the NOSLEEP
     * allocation go through and deplete the page reserves.
     *
     * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
     */
    if (!NOMEMWAIT() && freemem <= throttlefree + 1)
        return (PFN_INVALID);

    /*
     * This code makes htable_steal() easier to test. By setting
     * force_steal we force pagetable allocations to fall
     * into the stealing code. Roughly 1 in every "force_steal"
     * page table allocations will fail.
     */
    if (proc_pageout != NULL && force_steal > 1 &&
        ++ptable_cnt > force_steal) {
        ptable_cnt = 0;
        return (PFN_INVALID);
    }

    pp = page_get_physical(seed);
    if (pp == NULL)
        return (PFN_INVALID);
    ASSERT(PAGE_SHARED(pp));
    pfn = pp->p_pagenum;
    if (pfn == PFN_INVALID)
        panic("ptable_alloc(): Invalid PFN!!");
    atomic_inc_32(&active_ptables);
    HATSTAT_INC(hs_ptable_allocs);
    return (pfn);
}
/*
 * Free an htable's associated page table page. See the comments
 * for ptable_alloc().
 */
static void
ptable_free(pfn_t pfn)
{
    page_t *pp = page_numtopp_nolock(pfn);

    /*
     * need to destroy the page used for the pagetable
     */
    ASSERT(pfn != PFN_INVALID);
    HATSTAT_INC(hs_ptable_frees);
    atomic_dec_32(&active_ptables);
    if (pp == NULL)
        panic("ptable_free(): no page for pfn!");
    ASSERT(PAGE_SHARED(pp));
    ASSERT(pfn == pp->p_pagenum);
    ASSERT(!IN_XPV_PANIC());

    /*
     * Get an exclusive lock, might have to wait for a kmem reader.
     */
    if (!page_tryupgrade(pp)) {
        u_offset_t off = pp->p_offset;

        page_unlock(pp);
        pp = page_lookup(&kvp, off, SE_EXCL);
        if (pp == NULL)
            panic("page not found");
    }
    if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
        panic("failure making kpm r/w pfn=0x%lx", pfn);
    page_hashout(pp, NULL);
    page_free(pp, 1);
    page_unresv(1);
}
/*
 * Put one htable on the reserve list.
 */
static void
htable_put_reserve(htable_t *ht)
{
    ht->ht_hat = NULL;          /* no longer tied to a hat */
    ASSERT(ht->ht_pfn == PFN_INVALID);
    HATSTAT_INC(hs_htable_rputs);
    mutex_enter(&htable_reserve_mutex);
    ht->ht_next = htable_reserve_pool;
    htable_reserve_pool = ht;
    ++htable_reserve_cnt;
    mutex_exit(&htable_reserve_mutex);
}
/*
 * Take one htable from the reserve.
 */
static htable_t *
htable_get_reserve(void)
{
    htable_t *ht = NULL;

    mutex_enter(&htable_reserve_mutex);
    if (htable_reserve_cnt != 0) {
        ht = htable_reserve_pool;
        ASSERT(ht->ht_pfn == PFN_INVALID);
        htable_reserve_pool = ht->ht_next;
        --htable_reserve_cnt;
        HATSTAT_INC(hs_htable_rgets);
    }
    mutex_exit(&htable_reserve_mutex);
    return (ht);
}
/*
 * Allocate initial htables and put them on the reserve list
 */
void
htable_initial_reserve(uint_t count)
{
    htable_t *ht;

    count += HTABLE_RESERVE_AMOUNT;
    while (count > 0) {
        ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP);
        ASSERT(ht != NULL);
        ASSERT(use_boot_reserve);
        ht->ht_pfn = PFN_INVALID;
        htable_put_reserve(ht);
        --count;
    }
}
/*
 * Readjust the reserves after a thread finishes using them.
 */
void
htable_adjust_reserve()
{
    htable_t *ht;

    /*
     * Free any excess htables in the reserve list
     */
    while (htable_reserve_cnt > htable_reserve_amount &&
        !USE_HAT_RESERVES()) {
        ht = htable_get_reserve();
        if (ht == NULL)
            return;
        ASSERT(ht->ht_pfn == PFN_INVALID);
        kmem_cache_free(htable_cache, ht);
    }
}
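
/*
 * The reserve pool above is a simple mutex-protected LIFO:
 * htable_put_reserve() pushes an htable (its ht_pfn must already be
 * PFN_INVALID) and htable_get_reserve() pops one or returns NULL if the
 * pool is empty. htable_adjust_reserve() is the trim operation that
 * brings the pool back down toward htable_reserve_amount once a thread
 * is done dipping into it.
 */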
/*
 * This routine steals htables from user processes for htable_alloc() or
 * for htable_reap().
 */
static htable_t *
htable_steal(uint_t cnt)
{
    hat_t       *hat = kas.a_hat;   /* list starts with khat */
    htable_t    *list = NULL;
    htable_t    *ht;
    htable_t    *higher;
    uint_t      h;
    uint_t      h_start;
    static uint_t h_seed = 0;
    uint_t      e;
    uintptr_t   va;
    x86pte_t    pte;
    uint_t      stolen = 0;
    uint_t      pass;
    uint_t      threshold;

    /*
     * Limit htable_steal_passes to something reasonable
     */
    if (htable_steal_passes == 0)
        htable_steal_passes = 1;
    if (htable_steal_passes > mmu.ptes_per_table)
        htable_steal_passes = mmu.ptes_per_table;

    /*
     * Loop through all user hats. The 1st pass takes cached htables that
     * aren't in use. The later passes steal by removing mappings, too.
     */
    atomic_inc_32(&htable_dont_cache);
    for (pass = 0; pass <= htable_steal_passes && stolen < cnt; ++pass) {
        threshold = pass * mmu.ptes_per_table / htable_steal_passes;
        hat = kas.a_hat;
        for (;;) {
            /*
             * Clear the victim flag and move to next hat
             */
            mutex_enter(&hat_list_lock);
            if (hat != kas.a_hat) {
                hat->hat_flags &= ~HAT_VICTIM;
                cv_broadcast(&hat_list_cv);
            }
            hat = hat->hat_next;

            /*
             * Skip any hat that is already being stolen from.
             *
             * We skip SHARED hats, as these are dummy
             * hats that host ISM shared page tables.
             *
             * We also skip if HAT_FREEING because hat_pte_unmap()
             * won't zero out the PTE's. That would lead to hitting
             * stale PTEs either here or under hat_unload() when we
             * steal and unload the same page table in competing
             * threads.
             */
            while (hat != NULL && (hat->hat_flags &
                (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
                hat = hat->hat_next;

            if (hat == NULL) {
                mutex_exit(&hat_list_lock);
                break;
            }

            /*
             * Are we finished?
             */
            if (stolen == cnt) {
                /*
                 * Try to spread the pain of stealing,
                 * move victim HAT to the end of the HAT list.
                 */
                if (pass >= 1 && cnt == 1 &&
                    kas.a_hat->hat_prev != hat) {
                    /* unlink victim hat */
                    if (hat->hat_prev)
                        hat->hat_prev->hat_next =
                            hat->hat_next;
                    else
                        kas.a_hat->hat_next =
                            hat->hat_next;
                    if (hat->hat_next)
                        hat->hat_next->hat_prev =
                            hat->hat_prev;
                    else
                        kas.a_hat->hat_prev =
                            hat->hat_prev;

                    /* relink at end of hat list */
                    hat->hat_next = NULL;
                    hat->hat_prev = kas.a_hat->hat_prev;
                    if (hat->hat_prev)
                        hat->hat_prev->hat_next = hat;
                    else
                        kas.a_hat->hat_next = hat;
                    kas.a_hat->hat_prev = hat;
                }
                mutex_exit(&hat_list_lock);
                break;
            }

            /*
             * Mark the HAT as a stealing victim.
             */
            hat->hat_flags |= HAT_VICTIM;
            mutex_exit(&hat_list_lock);

            /*
             * Take any htables from the hat's cached "free" list.
             */
            hat_enter(hat);
            while ((ht = hat->hat_ht_cached) != NULL &&
                stolen < cnt) {
                hat->hat_ht_cached = ht->ht_next;
                ht->ht_next = list;
                list = ht;
                ++stolen;
            }
            hat_exit(hat);

            /*
             * Don't steal on first pass.
             */
            if (pass == 0 || stolen == cnt)
                continue;

            /*
             * Search the active htables for one to steal.
             * Start at a different hash bucket every time to
             * help spread the pain of stealing.
             */
            h = h_start = h_seed++ % hat->hat_num_hash;
            do {
                higher = NULL;
                HTABLE_ENTER(h);
                for (ht = hat->hat_ht_hash[h]; ht;
                    ht = ht->ht_next) {
                    /*
                     * Can we rule out reaping?
                     */
                    if (ht->ht_busy != 0 ||
                        (ht->ht_flags & HTABLE_SHARED_PFN) ||
                        ht->ht_level > 0 ||
                        ht->ht_valid_cnt > threshold ||
                        ht->ht_lock_cnt != 0)
                        continue;

                    /*
                     * Increment busy so the htable can't
                     * disappear. We drop the htable mutex
                     * to avoid deadlocks with
                     * hat_pageunload() and the hment mutex
                     * while we call hat_pte_unmap()
                     */
                    ++ht->ht_busy;
                    HTABLE_EXIT(h);

                    /*
                     * Try stealing.
                     * - unload and invalidate all PTEs
                     */
                    for (e = 0, va = ht->ht_vaddr;
                        e < HTABLE_NUM_PTES(ht) &&
                        ht->ht_valid_cnt > 0 &&
                        ht->ht_busy == 1 &&
                        ht->ht_lock_cnt == 0;
                        ++e, va += MMU_PAGESIZE) {
                        pte = x86pte_get(ht, e);
                        if (!PTE_ISVALID(pte))
                            continue;
                        hat_pte_unmap(ht, e,
                            HAT_UNLOAD, pte, NULL);
                    }

                    /*
                     * Reacquire htable lock. If we didn't
                     * remove all mappings in the table,
                     * or another thread added a new mapping
                     * behind us, give up on this table.
                     */
                    HTABLE_ENTER(h);
                    if (ht->ht_busy != 1 ||
                        ht->ht_valid_cnt != 0 ||
                        ht->ht_lock_cnt != 0) {
                        --ht->ht_busy;
                        continue;
                    }

                    /*
                     * Steal it and unlink the page table.
                     */
                    higher = ht->ht_parent;
                    unlink_ptp(higher, ht, ht->ht_vaddr);

                    /*
                     * remove from the hash list
                     */
                    if (ht->ht_next)
                        ht->ht_next->ht_prev =
                            ht->ht_prev;
                    if (ht->ht_prev) {
                        ht->ht_prev->ht_next =
                            ht->ht_next;
                    } else {
                        ASSERT(hat->hat_ht_hash[h] ==
                            ht);
                        hat->hat_ht_hash[h] =
                            ht->ht_next;
                    }

                    /*
                     * Break to outer loop to release the
                     * higher (ht_parent) pagetable. This
                     * spreads out the pain caused by
                     * pagefaults.
                     */
                    ht->ht_next = list;
                    list = ht;
                    ++stolen;
                    break;
                }
                HTABLE_EXIT(h);
                if (higher != NULL)
                    htable_release(higher);
                if (++h == hat->hat_num_hash)
                    h = 0;
            } while (stolen < cnt && h != h_start);
        }
    }
    atomic_dec_32(&htable_dont_cache);
    return (list);
}
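
/*
 * Worked example of the pass/threshold schedule above (illustrative
 * numbers): with htable_steal_passes == 8 and mmu.ptes_per_table == 512,
 * pass 0 only harvests cached htables, pass 1 will steal pagetables with
 * at most 64 valid entries, pass 2 at most 128, and so on, until the
 * final pass may steal from any pagetable that isn't busy or locked.
 */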
/*
 * This is invoked from kmem when the system is low on memory. We try
 * to free hments, htables, and ptables to improve the memory situation.
 */
/*ARGSUSED*/
static void
htable_reap(void *handle)
{
    uint_t reap_cnt;
    htable_t *list;
    htable_t *ht;

    HATSTAT_INC(hs_reap_attempts);
    if (!can_steal_post_boot)
        return;

    /*
     * Try to reap 5% of the page tables bounded by a maximum of
     * 5% of physmem and a minimum of 10.
     */
    reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10);

    /*
     * Let htable_steal() do the work, we just call htable_free()
     */
    XPV_DISALLOW_MIGRATE();
    list = htable_steal(reap_cnt);
    XPV_ALLOW_MIGRATE();
    while ((ht = list) != NULL) {
        list = ht->ht_next;
        HATSTAT_INC(hs_reaped);
        htable_free(ht);
    }

    /*
     * Free up excess reserves
     */
    htable_adjust_reserve();
    hment_adjust_reserve();
}
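
/*
 * Sizing example for the formula above: with physmem at 1M pages and
 * 40,000 active pagetables, reap_cnt = MAX(MIN(50000, 2000), 10) = 2000,
 * i.e. 5% of the active pagetables, capped at 5% of physmem and never
 * fewer than 10.
 */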
/*
 * Allocate an htable, stealing one or using the reserve if necessary
 */
static htable_t *
htable_alloc(
    hat_t       *hat,
    uintptr_t   vaddr,
    level_t     level,
    htable_t    *shared)
{
    htable_t    *ht = NULL;
    uint_t      is_vlp;
    uint_t      is_bare = 0;
    uint_t      need_to_zero = 1;
    int         kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);

    if (level < 0 || level > TOP_LEVEL(hat))
        panic("htable_alloc(): level %d out of range\n", level);

    is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
    if (is_vlp || shared != NULL)
        is_bare = 1;

    /*
     * First reuse a cached htable from the hat_ht_cached field, this
     * avoids unnecessary trips through kmem/page allocators.
     */
    if (hat->hat_ht_cached != NULL && !is_bare) {
        hat_enter(hat);
        ht = hat->hat_ht_cached;
        if (ht != NULL) {
            hat->hat_ht_cached = ht->ht_next;
            need_to_zero = 0;
            /* XX64 ASSERT() they're all zero somehow */
            ASSERT(ht->ht_pfn != PFN_INVALID);
        }
        hat_exit(hat);
    }

    if (ht == NULL) {
        /*
         * Allocate an htable, possibly refilling the reserves.
         */
        if (USE_HAT_RESERVES()) {
            ht = htable_get_reserve();
        } else {
            /*
             * Donate successful htable allocations to the reserve.
             */
            for (;;) {
                ht = kmem_cache_alloc(htable_cache, kmflags);
                if (ht == NULL)
                    break;
                ht->ht_pfn = PFN_INVALID;
                if (USE_HAT_RESERVES() ||
                    htable_reserve_cnt >= htable_reserve_amount)
                    break;
                htable_put_reserve(ht);
            }
        }

        /*
         * allocate a page for the hardware page table if needed
         */
        if (ht != NULL && !is_bare) {
            ht->ht_pfn = ptable_alloc((uintptr_t)ht);
            if (ht->ht_pfn == PFN_INVALID) {
                if (USE_HAT_RESERVES())
                    htable_put_reserve(ht);
                else
                    kmem_cache_free(htable_cache, ht);
                ht = NULL;
            }
        }
    }

    /*
     * If allocations failed, kick off a kmem_reap() and resort to
     * htable steal(). We may spin here if the system is very low on
     * memory. If the kernel itself has consumed all memory and kmem_reap()
     * can't free up anything, then we'll really get stuck here.
     * That should only happen in a system where the administrator has
     * misconfigured VM parameters via /etc/system.
     */
    while (ht == NULL && can_steal_post_boot) {
        kmem_reap();
        ht = htable_steal(1);
        HATSTAT_INC(hs_steals);

        /*
         * If we stole for a bare htable, release the pagetable page.
         */
        if (ht != NULL) {
            if (is_bare) {
                ptable_free(ht->ht_pfn);
                ht->ht_pfn = PFN_INVALID;
#if defined(__xpv) && defined(__amd64)
            /*
             * make stolen page table writable again in kpm
             */
            } else if (kpm_vbase && xen_kpm_page(ht->ht_pfn,
                PT_VALID | PT_WRITABLE) < 0) {
                panic("failure making kpm r/w pfn=0x%lx",
                    ht->ht_pfn);
#endif
            }
        }
    }

    /*
     * All attempts to allocate or steal failed. This should only happen
     * if we run out of memory during boot, due perhaps to a huge
     * boot_archive. At this point there's no way to continue.
     */
    if (ht == NULL)
        panic("htable_alloc(): couldn't steal\n");

#if defined(__amd64) && defined(__xpv)
    /*
     * Under the 64-bit hypervisor, we have 2 top level page tables.
     * If this allocation fails, we'll resort to stealing.
     * We use the stolen page indirectly, by freeing the
     * stolen htable first.
     */
    if (level == mmu.max_level) {
        for (;;) {
            htable_t *stolen;

            hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1);
            if (hat->hat_user_ptable != PFN_INVALID)
                break;
            stolen = htable_steal(1);
            if (stolen == NULL)
                panic("2nd steal ptable failed\n");
            htable_free(stolen);
        }
        block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable),
            MMU_PAGESIZE);
    }
#endif

    /*
     * Shared page tables have all entries locked and entries may not
     * be added or deleted.
     */
    ht->ht_flags = 0;
    if (shared != NULL) {
        ASSERT(shared->ht_valid_cnt > 0);
        ht->ht_flags |= HTABLE_SHARED_PFN;
        ht->ht_pfn = shared->ht_pfn;
        ht->ht_lock_cnt = 0;
        ht->ht_valid_cnt = 0;           /* updated in hat_share() */
        ht->ht_shares = shared;
        need_to_zero = 0;
    } else {
        ht->ht_shares = NULL;
        ht->ht_lock_cnt = 0;
        ht->ht_valid_cnt = 0;
    }

    /*
     * setup flags, etc. for VLP htables
     */
    if (is_vlp) {
        ht->ht_flags |= HTABLE_VLP;
        ASSERT(ht->ht_pfn == PFN_INVALID);
        need_to_zero = 0;
    }

    /*
     * fill in the htable
     */
    ht->ht_hat = hat;
    ht->ht_parent = NULL;
    ht->ht_vaddr = vaddr;
    ht->ht_level = level;
    ht->ht_busy = 1;
    ht->ht_next = NULL;
    ht->ht_prev = NULL;

    /*
     * Zero out any freshly allocated page table
     */
    if (need_to_zero)
        x86pte_zero(ht, 0, mmu.ptes_per_table);

#if defined(__amd64) && defined(__xpv)
    if (!is_bare && kpm_vbase) {
        (void) xen_kpm_page(ht->ht_pfn, PT_VALID);
        if (level == mmu.max_level)
            (void) xen_kpm_page(hat->hat_user_ptable, PT_VALID);
    }
#endif

    return (ht);
}
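
/*
 * Summary of the fallback order implemented above (descriptive only, no
 * new behavior): per-hat cached htable, then the reserve pool or kmem
 * cache plus ptable_alloc(), then kmem_reap() plus htable_steal(), and
 * finally panic. Only the first step avoids the allocators entirely,
 * which is why the hat_ht_cached list is checked before anything else.
 */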
/*
 * Free up an htable, either to a hat's cached list, the reserves or
 * back to kmem.
 */
static void
htable_free(htable_t *ht)
{
    hat_t *hat = ht->ht_hat;

    /*
     * If the process isn't exiting, cache the free htable in the hat
     * structure. We always do this for the boot time reserve. We don't
     * do this if the hat is exiting or we are stealing/reaping htables.
     */
    if (hat != NULL &&
        !(ht->ht_flags & HTABLE_SHARED_PFN) &&
        (use_boot_reserve ||
        (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
        ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
        ASSERT(ht->ht_pfn != PFN_INVALID);
        hat_enter(hat);
        ht->ht_next = hat->hat_ht_cached;
        hat->hat_ht_cached = ht;
        hat_exit(hat);
        return;
    }

    /*
     * If we have a hardware page table, free it.
     * We don't free page tables that are accessed by sharing.
     */
    if (ht->ht_flags & HTABLE_SHARED_PFN) {
        ASSERT(ht->ht_pfn != PFN_INVALID);
    } else if (!(ht->ht_flags & HTABLE_VLP)) {
        ptable_free(ht->ht_pfn);
#if defined(__amd64) && defined(__xpv)
        if (ht->ht_level == mmu.max_level) {
            ptable_free(hat->hat_user_ptable);
            hat->hat_user_ptable = PFN_INVALID;
        }
#endif
    }
    ht->ht_pfn = PFN_INVALID;

    /*
     * Free it or put into reserves.
     */
    if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
        htable_put_reserve(ht);
    } else {
        kmem_cache_free(htable_cache, ht);
        htable_adjust_reserve();
    }
}
/*
 * This is called when a hat is being destroyed or swapped out. We reap all
 * the remaining htables in the hat cache. If destroying, all left-over
 * htables are also destroyed.
 *
 * We also don't need to invalidate any of the PTPs nor do any demapping.
 */
void
htable_purge_hat(hat_t *hat)
{
    htable_t *ht;
    int h;

    /*
     * Purge the htable cache if just reaping.
     */
    if (!(hat->hat_flags & HAT_FREEING)) {
        atomic_inc_32(&htable_dont_cache);
        for (;;) {
            hat_enter(hat);
            ht = hat->hat_ht_cached;
            if (ht == NULL) {
                hat_exit(hat);
                break;
            }
            hat->hat_ht_cached = ht->ht_next;
            hat_exit(hat);
            htable_free(ht);
        }
        atomic_dec_32(&htable_dont_cache);
        return;
    }

    /*
     * if freeing, no locking is needed
     */
    while ((ht = hat->hat_ht_cached) != NULL) {
        hat->hat_ht_cached = ht->ht_next;
        htable_free(ht);
    }

    /*
     * walk thru the htable hash table and free all the htables in it.
     */
    for (h = 0; h < hat->hat_num_hash; ++h) {
        while ((ht = hat->hat_ht_hash[h]) != NULL) {
            if (ht->ht_next)
                ht->ht_next->ht_prev = ht->ht_prev;
            if (ht->ht_prev) {
                ht->ht_prev->ht_next = ht->ht_next;
            } else {
                ASSERT(hat->hat_ht_hash[h] == ht);
                hat->hat_ht_hash[h] = ht->ht_next;
            }
            htable_free(ht);
        }
    }
}
/*
 * Unlink an entry for a table at vaddr and level out of the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
{
    uint_t entry = htable_va2entry(vaddr, higher);
    x86pte_t expect = MAKEPTP(old->ht_pfn, old->ht_level);
    x86pte_t found;
    hat_t *hat = old->ht_hat;

    ASSERT(higher->ht_busy > 0);
    ASSERT(higher->ht_valid_cnt > 0);
    ASSERT(old->ht_valid_cnt == 0);
    found = x86pte_cas(higher, entry, expect, 0);
#ifdef __xpv
    /*
     * This is weird, but Xen apparently automatically unlinks empty
     * pagetables from the upper page table. So allow PTP to be 0 already.
     */
    if (found != expect && found != 0)
#else
    if (found != expect)
#endif
        panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
            found, expect);

    /*
     * When a top level VLP page table entry changes, we must issue
     * a reload of cr3 on all processors.
     *
     * If we don't need to do that, then we still have to INVLPG against
     * an address covered by the inner page table, as the latest processors
     * have TLB-like caches for non-leaf page table entries.
     */
    if (!(hat->hat_flags & HAT_FREEING)) {
        hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
            DEMAP_ALL_ADDR : old->ht_vaddr);
    }

    HTABLE_DEC(higher->ht_valid_cnt);
}
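
/*
 * The expect/CAS pattern above is the standard idiom in this file: build
 * the PTP value the entry should currently contain (MAKEPTP of the
 * child's pfn and level), CAS it to the new value, and verify the old
 * contents matched. For PTP links any mismatch means another agent
 * changed the entry underneath us, which is fatal.
 */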
/*
 * Link an entry for a new table at vaddr and level into the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
{
    uint_t entry = htable_va2entry(vaddr, higher);
    x86pte_t newptp = MAKEPTP(new->ht_pfn, new->ht_level);
    x86pte_t found;

    ASSERT(higher->ht_busy > 0);
    ASSERT(new->ht_level != mmu.max_level);

    HTABLE_INC(higher->ht_valid_cnt);

    found = x86pte_cas(higher, entry, 0, newptp);
    if ((found & ~PT_REF) != 0)
        panic("HAT: ptp not 0, found=" FMT_PTE, found);

    /*
     * When any top level VLP page table entry changes, we must issue
     * a reload of cr3 on all processors using it.
     * We also need to do this for the kernel hat on PAE 32 bit kernel.
     */
    if (
#ifdef __i386
        (higher->ht_hat == kas.a_hat &&
        higher->ht_level == VLP_LEVEL) ||
#endif
        (higher->ht_flags & HTABLE_VLP))
        hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
}
/*
 * Release of hold on an htable. If this is the last use and the pagetable
 * is empty we may want to free it, then recursively look at the pagetable
 * above it. The recursion is handled by the outer while() loop.
 *
 * On the metal, during process exit, we don't bother unlinking the tables from
 * upper level pagetables. They are instead handled in bulk by hat_free_end().
 * We can't do this on the hypervisor as we need the page table to be
 * implicitly unpinned before it goes to the free page lists. This can't
 * happen unless we fully unlink it from the page table hierarchy.
 */
void
htable_release(htable_t *ht)
{
    uint_t      hashval;
    htable_t    *shared;
    htable_t    *higher;
    hat_t       *hat;
    uintptr_t   va;
    level_t     level;

    while (ht != NULL) {
        shared = NULL;
        for (;;) {
            hat = ht->ht_hat;
            va = ht->ht_vaddr;
            level = ht->ht_level;
            hashval = HTABLE_HASH(hat, va, level);

            /*
             * The common case is that this isn't the last use of
             * an htable so we don't want to free the htable.
             */
            HTABLE_ENTER(hashval);
            ASSERT(ht->ht_valid_cnt >= 0);
            ASSERT(ht->ht_busy > 0);
            if (ht->ht_valid_cnt > 0)
                break;
            if (ht->ht_busy > 1)
                break;
            ASSERT(ht->ht_lock_cnt == 0);

#if !defined(__xpv)
            /*
             * we always release empty shared htables
             */
            if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {
                /*
                 * don't release if in address space tear down
                 */
                if (hat->hat_flags & HAT_FREEING)
                    break;

                /*
                 * At and above max_page_level, free if it's for
                 * a boot-time kernel mapping below kernelbase.
                 */
                if (level >= mmu.max_page_level &&
                    (hat != kas.a_hat || va >= kernelbase))
                    break;
            }
#endif /* __xpv */

            /*
             * Remember if we destroy an htable that shares its PFN
             * from elsewhere.
             */
            if (ht->ht_flags & HTABLE_SHARED_PFN) {
                ASSERT(shared == NULL);
                shared = ht->ht_shares;
                HATSTAT_INC(hs_htable_unshared);
            }

            /*
             * Handle release of a table and freeing the htable_t.
             * Unlink it from the table higher (ie. ht_parent).
             */
            higher = ht->ht_parent;
            ASSERT(higher != NULL);

            /*
             * Unlink the pagetable.
             */
            unlink_ptp(higher, ht, va);

            /*
             * remove this htable from its hash list
             */
            if (ht->ht_next)
                ht->ht_next->ht_prev = ht->ht_prev;
            if (ht->ht_prev) {
                ht->ht_prev->ht_next = ht->ht_next;
            } else {
                ASSERT(hat->hat_ht_hash[hashval] == ht);
                hat->hat_ht_hash[hashval] = ht->ht_next;
            }
            HTABLE_EXIT(hashval);
            htable_free(ht);
            ht = higher;
        }

        ASSERT(ht->ht_busy >= 1);
        --ht->ht_busy;
        HTABLE_EXIT(hashval);

        /*
         * If we released a shared htable, do a release on the htable
         * from which it shared
         */
        ht = shared;
    }
}
/*
 * Find the htable for the pagetable at the given level for the given address.
 * If found acquires a hold that eventually needs to be htable_release()d
 */
htable_t *
htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
{
    uintptr_t   base;
    uint_t      hashval;
    htable_t    *ht = NULL;

    ASSERT(level <= TOP_LEVEL(hat));

    if (level == TOP_LEVEL(hat)) {
#if defined(__amd64)
        /*
         * 32 bit address spaces on 64 bit kernels need to check
         * for overflow of the 32 bit address space
         */
        if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
            return (NULL);
#endif
        base = 0;
    } else {
        base = vaddr & LEVEL_MASK(level + 1);
    }

    hashval = HTABLE_HASH(hat, base, level);
    HTABLE_ENTER(hashval);
    for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
        if (ht->ht_hat == hat &&
            ht->ht_vaddr == base &&
            ht->ht_level == level)
            break;
    }
    if (ht)
        ++ht->ht_busy;

    HTABLE_EXIT(hashval);
    return (ht);
}
/*
 * Acquires a hold on a known htable (from a locked hment entry).
 */
void
htable_acquire(htable_t *ht)
{
    hat_t       *hat = ht->ht_hat;
    level_t     level = ht->ht_level;
    uintptr_t   base = ht->ht_vaddr;
    uint_t      hashval = HTABLE_HASH(hat, base, level);

    HTABLE_ENTER(hashval);
#ifdef DEBUG
    /*
     * make sure the htable is there
     */
    {
        htable_t *h;

        for (h = hat->hat_ht_hash[hashval];
            h && h != ht;
            h = h->ht_next)
            ;
        ASSERT(h == ht);
    }
#endif /* DEBUG */
    ++ht->ht_busy;
    HTABLE_EXIT(hashval);
}
/*
 * Find the htable for the pagetable at the given level for the given address.
 * If found acquires a hold that eventually needs to be htable_release()d
 * If not found the table is created.
 *
 * Since we can't hold a hash table mutex during allocation, we have to
 * drop it and redo the search on a create. Then we may have to free the newly
 * allocated htable if another thread raced in and created it ahead of us.
 */
htable_t *
htable_create(
    hat_t       *hat,
    uintptr_t   vaddr,
    level_t     level,
    htable_t    *shared)
{
    uint_t      h;
    level_t     l;
    uintptr_t   base;
    htable_t    *ht;
    htable_t    *higher = NULL;
    htable_t    *new = NULL;

    if (level < 0 || level > TOP_LEVEL(hat))
        panic("htable_create(): level %d out of range\n", level);

    /*
     * Create the page tables in top down order.
     */
    for (l = TOP_LEVEL(hat); l >= level; --l) {
        new = NULL;
        if (l == TOP_LEVEL(hat))
            base = 0;
        else
            base = vaddr & LEVEL_MASK(l + 1);

        h = HTABLE_HASH(hat, base, l);
try_again:
        /*
         * look up the htable at this level
         */
        HTABLE_ENTER(h);
        if (l == TOP_LEVEL(hat)) {
            ht = hat->hat_htable;
        } else {
            for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
                ASSERT(ht->ht_hat == hat);
                if (ht->ht_vaddr == base &&
                    ht->ht_level == l)
                    break;
            }
        }

        /*
         * if we found the htable, increment its busy cnt
         * and if we had allocated a new htable, free it.
         */
        if (ht != NULL) {
            /*
             * If we find a pre-existing shared table, it must
             * share from the same place.
             */
            if (l == level && shared && ht->ht_shares &&
                ht->ht_shares != shared) {
                panic("htable shared from wrong place "
                    "found htable=%p shared=%p",
                    (void *)ht, (void *)shared);
            }
            ++ht->ht_busy;
            HTABLE_EXIT(h);
            if (new)
                htable_free(new);
            if (higher != NULL)
                htable_release(higher);
            higher = ht;

        /*
         * if we didn't find it on the first search
         * allocate a new one and search again
         */
        } else if (new == NULL) {
            HTABLE_EXIT(h);
            new = htable_alloc(hat, base, l,
                l == level ? shared : NULL);
            goto try_again;

        /*
         * 2nd search and still not there, use "new" table
         * Link new table into higher, when not at top level.
         */
        } else {
            ht = new;
            if (higher != NULL) {
                link_ptp(higher, ht, base);
                ht->ht_parent = higher;
            }
            ht->ht_next = hat->hat_ht_hash[h];
            ASSERT(ht->ht_prev == NULL);
            if (hat->hat_ht_hash[h])
                hat->hat_ht_hash[h]->ht_prev = ht;
            hat->hat_ht_hash[h] = ht;
            HTABLE_EXIT(h);

            /*
             * Note we don't do htable_release(higher).
             * That happens recursively when "new" is removed by
             * htable_release() or htable_steal().
             */
            higher = ht;

            /*
             * If we just created a new shared page table we
             * increment the shared htable's busy count, so that
             * it can't be the victim of a steal even if it's empty.
             */
            if (l == level && shared) {
                (void) htable_lookup(shared->ht_hat,
                    shared->ht_vaddr, shared->ht_level);
                HATSTAT_INC(hs_htable_shared);
            }
        }
    }

    return (higher);
}
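
/*
 * The lookup/allocate/re-lookup shape above deserves a note: because the
 * hash mutex cannot be held across htable_alloc(), the loop drops the
 * mutex to allocate on a miss and then searches the bucket a second
 * time. Only after the second miss is the new htable linked in; if
 * another thread won the race, the fresh allocation is simply handed
 * back via htable_free().
 */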
/*
 * Inherit initial pagetables from the boot program. On the 64-bit
 * hypervisor we also temporarily mark the p_index field of page table
 * pages, so we know not to try making them writable in seg_kpm.
 */
void
htable_attach(
    hat_t       *hat,
    uintptr_t   base,
    level_t     level,
    htable_t    *parent,
    pfn_t       pfn)
{
    htable_t    *ht;
    uint_t      h;
    uint_t      i;
    x86pte_t    pte;
    x86pte_t    *ptep;
    page_t      *pp;
    extern page_t *boot_claim_page(pfn_t);

    ht = htable_get_reserve();
    if (level == mmu.max_level)
        kas.a_hat->hat_htable = ht;
    ht->ht_hat = hat;
    ht->ht_parent = parent;
    ht->ht_vaddr = base;
    ht->ht_level = level;
    ht->ht_busy = 1;
    ht->ht_next = NULL;
    ht->ht_prev = NULL;
    ht->ht_flags = 0;
    ht->ht_pfn = pfn;
    ht->ht_lock_cnt = 0;
    ht->ht_valid_cnt = 0;
    if (parent != NULL)
        ++parent->ht_busy;

    h = HTABLE_HASH(hat, base, level);
    HTABLE_ENTER(h);
    ht->ht_next = hat->hat_ht_hash[h];
    ASSERT(ht->ht_prev == NULL);
    if (hat->hat_ht_hash[h])
        hat->hat_ht_hash[h]->ht_prev = ht;
    hat->hat_ht_hash[h] = ht;
    HTABLE_EXIT(h);

    /*
     * make sure the page table physical page is not FREE
     */
    if (page_resv(1, KM_NOSLEEP) == 0)
        panic("page_resv() failed in ptable alloc");

    pp = boot_claim_page(pfn);
    ASSERT(pp != NULL);

    /*
     * Page table pages that were allocated by dboot or
     * in very early startup didn't go through boot_mapin()
     * and so won't have vnode/offsets. Fix that here.
     */
    if (pp->p_vnode == NULL) {
        /* match offset calculation in page_get_physical() */
        u_offset_t offset = (uintptr_t)ht;

        if (offset > kernelbase)
            offset -= kernelbase;
        offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
        offset += mmu.hole_start;   /* something in VA hole */
#else
        offset += 1ULL << 40;       /* something > 4 Gig */
#endif
        ASSERT(page_exists(&kvp, offset) == NULL);
        (void) page_hashin(pp, &kvp, offset, NULL);
    }
    page_downgrade(pp);
#if defined(__xpv) && defined(__amd64)
    /*
     * Record in the page_t that is a pagetable for segkpm setup.
     */
    if (kpm_vbase)
        pp->p_index = 1;
#endif

    /*
     * Count valid mappings and recursively attach lower level pagetables.
     */
    ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
    for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) {
#if defined(__amd64)
        pte = ptep[i];
#else
        pte = ((x86pte32_t *)ptep)[i];
#endif
        if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) {
            ++ht->ht_valid_cnt;
            if (!PTE_ISPAGE(pte, level)) {
                htable_attach(hat, base, level - 1,
                    ht, PTE2PFN(pte, level));
                ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
            }
        }
        base += LEVEL_SIZE(level);
        if (base == mmu.hole_start)
            base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK;
    }

    /*
     * As long as all the mappings we had were below kernel base
     * we can release the htable.
     */
    if (base < kernelbase)
        htable_release(ht);
}
/*
 * Walk through a given htable looking for the first valid entry. This
 * routine takes both a starting and ending address. The starting address
 * is required to be within the htable provided by the caller, but there is
 * no such restriction on the ending address.
 *
 * If the routine finds a valid entry in the htable (at or beyond the
 * starting address), the PTE (and its address) will be returned.
 * This PTE may correspond to either a page or a pagetable - it is the
 * caller's responsibility to determine which. If no valid entry is
 * found, 0 (and invalid PTE) and the next unexamined address will be
 * returned.
 *
 * The loop has been carefully coded for optimization.
 */
static x86pte_t
htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr)
{
    uint_t e;
    x86pte_t found_pte = (x86pte_t)0;
    caddr_t pte_ptr;
    caddr_t end_pte_ptr;
    int l = ht->ht_level;
    uintptr_t va = *vap & LEVEL_MASK(l);
    size_t pgsize = LEVEL_SIZE(l);

    ASSERT(va >= ht->ht_vaddr);
    ASSERT(va <= HTABLE_LAST_PAGE(ht));

    /*
     * Compute the starting index and ending virtual address
     */
    e = htable_va2entry(va, ht);

    /*
     * The following page table scan code knows that the valid
     * bit of a PTE is in the lowest byte AND that x86 is little endian!!
     */
    pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0);
    end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht));
    pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e);
    while (!PTE_ISVALID(*pte_ptr)) {
        va += pgsize;
        if (va >= eaddr)
            break;
        pte_ptr += mmu.pte_size;
        ASSERT(pte_ptr <= end_pte_ptr);
        if (pte_ptr == end_pte_ptr)
            break;
    }

    /*
     * if we found a valid PTE, load the entire PTE
     */
    if (va < eaddr && pte_ptr != end_pte_ptr)
        found_pte = GET_PTE((x86pte_t *)pte_ptr);
    x86pte_release_pagetable(ht);

#if defined(__amd64)
    /*
     * deal with VA hole on amd64
     */
    if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end)
        va = mmu.hole_end + va - mmu.hole_start;
#endif /* __amd64 */

    *vap = va;
    return (found_pte);
}
/*
 * Find the address and htable for the first populated translation at or
 * above the given virtual address. The caller may also specify an upper
 * limit to the address range to search. Uses level information to quickly
 * skip unpopulated sections of virtual address spaces.
 *
 * If not found returns NULL. When found, returns the htable and virt addr
 * and has a hold on the htable.
 */
x86pte_t
htable_walk(
    struct hat *hat,
    htable_t **htp,
    uintptr_t *vaddr,
    uintptr_t eaddr)
{
    uintptr_t va = *vaddr;
    htable_t *ht;
    htable_t *prev = *htp;
    level_t l;
    level_t max_mapped_level;
    x86pte_t pte;

    ASSERT(eaddr > va);

    /*
     * If this is a user address, then we know we need not look beyond
     * kernelbase.
     */
    ASSERT(hat == kas.a_hat || eaddr <= kernelbase ||
        eaddr == HTABLE_WALK_TO_END);
    if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END)
        eaddr = kernelbase;

    /*
     * If we're coming in with a previous page table, search it first
     * without doing an htable_lookup(), this should be frequent.
     */
    if (prev) {
        ASSERT(prev->ht_busy > 0);
        ASSERT(prev->ht_vaddr <= va);
        l = prev->ht_level;
        if (va <= HTABLE_LAST_PAGE(prev)) {
            pte = htable_scan(prev, &va, eaddr);

            if (PTE_ISPAGE(pte, l)) {
                *vaddr = va;
                *htp = prev;
                return (pte);
            }
        }

        /*
         * We found nothing in the htable provided by the caller,
         * so fall through and do the full search
         */
        htable_release(prev);
    }

    /*
     * Find the level of the largest pagesize used by this HAT.
     */
    if (hat->hat_ism_pgcnt > 0) {
        max_mapped_level = mmu.umax_page_level;
    } else {
        max_mapped_level = 0;
        for (l = 1; l <= mmu.max_page_level; ++l)
            if (hat->hat_pages_mapped[l] != 0)
                max_mapped_level = l;
    }

    while (va < eaddr && va >= *vaddr) {
        ASSERT(!IN_VA_HOLE(va));

        /*
         * Find lowest table with any entry for given address.
         */
        for (l = 0; l <= TOP_LEVEL(hat); ++l) {
            ht = htable_lookup(hat, va, l);
            if (ht != NULL) {
                pte = htable_scan(ht, &va, eaddr);
                if (PTE_ISPAGE(pte, l)) {
                    *vaddr = va;
                    *htp = ht;
                    return (pte);
                }
                htable_release(ht);
                break;
            }

            /*
             * No htable at this level for the address. If there
             * is no larger page size that could cover it, we can
             * skip right to the start of the next page table.
             */
            ASSERT(l < TOP_LEVEL(hat));
            if (l >= max_mapped_level) {
                va = NEXT_ENTRY_VA(va, l + 1);
                if (va >= eaddr)
                    break;
            }
        }
    }

    *vaddr = 0;
    *htp = NULL;
    return (0);
}
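
/*
 * Example of the level-skip optimization above (assuming a 4-level amd64
 * MMU with 512 entries per table): if nothing larger than 4K pages is
 * mapped (max_mapped_level == 0) and no level 0 htable covers va, the
 * walk advances va straight to the next level 1 boundary (2M) via
 * NEXT_ENTRY_VA() rather than probing each 4K slot individually.
 */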
/*
 * Find the htable and page table entry index of the given virtual address
 * with pagesize at or below given level.
 * If not found returns NULL. When found, returns the htable, sets
 * entry, and has a hold on the htable.
 */
htable_t *
htable_getpte(
    struct hat *hat,
    uintptr_t vaddr,
    uint_t *entry,
    x86pte_t *pte,
    level_t level)
{
    htable_t    *ht;
    level_t     l;
    uint_t      e;

    ASSERT(level <= mmu.max_page_level);

    for (l = 0; l <= level; ++l) {
        ht = htable_lookup(hat, vaddr, l);
        if (ht == NULL)
            continue;
        e = htable_va2entry(vaddr, ht);
        if (entry != NULL)
            *entry = e;
        if (pte != NULL)
            *pte = x86pte_get(ht, e);
        return (ht);
    }
    return (NULL);
}
/*
 * Find the htable and page table entry index of the given virtual address.
 * There must be a valid page mapped at the given address.
 * If not found returns NULL. When found, returns the htable, sets
 * entry, and has a hold on the htable.
 */
htable_t *
htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry)
{
    htable_t    *ht;
    uint_t      e;
    x86pte_t    pte;

    ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level);
    if (ht == NULL)
        return (NULL);

    if (entry)
        *entry = e;

    if (PTE_ISPAGE(pte, ht->ht_level))
        return (ht);
    htable_release(ht);
    return (NULL);
}
void
htable_init()
{
    /*
     * To save on kernel VA usage, we avoid debug information in 32 bit
     * kernels.
     */
#if defined(__amd64)
    int kmem_flags = KMC_NOHASH;
#elif defined(__i386)
    int kmem_flags = KMC_NOHASH | KMC_NODEBUG;
#endif

    /*
     * initialize kmem caches
     */
    htable_cache = kmem_cache_create("htable_t",
        sizeof (htable_t), 0, NULL, NULL,
        htable_reap, NULL, hat_memload_arena, kmem_flags);
}
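
/*
 * Note on the cache setup: the sixth argument to kmem_cache_create() is
 * the reclaim callback, so registering htable_reap here is what lets
 * kmem call back into this file to shed htables and ptables when the
 * system runs low on memory.
 */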
/*
 * get the pte index for the virtual address in the given htable's pagetable
 */
uint_t
htable_va2entry(uintptr_t va, htable_t *ht)
{
    level_t l = ht->ht_level;

    ASSERT(va >= ht->ht_vaddr);
    ASSERT(va <= HTABLE_LAST_PAGE(ht));
    return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1));
}
/*
 * Given an htable and the index of a pte in it, return the virtual address
 * of the page.
 */
uintptr_t
htable_e2va(htable_t *ht, uint_t entry)
{
    level_t l = ht->ht_level;
    uintptr_t va;

    ASSERT(entry < HTABLE_NUM_PTES(ht));
    va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l));

    /*
     * Need to skip over any VA hole in top level table
     */
#if defined(__amd64)
    if (ht->ht_level == mmu.max_level && va >= mmu.hole_start)
        va += ((mmu.hole_end - mmu.hole_start) + 1);
#endif

    return (va);
}
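
/*
 * htable_va2entry() and htable_e2va() are inverses of each other (modulo
 * the amd64 VA hole adjustment). For a level 0 htable on amd64,
 * entry == (va >> 12) & 511, so va == ht_vaddr + 0x3000 yields entry 3
 * and htable_e2va(ht, 3) maps back to the same address.
 */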
/*
 * The code uses compare and swap instructions to read/write PTE's to
 * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems.
 * On 64 bit systems normal loads/stores of PTEs will naturally be atomic.
 *
 * The combination of using kpreempt_disable()/_enable() and the hci_mutex
 * are used to ensure that an interrupt won't overwrite a temporary mapping
 * while it's in use. If an interrupt thread tries to access a PTE, it will
 * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
 */
void
x86pte_cpu_init(cpu_t *cpu)
{
    struct hat_cpu_info *hci;

    hci = kmem_zalloc(sizeof (*hci), KM_SLEEP);
    mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL);
    cpu->cpu_hat_info = hci;
}
void
x86pte_cpu_fini(cpu_t *cpu)
{
    struct hat_cpu_info *hci = cpu->cpu_hat_info;

    kmem_free(hci, sizeof (*hci));
    cpu->cpu_hat_info = NULL;
}
#ifdef __i386
/*
 * On 32 bit kernels, loading a 64 bit PTE is a little tricky
 */
static x86pte_t
get_pte64(x86pte_t *ptr)
{
    volatile uint32_t *p = (uint32_t *)ptr;
    x86pte_t t;

    ASSERT(mmu.pae_hat != 0);
    for (;;) {
        t = p[0];
        t |= (uint64_t)p[1] << 32;
        if ((t & 0xffffffff) == p[0])
            return (t);
    }
}
#endif /* __i386 */
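
/*
 * The retry loop above guards against a torn read: load the low word,
 * then the high word, then re-read the low word. If the two low-word
 * reads differ, another CPU (or the MMU setting REF/MOD) changed the PTE
 * between loads and the read is retried, so in the common case the
 * returned 64-bit value is a snapshot that actually existed in memory.
 */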
/*
 * Disable preemption and establish a mapping to the pagetable with the
 * given pfn. This is optimized for the case where it's the same
 * pfn as we last referenced from this CPU.
 */
static x86pte_t *
x86pte_access_pagetable(htable_t *ht, uint_t index)
{
    /*
     * VLP pagetables are contained in the hat_t
     */
    if (ht->ht_flags & HTABLE_VLP)
        return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
    return (x86pte_mapin(ht->ht_pfn, index, ht));
}
/*
 * map the given pfn into the page table window.
 */
/*ARGSUSED*/
x86pte_t *
x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
{
    x86pte_t *pteptr;
    x86pte_t pte = 0;
    x86pte_t newpte;
    int x;

    ASSERT(pfn != PFN_INVALID);

    if (!khat_running) {
        caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
        return (PT_INDEX_PTR(va, index));
    }

    /*
     * If kpm is available, use it.
     */
    if (kpm_vbase)
        return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));

    /*
     * Disable preemption and grab the CPU's hci_mutex
     */
    kpreempt_disable();
    ASSERT(CPU->cpu_hat_info != NULL);
    mutex_enter(&CPU->cpu_hat_info->hci_mutex);
    x = PWIN_TABLE(CPU->cpu_id);
    pteptr = (x86pte_t *)PWIN_PTE_VA(x);
#ifndef __xpv
    if (mmu.pae_hat)
        pte = *pteptr;
    else
        pte = *(x86pte32_t *)pteptr;
#endif

    newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;

    /*
     * For hardware we can use a writable mapping.
     */
#ifdef __xpv
    if (IN_XPV_PANIC())
#endif
        newpte |= PT_WRITABLE;

    if (!PTE_EQUIV(newpte, pte)) {
#ifdef __xpv
        if (!IN_XPV_PANIC()) {
            xen_map(newpte, PWIN_VA(x));
        } else
#endif
        {
            XPV_ALLOW_PAGETABLE_UPDATES();
            if (mmu.pae_hat)
                *pteptr = newpte;
            else
                *(x86pte32_t *)pteptr = newpte;
            XPV_DISALLOW_PAGETABLE_UPDATES();
            mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
        }
    }
    return (PT_INDEX_PTR(PWIN_VA(x), index));
}
/*
 * Release access to a page table.
 */
static void
x86pte_release_pagetable(htable_t *ht)
{
    /*
     * nothing to do for VLP htables
     */
    if (ht->ht_flags & HTABLE_VLP)
        return;

    x86pte_mapout();
}

void
x86pte_mapout(void)
{
    if (kpm_vbase != NULL || !khat_running)
        return;

    /*
     * Drop the CPU's hci_mutex and restore preemption.
     */
#ifdef __xpv
    if (!IN_XPV_PANIC()) {
        uintptr_t va;

        /*
         * We need to always clear the mapping in case a page
         * that was once a page table page is ballooned out.
         */
        va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id));
        (void) HYPERVISOR_update_va_mapping(va, 0,
            UVMF_INVLPG | UVMF_LOCAL);
    }
#endif
    mutex_exit(&CPU->cpu_hat_info->hci_mutex);
    kpreempt_enable();
}
/*
 * Atomic retrieval of a pagetable entry
 */
x86pte_t
x86pte_get(htable_t *ht, uint_t entry)
{
    x86pte_t    pte;
    x86pte_t    *ptep;

    /*
     * Be careful that loading PAE entries in 32 bit kernel is atomic.
     */
    ASSERT(entry < mmu.ptes_per_table);
    ptep = x86pte_access_pagetable(ht, entry);
    pte = GET_PTE(ptep);
    x86pte_release_pagetable(ht);
    return (pte);
}
/*
 * Atomic unconditional set of a page table entry, it returns the previous
 * value. For pre-existing mappings if the PFN changes, then we don't care
 * about the old pte's REF / MOD bits. If the PFN remains the same, we leave
 * the MOD/REF bits unchanged.
 *
 * If asked to overwrite a link to a lower page table with a large page
 * mapping, this routine returns the special value of LPAGE_ERROR. This
 * allows the upper HAT layers to retry with a smaller mapping size.
 */
x86pte_t
x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr)
{
    x86pte_t    old;
    x86pte_t    prev;
    x86pte_t    *ptep;
    x86pte_t    n;
    level_t     l = ht->ht_level;
    x86pte_t    pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR;
    uintptr_t   addr = htable_e2va(ht, entry);
    hat_t       *hat = ht->ht_hat;

    ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */
    ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
    if (ptr == NULL)
        ptep = x86pte_access_pagetable(ht, entry);
    else
        ptep = ptr;

    /*
     * Install the new PTE. If remapping the same PFN, then
     * copy existing REF/MOD bits to new mapping.
     */
    do {
        prev = GET_PTE(ptep);
        n = new;
        if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
            n |= prev & (PT_REF | PT_MOD);

        /*
         * Another thread may have installed this mapping already,
         * flush the local TLB and be done.
         */
        if (prev == n) {
            old = new;
#ifdef __xpv
            if (!IN_XPV_PANIC())
                xen_flush_va((caddr_t)addr);
            else
#endif
                mmu_tlbflush_entry((caddr_t)addr);
            goto done;
        }

        /*
         * Detect if we have a collision of installing a large
         * page mapping where there already is a lower page table.
         */
        if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
            old = LPAGE_ERROR;
            goto done;
        }

        XPV_ALLOW_PAGETABLE_UPDATES();
        old = CAS_PTE(ptep, prev, n);
        XPV_DISALLOW_PAGETABLE_UPDATES();
    } while (old != prev);

    /*
     * Do a TLB demap if needed, ie. the old pte was valid.
     *
     * Note that a stale TLB writeback to the PTE here either can't happen
     * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST
     * mappings, but they were created with REF and MOD already set, so
     * no stale writeback will happen.
     *
     * Segmap is the only place where remaps happen on the same pfn and for
     * that we want to preserve the stale REF/MOD bits.
     */
    if (old & PT_REF)
        hat_tlb_inval(hat, addr);

done:
    if (ptr == NULL)
        x86pte_release_pagetable(ht);
    return (old);
}
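
/*
 * Illustration of the REF/MOD preservation rule above: remapping the
 * same pfn, say from read-only to writable, keeps whatever REF/MOD bits
 * the hardware already set, since (prev & pfn_mask) == (new & pfn_mask)
 * causes them to be OR-ed into the new PTE. Mapping a different pfn
 * starts with clean REF/MOD bits instead.
 */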
/*
 * Atomic compare and swap of a page table entry. No TLB invalidates are done.
 * This is used for links between pagetables of different levels.
 * Note we always create these links with dirty/access set, so they should
 * never change.
 */
static x86pte_t
x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
{
    x86pte_t    pte;
    x86pte_t    *ptep;
#ifdef __xpv
    /*
     * We can't use writable pagetables for upper level tables, so fake it.
     */
    mmu_update_t t[2];
    int cnt = 1;
    int count;
    maddr_t ma;

    if (!IN_XPV_PANIC()) {
        ASSERT(!(ht->ht_flags & HTABLE_VLP)); /* no VLP yet */
        ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
        t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
        t[0].val = new;

#if defined(__amd64)
        /*
         * On the 64-bit hypervisor we need to maintain the user mode
         * top page table too.
         */
        if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
            ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
                ht->ht_hat->hat_user_ptable), entry));
            t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
            t[1].val = new;
            ++cnt;
        }
#endif /* __amd64 */

        if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
            panic("HYPERVISOR_mmu_update() failed");
        ASSERT(count == cnt);
        return (old);
    }
#endif
    ptep = x86pte_access_pagetable(ht, entry);
    XPV_ALLOW_PAGETABLE_UPDATES();
    pte = CAS_PTE(ptep, old, new);
    XPV_DISALLOW_PAGETABLE_UPDATES();
    x86pte_release_pagetable(ht);
    return (pte);
}
/*
 * Invalidate a page table entry as long as it currently maps something that
 * matches the value determined by expect.
 *
 * Also invalidates any TLB entries and returns the previous value of the PTE.
 */
x86pte_t
x86pte_inval(
    htable_t *ht,
    uint_t entry,
    x86pte_t expect,
    x86pte_t *pte_ptr)
{
    x86pte_t    *ptep;
    x86pte_t    oldpte;
    x86pte_t    found;

    ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
    ASSERT(ht->ht_level <= mmu.max_page_level);

    if (pte_ptr != NULL)
        ptep = pte_ptr;
    else
        ptep = x86pte_access_pagetable(ht, entry);

#if defined(__xpv)
    /*
     * If exit()ing just use HYPERVISOR_mmu_update(), as we can't be racing
     * with anything else.
     */
    if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) {
        int count;
        mmu_update_t t[1];
        maddr_t ma;

        oldpte = GET_PTE(ptep);
        if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
            goto done;
        ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
        t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
        t[0].val = 0;
        if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF))
            panic("HYPERVISOR_mmu_update() failed");
        goto done;
    }
#endif /* __xpv */

    /*
     * Note that the loop is needed to handle changes due to h/w updating
     * of PT_MOD/PT_REF.
     */
    do {
        oldpte = GET_PTE(ptep);
        if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
            goto done;
        XPV_ALLOW_PAGETABLE_UPDATES();
        found = CAS_PTE(ptep, oldpte, 0);
        XPV_DISALLOW_PAGETABLE_UPDATES();
    } while (found != oldpte);
    if (oldpte & (PT_REF | PT_MOD))
        hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));

done:
    if (pte_ptr == NULL)
        x86pte_release_pagetable(ht);
    return (oldpte);
}
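
/*
 * The CAS loop above matters because the MMU itself can set PT_REF or
 * PT_MOD in the entry between our GET_PTE() and CAS_PTE(). When that
 * happens the CAS fails and we re-read and retry, so the value finally
 * returned includes any REF/MOD bits set right up to the invalidation.
 */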
/*
 * Change a page table entry if it currently matches the value in expect.
 */
x86pte_t
x86pte_update(
    htable_t *ht,
    uint_t entry,
    x86pte_t expect,
    x86pte_t new)
{
    x86pte_t    *ptep;
    x86pte_t    found;

    ASSERT(new != 0);
    ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
    ASSERT(ht->ht_level <= mmu.max_page_level);

    ptep = x86pte_access_pagetable(ht, entry);
    XPV_ALLOW_PAGETABLE_UPDATES();
    found = CAS_PTE(ptep, expect, new);
    XPV_DISALLOW_PAGETABLE_UPDATES();
    if (found == expect) {
        hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));

        /*
         * When removing write permission *and* clearing the
         * MOD bit, check if a write happened via a stale
         * TLB entry before the TLB shootdown finished.
         *
         * If it did happen, simply re-enable write permission and
         * act like the original CAS failed.
         */
        if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
            (new & (PT_WRITABLE | PT_MOD)) == 0 &&
            (GET_PTE(ptep) & PT_MOD) != 0) {
            do {
                found = GET_PTE(ptep);
                XPV_ALLOW_PAGETABLE_UPDATES();
                (void) CAS_PTE(ptep, found, found | PT_WRITABLE);
                XPV_DISALLOW_PAGETABLE_UPDATES();
            } while ((found & PT_WRITABLE) == 0);
        }
    }
    x86pte_release_pagetable(ht);
    return (found);
}
#ifndef __xpv
/*
 * Copy page tables - this is just a little more complicated than the
 * previous routines. Note that it's also not atomic! It also is never
 * used for VLP pagetables.
 */
void
x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
{
    caddr_t src_va;
    caddr_t dst_va;
    size_t size;
    x86pte_t *pteptr;
    x86pte_t pte;

    ASSERT(khat_running);
    ASSERT(!(dest->ht_flags & HTABLE_VLP));
    ASSERT(!(src->ht_flags & HTABLE_VLP));
    ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
    ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));

    /*
     * Acquire access to the CPU pagetable windows for the dest and source.
     */
    dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
    if (kpm_vbase) {
        src_va = (caddr_t)
            PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
    } else {
        uint_t x = PWIN_SRC(CPU->cpu_id);

        /*
         * Finish defining the src pagetable mapping
         */
        src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
        pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
        pteptr = (x86pte_t *)PWIN_PTE_VA(x);
        if (mmu.pae_hat)
            *pteptr = pte;
        else
            *(x86pte32_t *)pteptr = pte;
        mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
    }

    /*
     * now do the copy
     */
    size = count << mmu.pte_size_shift;
    bcopy(src_va, dst_va, size);

    x86pte_release_pagetable(dest);
}
#else /* __xpv */

/*
 * The hypervisor only supports writable pagetables at level 0, so we have
 * to install these 1 by 1 the slow way.
 */
void
x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
{
    caddr_t src_va;
    x86pte_t pte;

    ASSERT(!IN_XPV_PANIC());
    src_va = (caddr_t)x86pte_access_pagetable(src, entry);
    while (count) {
        if (mmu.pae_hat)
            pte = *(x86pte_t *)src_va;
        else
            pte = *(x86pte32_t *)src_va;
        if (pte != 0) {
            set_pteval(pfn_to_pa(dest->ht_pfn), entry,
                dest->ht_level, pte);
#ifdef __amd64
            if (dest->ht_level == mmu.max_level &&
                htable_e2va(dest, entry) < HYPERVISOR_VIRT_END)
                set_pteval(
                    pfn_to_pa(dest->ht_hat->hat_user_ptable),
                    entry, dest->ht_level, pte);
#endif
        }
        --count;
        ++entry;
        src_va += mmu.pte_size;
    }
    x86pte_release_pagetable(src);
}
#endif /* __xpv */
/*
 * Zero page table entries - Note this doesn't use atomic stores!
 */
static void
x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
{
    caddr_t dst_va;
    size_t size;
#ifdef __xpv
    int x = 0;
    x86pte_t newpte;
#endif

    /*
     * Map in the page table to be zeroed.
     */
    ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
    ASSERT(!(dest->ht_flags & HTABLE_VLP));

    /*
     * On the hypervisor we don't use x86pte_access_pagetable() since
     * in this case the page is not pinned yet.
     */
#ifdef __xpv
    if (kpm_vbase == NULL) {
        kpreempt_disable();
        ASSERT(CPU->cpu_hat_info != NULL);
        mutex_enter(&CPU->cpu_hat_info->hci_mutex);
        x = PWIN_TABLE(CPU->cpu_id);
        newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
        xen_map(newpte, PWIN_VA(x));
        dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
    } else
#endif
        dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);

    size = count << mmu.pte_size_shift;
    ASSERT(size > BLOCKZEROALIGN);
#ifdef __i386
    if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
        bzero(dst_va, size);
    else
#endif
        block_zero_no_xmm(dst_va, size);

#ifdef __xpv
    if (kpm_vbase == NULL) {
        xen_map(0, PWIN_VA(x));
        mutex_exit(&CPU->cpu_hat_info->hci_mutex);
        kpreempt_enable();
    } else
#endif
        x86pte_release_pagetable(dest);
}
/*
 * Called to ensure that all pagetables are in the system dump
 */
void
hat_dump(void)
{
    hat_t *hat;
    uint_t h;
    htable_t *ht;

    /*
     * Dump all page tables
     */
    for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
        for (h = 0; h < hat->hat_num_hash; ++h) {
            for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
                if ((ht->ht_flags & HTABLE_VLP) == 0)
                    dump_page(ht->ht_pfn);
            }
        }
    }
}