/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014 by Delphix. All rights reserved.
 * Copyright 2015 Joyent, Inc.
 */
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/vmsystm.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/bootconf.h>
#include <sys/dumphdr.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>
#include <sys/panic.h>
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count);

kmem_cache_t *htable_cache;
/*
 * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
 * is used in order to facilitate testing of the htable_steal() code.
 * By resetting htable_reserve_amount to a lower value, we can force
 * stealing to occur. The reserve amount is a guess to get us through boot.
 */
#define	HTABLE_RESERVE_AMOUNT	(200)
uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
kmutex_t htable_reserve_mutex;
uint_t htable_reserve_cnt;
htable_t *htable_reserve_pool;
/*
 * Used to hand test htable_steal().
 */
ulong_t force_steal = 0;
ulong_t ptable_cnt = 0;
/*
 * This variable is so that we can tune this via /etc/system.
 * Any value works, but a power of two <= mmu.ptes_per_table is best.
 */
uint_t htable_steal_passes = 8;
/*
 * Mutexes used to protect access to the htable hash buckets.
 */
#define	NUM_HTABLE_MUTEX 128
kmutex_t htable_mutex[NUM_HTABLE_MUTEX];
#define	HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1))

#define	HTABLE_ENTER(h)	mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
#define	HTABLE_EXIT(h)	mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
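/*
 * Illustrative sketch (comment only, not part of the original code): a
 * typical hash bucket walk takes the bucket's mutex via the hashed value:
 *
 *	hashval = HTABLE_HASH(hat, va, level);
 *	HTABLE_ENTER(hashval);
 *	for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next)
 *		...check ht_hat/ht_vaddr/ht_level...
 *	HTABLE_EXIT(hashval);
 *
 * The "hashval", "va" and "level" names are placeholders for this sketch;
 * see htable_lookup() below for the real usage.
 */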
/*
 * forward declarations
 */
static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr);
static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr);
static void htable_free(htable_t *ht);
static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index);
static void x86pte_release_pagetable(htable_t *ht);
static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old,
	x86pte_t new);
/*
 * A counter to track if we are stealing or reaping htables. When non-zero,
 * htable_free() will directly free htables (either to the reserve or kmem)
 * instead of putting them in a hat's htable cache.
 */
uint32_t htable_dont_cache = 0;
/*
 * Track the number of active pagetables, so we can know how many to reap.
 */
static uint32_t active_ptables = 0;
/*
 * Deal with hypervisor complications.
 */
void
xen_flush_va(caddr_t va)
{
	struct mmuext_op t;
	uint_t count;

	if (IN_XPV_PANIC()) {
		mmu_tlbflush_entry((caddr_t)va);
	} else {
		t.cmd = MMUEXT_INVLPG_LOCAL;
		t.arg1.linear_addr = (uintptr_t)va;
		if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
			panic("HYPERVISOR_mmuext_op() failed");
		ASSERT(count == 1);
	}
}
void
xen_gflush_va(caddr_t va, cpuset_t cpus)
{
	struct mmuext_op t;
	uint_t count;

	if (IN_XPV_PANIC()) {
		mmu_tlbflush_entry((caddr_t)va);
		return;
	}

	t.cmd = MMUEXT_INVLPG_MULTI;
	t.arg1.linear_addr = (uintptr_t)va;
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(t.arg2.vcpumask, &cpus);
	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
		panic("HYPERVISOR_mmuext_op() failed");
	ASSERT(count == 1);
}
void
xen_flush_tlb()
{
	struct mmuext_op t;
	uint_t count;

	if (IN_XPV_PANIC()) {
		xpv_panic_reload_cr3();
	} else {
		t.cmd = MMUEXT_TLB_FLUSH_LOCAL;
		if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
			panic("HYPERVISOR_mmuext_op() failed");
		ASSERT(count == 1);
	}
}
void
xen_gflush_tlb(cpuset_t cpus)
{
	struct mmuext_op t;
	uint_t count;

	ASSERT(!IN_XPV_PANIC());
	t.cmd = MMUEXT_TLB_FLUSH_MULTI;
	/*LINTED: constant in conditional context*/
	set_xen_guest_handle(t.arg2.vcpumask, &cpus);
	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
		panic("HYPERVISOR_mmuext_op() failed");
	ASSERT(count == 1);
}
/*
 * Install/Adjust a kpm mapping under the hypervisor.
 * Value of "how" should be:
 *	PT_WRITABLE | PT_VALID	- regular kpm mapping
 *	PT_VALID		- make mapping read-only
 *	0			- remove mapping
 *
 * returns 0 on success. non-zero for failure.
 */
int
xen_kpm_page(pfn_t pfn, uint_t how)
{
	paddr_t pa = mmu_ptob((paddr_t)pfn);
	x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD;

	if (kpm_vbase == NULL)
		return (0);

	if (how)
		pte |= pa_to_ma(pa) | how;
	else
		pte = 0;
	return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa,
	    pte, UVMF_INVLPG | UVMF_ALL));
}
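/*
 * Illustrative sketch (comment only), mirroring the calls made later in this
 * file: a pagetable page is made read-only in kpm before it is handed to the
 * hypervisor, and writable again once it is reclaimed:
 *
 *	if (kpm_vbase && xen_kpm_page(pfn, PT_VALID) < 0)
 *		...failed to make the kpm mapping read-only...
 *	...
 *	if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
 *		panic("failure making kpm r/w pfn=0x%lx", pfn);
 *
 * See ptable_free() and htable_alloc() below for the real call sites.
 */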
void
xen_pin(pfn_t pfn, level_t lvl)
{
	struct mmuext_op t;
	uint_t count;

	t.cmd = MMUEXT_PIN_L1_TABLE + lvl;
	t.arg1.mfn = pfn_to_mfn(pfn);
	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
		panic("HYPERVISOR_mmuext_op() failed");
	ASSERT(count == 1);
}
void
xen_unpin(pfn_t pfn)
{
	struct mmuext_op t;
	uint_t count;

	t.cmd = MMUEXT_UNPIN_TABLE;
	t.arg1.mfn = pfn_to_mfn(pfn);
	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
		panic("HYPERVISOR_mmuext_op() failed");
	ASSERT(count == 1);
}
static void
xen_map(uint64_t pte, caddr_t va)
{
	if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte,
	    UVMF_INVLPG | UVMF_LOCAL))
		panic("HYPERVISOR_update_va_mapping() failed");
}
/*
 * Allocate a memory page for a hardware page table.
 *
 * A wrapper around page_get_physical(), with some extra checks.
 */
static pfn_t
ptable_alloc(uintptr_t seed)
{
	pfn_t pfn = PFN_INVALID;
	page_t *pp;

	/*
	 * The first check is to see if there is memory in the system. If we
	 * drop to throttlefree, then fail the ptable_alloc() and let the
	 * stealing code kick in. Note that we have to do this test here,
	 * since the test in page_create_throttle() would let the NOSLEEP
	 * allocation go through and deplete the page reserves.
	 *
	 * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
	 */
	if (!NOMEMWAIT() && freemem <= throttlefree + 1)
		return (PFN_INVALID);

	/*
	 * This code makes htable_steal() easier to test. By setting
	 * force_steal we force pagetable allocations to fall
	 * into the stealing code. Roughly 1 in every "force_steal"
	 * page table allocations will fail.
	 */
	if (proc_pageout != NULL && force_steal > 1 &&
	    ++ptable_cnt > force_steal) {
		ptable_cnt = 0;
		return (PFN_INVALID);
	}

	pp = page_get_physical(seed);
	if (pp == NULL)
		return (PFN_INVALID);
	ASSERT(PAGE_SHARED(pp));
	pfn = pp->p_pagenum;
	if (pfn == PFN_INVALID)
		panic("ptable_alloc(): Invalid PFN!!");
	atomic_inc_32(&active_ptables);
	HATSTAT_INC(hs_ptable_allocs);
	return (pfn);
}
/*
 * Free an htable's associated page table page. See the comments
 * for ptable_alloc().
 */
static void
ptable_free(pfn_t pfn)
{
	page_t *pp = page_numtopp_nolock(pfn);

	/*
	 * need to destroy the page used for the pagetable
	 */
	ASSERT(pfn != PFN_INVALID);
	HATSTAT_INC(hs_ptable_frees);
	atomic_dec_32(&active_ptables);
	if (pp == NULL)
		panic("ptable_free(): no page for pfn!");
	ASSERT(PAGE_SHARED(pp));
	ASSERT(pfn == pp->p_pagenum);
	ASSERT(!IN_XPV_PANIC());

	/*
	 * Get an exclusive lock, might have to wait for a kmem reader.
	 */
	if (!page_tryupgrade(pp)) {
		u_offset_t off = pp->p_offset;
		page_unlock(pp);
		pp = page_lookup(&kvp, off, SE_EXCL);
		if (pp == NULL)
			panic("page not found");
	}
	if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
		panic("failure making kpm r/w pfn=0x%lx", pfn);
	page_hashout(pp, NULL);
	page_free(pp, 1);
	page_unresv(1);
}
/*
 * Put one htable on the reserve list.
 */
static void
htable_put_reserve(htable_t *ht)
{
	ht->ht_hat = NULL;		/* no longer tied to a hat */
	ASSERT(ht->ht_pfn == PFN_INVALID);
	HATSTAT_INC(hs_htable_rputs);
	mutex_enter(&htable_reserve_mutex);
	ht->ht_next = htable_reserve_pool;
	htable_reserve_pool = ht;
	++htable_reserve_cnt;
	mutex_exit(&htable_reserve_mutex);
}
/*
 * Take one htable from the reserve.
 */
static htable_t *
htable_get_reserve(void)
{
	htable_t *ht = NULL;

	mutex_enter(&htable_reserve_mutex);
	if (htable_reserve_cnt != 0) {
		ht = htable_reserve_pool;
		ASSERT(ht != NULL);
		ASSERT(ht->ht_pfn == PFN_INVALID);
		htable_reserve_pool = ht->ht_next;
		--htable_reserve_cnt;
		HATSTAT_INC(hs_htable_rgets);
	}
	mutex_exit(&htable_reserve_mutex);
	return (ht);
}
/*
 * Allocate initial htables and put them on the reserve list
 */
void
htable_initial_reserve(uint_t count)
{
	htable_t *ht;

	count += HTABLE_RESERVE_AMOUNT;
	while (count > 0) {
		ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP);
		ASSERT(ht != NULL);

		ASSERT(use_boot_reserve);
		ht->ht_pfn = PFN_INVALID;
		htable_put_reserve(ht);
		--count;
	}
}
/*
 * Readjust the reserves after a thread finishes using them.
 */
void
htable_adjust_reserve()
{
	htable_t *ht;

	/*
	 * Free any excess htables in the reserve list
	 */
	while (htable_reserve_cnt > htable_reserve_amount &&
	    !USE_HAT_RESERVES()) {
		ht = htable_get_reserve();
		if (ht == NULL)
			return;
		ASSERT(ht->ht_pfn == PFN_INVALID);
		kmem_cache_free(htable_cache, ht);
	}
}
/*
 * Search the active htables for one to steal. Start at a different hash
 * bucket every time to help spread the pain of stealing
 */
static void
htable_steal_active(hat_t *hat, uint_t cnt, uint_t threshold,
    uint_t *stolen, htable_t **list)
{
	static uint_t h_seed = 0;
	htable_t *higher, *ht;
	uint_t h, e, h_start;
	uintptr_t va;
	x86pte_t pte;

	h = h_start = h_seed++ % hat->hat_num_hash;
	do {
		higher = NULL;
		HTABLE_ENTER(h);
		for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {

			/*
			 * Can we rule out reaping?
			 */
			if (ht->ht_busy != 0 ||
			    (ht->ht_flags & HTABLE_SHARED_PFN) ||
			    ht->ht_level > 0 || ht->ht_valid_cnt > threshold ||
			    ht->ht_lock_cnt != 0)
				continue;

			/*
			 * Increment busy so the htable can't disappear. We
			 * drop the htable mutex to avoid deadlocks with
			 * hat_pageunload() and the hment mutex while we
			 * call hat_pte_unmap()
			 */
			++ht->ht_busy;
			HTABLE_EXIT(h);

			/*
			 * Try stealing:
			 * - unload and invalidate all PTEs
			 */
			for (e = 0, va = ht->ht_vaddr;
			    e < HTABLE_NUM_PTES(ht) && ht->ht_valid_cnt > 0 &&
			    ht->ht_busy == 1 && ht->ht_lock_cnt == 0;
			    ++e, va += MMU_PAGESIZE) {
				pte = x86pte_get(ht, e);
				if (!PTE_ISVALID(pte))
					continue;
				hat_pte_unmap(ht, e, HAT_UNLOAD, pte, NULL,
				    B_TRUE);
			}

			/*
			 * Reacquire htable lock. If we didn't remove all
			 * mappings in the table, or another thread added a new
			 * mapping behind us, give up on this table.
			 */
			HTABLE_ENTER(h);
			if (ht->ht_busy != 1 || ht->ht_valid_cnt != 0 ||
			    ht->ht_lock_cnt != 0) {
				--ht->ht_busy;
				continue;
			}

			/*
			 * Steal it and unlink the page table.
			 */
			higher = ht->ht_parent;
			unlink_ptp(higher, ht, ht->ht_vaddr);

			/*
			 * remove from the hash list
			 */
			if (ht->ht_next)
				ht->ht_next->ht_prev = ht->ht_prev;

			if (ht->ht_prev) {
				ht->ht_prev->ht_next = ht->ht_next;
			} else {
				ASSERT(hat->hat_ht_hash[h] == ht);
				hat->hat_ht_hash[h] = ht->ht_next;
			}

			/*
			 * Break to outer loop to release the
			 * higher (ht_parent) pagetable. This
			 * spreads out the pain caused by
			 * pagefaults.
			 */
			ht->ht_next = *list;
			*list = ht;
			++*stolen;
			break;
		}
		HTABLE_EXIT(h);
		if (higher != NULL)
			htable_release(higher);
		if (++h == hat->hat_num_hash)
			h = 0;
	} while (*stolen < cnt && h != h_start);
}
/*
 * Move hat to the end of the kas list
 */
static void
move_victim(hat_t *hat)
{
	ASSERT(MUTEX_HELD(&hat_list_lock));

	/* unlink victim hat */
	if (hat->hat_prev)
		hat->hat_prev->hat_next = hat->hat_next;
	else
		kas.a_hat->hat_next = hat->hat_next;

	if (hat->hat_next)
		hat->hat_next->hat_prev = hat->hat_prev;
	else
		kas.a_hat->hat_prev = hat->hat_prev;
	/* relink at end of hat list */
	hat->hat_next = NULL;
	hat->hat_prev = kas.a_hat->hat_prev;
	if (hat->hat_prev)
		hat->hat_prev->hat_next = hat;
	else
		kas.a_hat->hat_next = hat;

	kas.a_hat->hat_prev = hat;
}
/*
 * This routine steals htables from user processes. Called by htable_reap
 * (reap=TRUE) or htable_alloc (reap=FALSE).
 */
static htable_t *
htable_steal(uint_t cnt, boolean_t reap)
{
	hat_t		*hat = kas.a_hat;	/* list starts with khat */
	htable_t	*list = NULL;
	htable_t	*ht;
	uint_t		stolen = 0;
	uint_t		pass, passes;
	uint_t		threshold;

	/*
	 * Limit htable_steal_passes to something reasonable
	 */
	if (htable_steal_passes == 0)
		htable_steal_passes = 1;
	if (htable_steal_passes > mmu.ptes_per_table)
		htable_steal_passes = mmu.ptes_per_table;

	/*
	 * If we're stealing merely as part of kmem reaping (versus stealing
	 * to assure forward progress), we don't want to actually steal any
	 * active htables.  (Stealing active htables merely to give memory
	 * back to the system can inadvertently kick off an htable crime wave
	 * as active processes repeatedly steal htables from one another,
	 * plummeting the system into a kind of HAT lawlessness that can
	 * become so violent as to impede the one thing that can end it:  the
	 * freeing of memory via ARC reclaim and other means.)  So if we're
	 * reaping, we limit ourselves to the first pass that steals cached
	 * htables that aren't in use -- which gives memory back, but averts
	 * the entire breakdown of social order.
	 */
	passes = reap ? 0 : htable_steal_passes;

	/*
	 * Loop through all user hats. The 1st pass takes cached htables that
	 * aren't in use. The later passes steal by removing mappings, too.
	 */
	atomic_inc_32(&htable_dont_cache);
	for (pass = 0; pass <= passes && stolen < cnt; ++pass) {
		threshold = pass * mmu.ptes_per_table / htable_steal_passes;

		mutex_enter(&hat_list_lock);

		/* skip the first hat (kernel) */
		hat = kas.a_hat->hat_next;
		for (;;) {
			/*
			 * Skip any hat that is already being stolen from.
			 *
			 * We skip SHARED hats, as these are dummy
			 * hats that host ISM shared page tables.
			 *
			 * We also skip if HAT_FREEING because hat_pte_unmap()
			 * won't zero out the PTE's. That would lead to hitting
			 * stale PTEs either here or under hat_unload() when we
			 * steal and unload the same page table in competing
			 * threads.
			 */
			while (hat != NULL &&
			    (hat->hat_flags &
			    (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
				hat = hat->hat_next;

			if (hat == NULL)
				break;

			/*
			 * Mark the HAT as a stealing victim so that it is
			 * not freed from under us, e.g. in as_free()
			 */
			hat->hat_flags |= HAT_VICTIM;
			mutex_exit(&hat_list_lock);

			/*
			 * Take any htables from the hat's cached "free" list.
			 */
			hat_enter(hat);
			while ((ht = hat->hat_ht_cached) != NULL &&
			    stolen < cnt) {
				hat->hat_ht_cached = ht->ht_next;
				ht->ht_next = list;
				list = ht;
				++stolen;
			}
			hat_exit(hat);

			/*
			 * Don't steal active htables on first pass.
			 */
			if (pass != 0 && (stolen < cnt))
				htable_steal_active(hat, cnt, threshold,
				    &stolen, &list);

			/*
			 * do synchronous teardown for the reap case so that
			 * we can forget hat; at this time, hat is
			 * guaranteed to be around because HAT_VICTIM is set
			 * (see htable_free() for similar code)
			 */
			for (ht = list; (ht) && (reap); ht = ht->ht_next) {
				if (ht->ht_hat == NULL)
					continue;
				ASSERT(ht->ht_hat == hat);
#if defined(__xpv) && defined(__amd64)
				if (!(ht->ht_flags & HTABLE_VLP) &&
				    ht->ht_level == mmu.max_level) {
					ptable_free(hat->hat_user_ptable);
					hat->hat_user_ptable = PFN_INVALID;
				}
#endif
				/*
				 * forget the hat
				 */
				ht->ht_hat = NULL;
			}

			mutex_enter(&hat_list_lock);

			/*
			 * Try to spread the pain of stealing,
			 * move victim HAT to the end of the HAT list.
			 */
			if (pass >= 1 && cnt == 1 &&
			    kas.a_hat->hat_prev != hat)
				move_victim(hat);

			/*
			 * Clear the victim flag, hat can go away now (once
			 * the lock is dropped)
			 */
			if (hat->hat_flags & HAT_VICTIM) {
				ASSERT(hat != kas.a_hat);
				hat->hat_flags &= ~HAT_VICTIM;
				cv_broadcast(&hat_list_cv);
			}

			/* move on to the next hat */
			hat = hat->hat_next;
		}
		mutex_exit(&hat_list_lock);
	}
	ASSERT(!MUTEX_HELD(&hat_list_lock));

	atomic_dec_32(&htable_dont_cache);
	return (list);
}
/*
 * This is invoked from kmem when the system is low on memory.  We try
 * to free hments, htables, and ptables to improve the memory situation.
 */
/*ARGSUSED*/
static void
htable_reap(void *handle)
{
	uint_t		reap_cnt;
	htable_t	*list;
	htable_t	*ht;

	HATSTAT_INC(hs_reap_attempts);
	if (!can_steal_post_boot)
		return;

	/*
	 * Try to reap 5% of the page tables bounded by a maximum of
	 * 5% of physmem and a minimum of 10.
	 */
	reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10);

	/*
	 * Note: htable_dont_cache should be set at the time of
	 * invoking htable_free()
	 */
	atomic_inc_32(&htable_dont_cache);

	/*
	 * Let htable_steal() do the work, we just call htable_free()
	 */
	XPV_DISALLOW_MIGRATE();
	list = htable_steal(reap_cnt, B_TRUE);
	XPV_ALLOW_MIGRATE();
	while ((ht = list) != NULL) {
		list = ht->ht_next;
		HATSTAT_INC(hs_reaped);
		htable_free(ht);
	}
	atomic_dec_32(&htable_dont_cache);

	/*
	 * Free up excess reserves
	 */
	htable_adjust_reserve();
	hment_adjust_reserve();
}
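/*
 * Worked example of the reap target above (illustrative numbers only): on a
 * machine with physmem of 1048576 pages (4GB with 4K pages) and 2000 active
 * pagetables,
 *
 *	reap_cnt = MAX(MIN(1048576 / 20, 2000 / 20), 10)
 *	         = MAX(MIN(52428, 100), 10) = 100
 *
 * i.e. the reap is normally bounded by 5% of the active pagetables, and the
 * physmem/20 term only matters when pagetables dominate memory use.
 */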
/*
 * Allocate an htable, stealing one or using the reserve if necessary
 */
static htable_t *
htable_alloc(
	hat_t		*hat,
	uintptr_t	vaddr,
	level_t		level,
	htable_t	*shared)
{
	htable_t	*ht = NULL;
	uint_t		is_vlp;
	uint_t		is_bare = 0;
	uint_t		need_to_zero = 1;
	int		kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);

	if (level < 0 || level > TOP_LEVEL(hat))
		panic("htable_alloc(): level %d out of range\n", level);

	is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
	if (is_vlp || shared != NULL)
		is_bare = 1;

	/*
	 * First reuse a cached htable from the hat_ht_cached field, this
	 * avoids unnecessary trips through kmem/page allocators.
	 */
	if (hat->hat_ht_cached != NULL && !is_bare) {
		hat_enter(hat);
		ht = hat->hat_ht_cached;
		if (ht != NULL) {
			hat->hat_ht_cached = ht->ht_next;
			need_to_zero = 0;
			/* XX64 ASSERT() they're all zero somehow */
			ASSERT(ht->ht_pfn != PFN_INVALID);
		}
		hat_exit(hat);
	}

	if (ht == NULL) {
		/*
		 * Allocate an htable, possibly refilling the reserves.
		 */
		if (USE_HAT_RESERVES()) {
			ht = htable_get_reserve();
		} else {
			/*
			 * Donate successful htable allocations to the reserve.
			 */
			for (;;) {
				ht = kmem_cache_alloc(htable_cache, kmflags);
				if (ht == NULL)
					break;
				ht->ht_pfn = PFN_INVALID;
				if (USE_HAT_RESERVES() ||
				    htable_reserve_cnt >= htable_reserve_amount)
					break;
				htable_put_reserve(ht);
			}
		}

		/*
		 * allocate a page for the hardware page table if needed
		 */
		if (ht != NULL && !is_bare) {
			ht->ht_hat = hat;
			ht->ht_pfn = ptable_alloc((uintptr_t)ht);
			if (ht->ht_pfn == PFN_INVALID) {
				if (USE_HAT_RESERVES())
					htable_put_reserve(ht);
				else
					kmem_cache_free(htable_cache, ht);
				ht = NULL;
			}
		}
	}

	/*
	 * If allocations failed, kick off a kmem_reap() and resort to
	 * htable steal(). We may spin here if the system is very low on
	 * memory. If the kernel itself has consumed all memory and kmem_reap()
	 * can't free up anything, then we'll really get stuck here.
	 * That should only happen in a system where the administrator has
	 * misconfigured VM parameters via /etc/system.
	 */
	while (ht == NULL && can_steal_post_boot) {
		kmem_reap();
		ht = htable_steal(1, B_FALSE);
		HATSTAT_INC(hs_steals);

		/*
		 * If we stole for a bare htable, release the pagetable page.
		 */
		if (ht != NULL) {
			if (is_bare) {
				ptable_free(ht->ht_pfn);
				ht->ht_pfn = PFN_INVALID;
#if defined(__xpv) && defined(__amd64)
			/*
			 * make stolen page table writable again in kpm
			 */
			} else if (kpm_vbase && xen_kpm_page(ht->ht_pfn,
			    PT_VALID | PT_WRITABLE) < 0) {
				panic("failure making kpm r/w pfn=0x%lx",
				    ht->ht_pfn);
#endif
			}
		}
	}

	/*
	 * All attempts to allocate or steal failed. This should only happen
	 * if we run out of memory during boot, due perhaps to a huge
	 * boot_archive. At this point there's no way to continue.
	 */
	if (ht == NULL)
		panic("htable_alloc(): couldn't steal\n");

#if defined(__amd64) && defined(__xpv)
	/*
	 * Under the 64-bit hypervisor, we have 2 top level page tables.
	 * If this allocation fails, we'll resort to stealing.
	 * We use the stolen page indirectly, by freeing the
	 * stolen htable first.
	 */
	if (level == mmu.max_level) {
		for (;;) {
			htable_t *stolen;

			hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1);
			if (hat->hat_user_ptable != PFN_INVALID)
				break;
			stolen = htable_steal(1, B_FALSE);
			if (stolen == NULL)
				panic("2nd steal ptable failed\n");
			htable_free(stolen);
		}
		block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable),
		    MMU_PAGESIZE);
	}
#endif

	/*
	 * Shared page tables have all entries locked and entries may not
	 * be added or deleted.
	 */
	ht->ht_flags = 0;
	if (shared != NULL) {
		ASSERT(shared->ht_valid_cnt > 0);
		ht->ht_flags |= HTABLE_SHARED_PFN;
		ht->ht_pfn = shared->ht_pfn;
		ht->ht_lock_cnt = 0;
		ht->ht_valid_cnt = 0;		/* updated in hat_share() */
		ht->ht_shares = shared;
		need_to_zero = 0;
	} else {
		ht->ht_shares = NULL;
		ht->ht_lock_cnt = 0;
		ht->ht_valid_cnt = 0;
	}

	/*
	 * setup flags, etc. for VLP htables
	 */
	if (is_vlp) {
		ht->ht_flags |= HTABLE_VLP;
		ASSERT(ht->ht_pfn == PFN_INVALID);
		need_to_zero = 0;
	}

	/*
	 * fill in the htable
	 */
	ht->ht_hat = hat;
	ht->ht_parent = NULL;
	ht->ht_vaddr = vaddr;
	ht->ht_level = level;
	ht->ht_busy = 1;
	ht->ht_next = NULL;
	ht->ht_prev = NULL;

	/*
	 * Zero out any freshly allocated page table
	 */
	if (need_to_zero)
		x86pte_zero(ht, 0, mmu.ptes_per_table);

#if defined(__amd64) && defined(__xpv)
	if (!is_bare && kpm_vbase) {
		(void) xen_kpm_page(ht->ht_pfn, PT_VALID);
		if (level == mmu.max_level)
			(void) xen_kpm_page(hat->hat_user_ptable, PT_VALID);
	}
#endif

	return (ht);
}
/*
 * Free up an htable, either to a hat's cached list, the reserves or
 * back to kmem.
 */
static void
htable_free(htable_t *ht)
{
	hat_t *hat = ht->ht_hat;

	/*
	 * If the process isn't exiting, cache the free htable in the hat
	 * structure. We always do this for the boot time reserve. We don't
	 * do this if the hat is exiting or we are stealing/reaping htables.
	 */
	if (hat != NULL &&
	    !(ht->ht_flags & HTABLE_SHARED_PFN) &&
	    (use_boot_reserve ||
	    (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
		ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
		ASSERT(ht->ht_pfn != PFN_INVALID);
		hat_enter(hat);
		ht->ht_next = hat->hat_ht_cached;
		hat->hat_ht_cached = ht;
		hat_exit(hat);
		return;
	}

	/*
	 * If we have a hardware page table, free it.
	 * We don't free page tables that are accessed by sharing.
	 */
	if (ht->ht_flags & HTABLE_SHARED_PFN) {
		ASSERT(ht->ht_pfn != PFN_INVALID);
	} else if (!(ht->ht_flags & HTABLE_VLP)) {
		ptable_free(ht->ht_pfn);
#if defined(__amd64) && defined(__xpv)
		if (ht->ht_level == mmu.max_level && hat != NULL) {
			ptable_free(hat->hat_user_ptable);
			hat->hat_user_ptable = PFN_INVALID;
		}
#endif
	}
	ht->ht_pfn = PFN_INVALID;

	/*
	 * Free it or put into reserves.
	 */
	if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
		htable_put_reserve(ht);
	} else {
		kmem_cache_free(htable_cache, ht);
		htable_adjust_reserve();
	}
}
/*
 * This is called when a hat is being destroyed or swapped out. We reap all
 * the remaining htables in the hat cache. If destroying, all left over
 * htables are also destroyed.
 *
 * We also don't need to invalidate any of the PTPs nor do any demapping.
 */
void
htable_purge_hat(hat_t *hat)
{
	htable_t	*ht;
	int		h;

	/*
	 * Purge the htable cache if just reaping.
	 */
	if (!(hat->hat_flags & HAT_FREEING)) {
		atomic_inc_32(&htable_dont_cache);
		for (;;) {
			hat_enter(hat);
			ht = hat->hat_ht_cached;
			if (ht == NULL) {
				hat_exit(hat);
				break;
			}
			hat->hat_ht_cached = ht->ht_next;
			hat_exit(hat);
			htable_free(ht);
		}
		atomic_dec_32(&htable_dont_cache);
		return;
	}

	/*
	 * if freeing, no locking is needed
	 */
	while ((ht = hat->hat_ht_cached) != NULL) {
		hat->hat_ht_cached = ht->ht_next;
		htable_free(ht);
	}

	/*
	 * walk thru the htable hash table and free all the htables in it.
	 */
	for (h = 0; h < hat->hat_num_hash; ++h) {
		while ((ht = hat->hat_ht_hash[h]) != NULL) {
			if (ht->ht_next)
				ht->ht_next->ht_prev = ht->ht_prev;

			if (ht->ht_prev) {
				ht->ht_prev->ht_next = ht->ht_next;
			} else {
				ASSERT(hat->hat_ht_hash[h] == ht);
				hat->hat_ht_hash[h] = ht->ht_next;
			}
			htable_free(ht);
		}
	}
}
/*
 * Unlink an entry for a table at vaddr and level out of the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
{
	uint_t entry = htable_va2entry(vaddr, higher);
	x86pte_t expect = MAKEPTP(old->ht_pfn, old->ht_level);
	x86pte_t found;
	hat_t *hat = old->ht_hat;

	ASSERT(higher->ht_busy > 0);
	ASSERT(higher->ht_valid_cnt > 0);
	ASSERT(old->ht_valid_cnt == 0);
	found = x86pte_cas(higher, entry, expect, 0);
#ifdef __xpv
	/*
	 * This is weird, but Xen apparently automatically unlinks empty
	 * pagetables from the upper page table. So allow PTP to be 0 already.
	 */
	if (found != expect && found != 0)
#else
	if (found != expect)
#endif
		panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
		    found, expect);

	/*
	 * When a top level VLP page table entry changes, we must issue
	 * a reload of cr3 on all processors.
	 *
	 * If we don't need to do that, then we still have to INVLPG against
	 * an address covered by the inner page table, as the latest processors
	 * have TLB-like caches for non-leaf page table entries.
	 */
	if (!(hat->hat_flags & HAT_FREEING)) {
		hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
		    DEMAP_ALL_ADDR : old->ht_vaddr);
	}

	HTABLE_DEC(higher->ht_valid_cnt);
}
/*
 * Link an entry for a new table at vaddr and level into the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
{
	uint_t entry = htable_va2entry(vaddr, higher);
	x86pte_t newptp = MAKEPTP(new->ht_pfn, new->ht_level);
	x86pte_t found;

	ASSERT(higher->ht_busy > 0);

	ASSERT(new->ht_level != mmu.max_level);

	HTABLE_INC(higher->ht_valid_cnt);

	found = x86pte_cas(higher, entry, 0, newptp);
	if ((found & ~PT_REF) != 0)
		panic("HAT: ptp not 0, found=" FMT_PTE, found);

	/*
	 * When any top level VLP page table entry changes, we must issue
	 * a reload of cr3 on all processors using it.
	 * We also need to do this for the kernel hat on PAE 32 bit kernel.
	 */
	if (
#ifdef __i386
	    (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
#endif
	    (higher->ht_flags & HTABLE_VLP))
		hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
}
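/*
 * Illustrative sketch (comment only) of the PTP link/unlink protocol used by
 * link_ptp()/unlink_ptp() above: the parent entry is changed with a compare
 * and swap so a racing update is always detected, and a TLB invalidate
 * follows whenever a previously visible entry went away.
 *
 *	newptp = MAKEPTP(lower->ht_pfn, lower->ht_level);
 *	if (x86pte_cas(parent, entry, 0, newptp) != 0)
 *		...someone else linked a table here first...
 *
 * "lower", "parent" and "entry" are placeholder names for this sketch.
 */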
/*
 * Release of hold on an htable. If this is the last use and the pagetable
 * is empty we may want to free it, then recursively look at the pagetable
 * above it. The recursion is handled by the outer while() loop.
 *
 * On the metal, during process exit, we don't bother unlinking the tables from
 * upper level pagetables. They are instead handled in bulk by hat_free_end().
 * We can't do this on the hypervisor as we need the page table to be
 * implicitly unpinned before it goes to the free page lists. This can't
 * happen unless we fully unlink it from the page table hierarchy.
 */
void
htable_release(htable_t *ht)
{
	uint_t		hashval;
	htable_t	*shared;
	htable_t	*higher;
	hat_t		*hat;
	uintptr_t	va;
	level_t		level;

	while (ht != NULL) {
		shared = NULL;
		for (;;) {
			hat = ht->ht_hat;
			va = ht->ht_vaddr;
			level = ht->ht_level;
			hashval = HTABLE_HASH(hat, va, level);

			/*
			 * The common case is that this isn't the last use of
			 * an htable so we don't want to free the htable.
			 */
			HTABLE_ENTER(hashval);
			ASSERT(ht->ht_valid_cnt >= 0);
			ASSERT(ht->ht_busy > 0);
			if (ht->ht_valid_cnt > 0)
				break;
			if (ht->ht_busy > 1)
				break;
			ASSERT(ht->ht_lock_cnt == 0);

			/*
			 * we always release empty shared htables
			 */
			if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {

				/*
				 * don't release if in address space tear down
				 */
				if (hat->hat_flags & HAT_FREEING)
					break;

				/*
				 * At and above max_page_level, free if it's for
				 * a boot-time kernel mapping below kernelbase.
				 */
				if (level >= mmu.max_page_level &&
				    (hat != kas.a_hat || va >= kernelbase))
					break;
			}

			/*
			 * Remember if we destroy an htable that shares its PFN
			 * from elsewhere.
			 */
			if (ht->ht_flags & HTABLE_SHARED_PFN) {
				ASSERT(shared == NULL);
				shared = ht->ht_shares;
				HATSTAT_INC(hs_htable_unshared);
			}

			/*
			 * Handle release of a table and freeing the htable_t.
			 * Unlink it from the table higher (ie. ht_parent).
			 */
			higher = ht->ht_parent;
			ASSERT(higher != NULL);

			/*
			 * Unlink the pagetable.
			 */
			unlink_ptp(higher, ht, va);

			/*
			 * remove this htable from its hash list
			 */
			if (ht->ht_next)
				ht->ht_next->ht_prev = ht->ht_prev;

			if (ht->ht_prev) {
				ht->ht_prev->ht_next = ht->ht_next;
			} else {
				ASSERT(hat->hat_ht_hash[hashval] == ht);
				hat->hat_ht_hash[hashval] = ht->ht_next;
			}
			HTABLE_EXIT(hashval);
			htable_free(ht);
			ht = higher;
		}

		ASSERT(ht->ht_busy >= 1);
		--ht->ht_busy;
		HTABLE_EXIT(hashval);

		/*
		 * If we released a shared htable, do a release on the htable
		 * from which it shared
		 */
		ht = shared;
	}
}
/*
 * Find the htable for the pagetable at the given level for the given address.
 * If found acquires a hold that eventually needs to be htable_release()d
 */
htable_t *
htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
{
	uintptr_t	base;
	uint_t		hashval;
	htable_t	*ht = NULL;

	ASSERT(level >= 0);
	ASSERT(level <= TOP_LEVEL(hat));

	if (level == TOP_LEVEL(hat)) {
#if defined(__amd64)
		/*
		 * 32 bit address spaces on 64 bit kernels need to check
		 * for overflow of the 32 bit address space
		 */
		if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
			return (NULL);
#endif
		base = 0;
	} else {
		base = vaddr & LEVEL_MASK(level + 1);
	}

	hashval = HTABLE_HASH(hat, base, level);
	HTABLE_ENTER(hashval);
	for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
		if (ht->ht_hat == hat &&
		    ht->ht_vaddr == base &&
		    ht->ht_level == level)
			break;
	}
	if (ht)
		++ht->ht_busy;

	HTABLE_EXIT(hashval);
	return (ht);
}
/*
 * Acquires a hold on a known htable (from a locked hment entry).
 */
void
htable_acquire(htable_t *ht)
{
	hat_t		*hat = ht->ht_hat;
	level_t		level = ht->ht_level;
	uintptr_t	base = ht->ht_vaddr;
	uint_t		hashval = HTABLE_HASH(hat, base, level);

	HTABLE_ENTER(hashval);
#ifdef DEBUG
	/*
	 * make sure the htable is there
	 */
	{
		htable_t	*h;

		for (h = hat->hat_ht_hash[hashval];
		    h && h != ht;
		    h = h->ht_next)
			;
		ASSERT(h == ht);
	}
#endif /* DEBUG */
	++ht->ht_busy;
	HTABLE_EXIT(hashval);
}
/*
 * Find the htable for the pagetable at the given level for the given address.
 * If found acquires a hold that eventually needs to be htable_release()d
 * If not found the table is created.
 *
 * Since we can't hold a hash table mutex during allocation, we have to
 * drop it and redo the search on a create. Then we may have to free the newly
 * allocated htable if another thread raced in and created it ahead of us.
 */
htable_t *
htable_create(
	hat_t		*hat,
	uintptr_t	vaddr,
	level_t		level,
	htable_t	*shared)
{
	uint_t		h;
	level_t		l;
	uintptr_t	base;
	htable_t	*ht;
	htable_t	*higher = NULL;
	htable_t	*new = NULL;

	if (level < 0 || level > TOP_LEVEL(hat))
		panic("htable_create(): level %d out of range\n", level);

	/*
	 * Create the page tables in top down order.
	 */
	for (l = TOP_LEVEL(hat); l >= level; --l) {
		new = NULL;
		if (l == TOP_LEVEL(hat))
			base = 0;
		else
			base = vaddr & LEVEL_MASK(l + 1);

		h = HTABLE_HASH(hat, base, l);
try_again:
		/*
		 * look up the htable at this level
		 */
		HTABLE_ENTER(h);
		if (l == TOP_LEVEL(hat)) {
			ht = hat->hat_htable;
		} else {
			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
				ASSERT(ht->ht_hat == hat);
				if (ht->ht_vaddr == base &&
				    ht->ht_level == l)
					break;
			}
		}

		/*
		 * if we found the htable, increment its busy cnt
		 * and if we had allocated a new htable, free it.
		 */
		if (ht != NULL) {
			/*
			 * If we find a pre-existing shared table, it must
			 * share from the same place.
			 */
			if (l == level && shared && ht->ht_shares &&
			    ht->ht_shares != shared) {
				panic("htable shared from wrong place "
				    "found htable=%p shared=%p",
				    (void *)ht, (void *)shared);
			}
			++ht->ht_busy;
			HTABLE_EXIT(h);
			if (new)
				htable_free(new);
			if (higher != NULL)
				htable_release(higher);
			higher = ht;

		/*
		 * if we didn't find it on the first search
		 * allocate a new one and search again
		 */
		} else if (new == NULL) {
			HTABLE_EXIT(h);
			new = htable_alloc(hat, base, l,
			    l == level ? shared : NULL);
			goto try_again;

		/*
		 * 2nd search and still not there, use "new" table
		 * Link new table into higher, when not at top level.
		 */
		} else {
			ht = new;
			if (higher != NULL) {
				link_ptp(higher, ht, base);
				ht->ht_parent = higher;
			}
			ht->ht_next = hat->hat_ht_hash[h];
			ASSERT(ht->ht_prev == NULL);
			if (hat->hat_ht_hash[h])
				hat->hat_ht_hash[h]->ht_prev = ht;
			hat->hat_ht_hash[h] = ht;
			HTABLE_EXIT(h);

			/*
			 * Note we don't do htable_release(higher).
			 * That happens recursively when "new" is removed by
			 * htable_release() or htable_steal().
			 */
			higher = ht;

			/*
			 * If we just created a new shared page table we
			 * increment the shared htable's busy count, so that
			 * it can't be the victim of a steal even if it's empty.
			 */
			if (l == level && shared) {
				(void) htable_lookup(shared->ht_hat,
				    shared->ht_vaddr, shared->ht_level);
				HATSTAT_INC(hs_htable_shared);
			}
		}
	}
	return (ht);
}
/*
 * Inherit initial pagetables from the boot program. On the 64-bit
 * hypervisor we also temporarily mark the p_index field of page table
 * pages, so we know not to try making them writable in seg_kpm.
 */
void
htable_attach(
	hat_t *hat,
	uintptr_t base,
	level_t level,
	htable_t *parent,
	pfn_t pfn)
{
	htable_t	*ht;
	uint_t		h;
	uint_t		i;
	x86pte_t	pte;
	x86pte_t	*ptep;
	page_t		*pp;
	extern page_t	*boot_claim_page(pfn_t);

	ht = htable_get_reserve();
	if (level == mmu.max_level)
		kas.a_hat->hat_htable = ht;
	ht->ht_hat = hat;
	ht->ht_parent = parent;
	ht->ht_vaddr = base;
	ht->ht_level = level;
	ht->ht_busy = 1;
	ht->ht_next = NULL;
	ht->ht_prev = NULL;
	ht->ht_flags = 0;
	ht->ht_pfn = pfn;
	ht->ht_lock_cnt = 0;
	ht->ht_valid_cnt = 0;
	if (parent != NULL)
		++parent->ht_busy;

	h = HTABLE_HASH(hat, base, level);
	HTABLE_ENTER(h);
	ht->ht_next = hat->hat_ht_hash[h];
	ASSERT(ht->ht_prev == NULL);
	if (hat->hat_ht_hash[h])
		hat->hat_ht_hash[h]->ht_prev = ht;
	hat->hat_ht_hash[h] = ht;
	HTABLE_EXIT(h);

	/*
	 * make sure the page table physical page is not FREE
	 */
	if (page_resv(1, KM_NOSLEEP) == 0)
		panic("page_resv() failed in ptable alloc");

	pp = boot_claim_page(pfn);
	ASSERT(pp != NULL);

	/*
	 * Page table pages that were allocated by dboot or
	 * in very early startup didn't go through boot_mapin()
	 * and so won't have vnode/offsets. Fix that here.
	 */
	if (pp->p_vnode == NULL) {
		/* match offset calculation in page_get_physical() */
		u_offset_t offset = (uintptr_t)ht;
		if (offset > kernelbase)
			offset -= kernelbase;
		offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
		offset += mmu.hole_start;	/* something in VA hole */
#else
		offset += 1ULL << 40;		/* something > 4 Gig */
#endif
		ASSERT(page_exists(&kvp, offset) == NULL);
		(void) page_hashin(pp, &kvp, offset, NULL);
	}
	page_downgrade(pp);
#if defined(__xpv) && defined(__amd64)
	/*
	 * Record in the page_t that is a pagetable for segkpm setup.
	 */
	if (kpm_vbase)
		pp->p_index = 1;
#endif

	/*
	 * Count valid mappings and recursively attach lower level pagetables.
	 */
	ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
	for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) {
		if (mmu.pae_hat)
			pte = ptep[i];
		else
			pte = ((x86pte32_t *)ptep)[i];
		if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) {
			++ht->ht_valid_cnt;
			if (!PTE_ISPAGE(pte, level)) {
				htable_attach(hat, base, level - 1,
				    ht, PTE2PFN(pte, level));
				ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
			}
		}
		base += LEVEL_SIZE(level);
		if (base == mmu.hole_start)
			base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK;
	}

	/*
	 * As long as all the mappings we had were below kernel base
	 * we can release the htable.
	 */
	if (base < kernelbase)
		htable_release(ht);
}
/*
 * Walk through a given htable looking for the first valid entry.  This
 * routine takes both a starting and ending address.  The starting address
 * is required to be within the htable provided by the caller, but there is
 * no such restriction on the ending address.
 *
 * If the routine finds a valid entry in the htable (at or beyond the
 * starting address), the PTE (and its address) will be returned.
 * This PTE may correspond to either a page or a pagetable - it is the
 * caller's responsibility to determine which.  If no valid entry is
 * found, 0 (and invalid PTE) and the next unexamined address will be
 * returned.
 *
 * The loop has been carefully coded for optimization.
 */
static x86pte_t
htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr)
{
	uint_t e;
	x86pte_t found_pte = (x86pte_t)0;
	caddr_t pte_ptr;
	caddr_t end_pte_ptr;
	int l = ht->ht_level;
	uintptr_t va = *vap & LEVEL_MASK(l);
	size_t pgsize = LEVEL_SIZE(l);

	ASSERT(va >= ht->ht_vaddr);
	ASSERT(va <= HTABLE_LAST_PAGE(ht));

	/*
	 * Compute the starting index and ending virtual address
	 */
	e = htable_va2entry(va, ht);

	/*
	 * The following page table scan code knows that the valid
	 * bit of a PTE is in the lowest byte AND that x86 is little endian!!
	 */
	pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0);
	end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht));
	pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e);
	while (!PTE_ISVALID(*pte_ptr)) {
		va += pgsize;
		if (va >= eaddr)
			break;
		pte_ptr += mmu.pte_size;
		ASSERT(pte_ptr <= end_pte_ptr);
		if (pte_ptr == end_pte_ptr)
			break;
	}

	/*
	 * if we found a valid PTE, load the entire PTE
	 */
	if (va < eaddr && pte_ptr != end_pte_ptr)
		found_pte = GET_PTE((x86pte_t *)pte_ptr);
	x86pte_release_pagetable(ht);

#if defined(__amd64)
	/*
	 * deal with VA hole on amd64
	 */
	if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end)
		va = mmu.hole_end + va - mmu.hole_start;
#endif /* __amd64 */

	*vap = va;
	return (found_pte);
}
/*
 * Find the address and htable for the first populated translation at or
 * above the given virtual address.  The caller may also specify an upper
 * limit to the address range to search.  Uses level information to quickly
 * skip unpopulated sections of virtual address spaces.
 *
 * If not found returns NULL. When found, returns the htable and virt addr
 * and has a hold on the htable.
 */
x86pte_t
htable_walk(
	struct hat *hat,
	htable_t **htp,
	uintptr_t *vaddr,
	uintptr_t eaddr)
{
	uintptr_t va = *vaddr;
	htable_t *ht;
	htable_t *prev = *htp;
	level_t l;
	level_t max_mapped_level;
	x86pte_t pte;

	ASSERT(eaddr > va);

	/*
	 * If this is a user address, then we know we need not look beyond
	 * kernelbase.
	 */
	ASSERT(hat == kas.a_hat || eaddr <= kernelbase ||
	    eaddr == HTABLE_WALK_TO_END);
	if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END)
		eaddr = kernelbase;

	/*
	 * If we're coming in with a previous page table, search it first
	 * without doing an htable_lookup(), this should be frequent.
	 */
	if (prev) {
		ASSERT(prev->ht_busy > 0);
		ASSERT(prev->ht_vaddr <= va);
		l = prev->ht_level;
		if (va <= HTABLE_LAST_PAGE(prev)) {
			pte = htable_scan(prev, &va, eaddr);

			if (PTE_ISPAGE(pte, l)) {
				*vaddr = va;
				*htp = prev;
				return (pte);
			}
		}

		/*
		 * We found nothing in the htable provided by the caller,
		 * so fall through and do the full search
		 */
		htable_release(prev);
	}

	/*
	 * Find the level of the largest pagesize used by this HAT.
	 */
	if (hat->hat_ism_pgcnt > 0) {
		max_mapped_level = mmu.umax_page_level;
	} else {
		max_mapped_level = 0;
		for (l = 1; l <= mmu.max_page_level; ++l)
			if (hat->hat_pages_mapped[l] != 0)
				max_mapped_level = l;
	}

	while (va < eaddr && va >= *vaddr) {
		/*
		 *  Find lowest table with any entry for given address.
		 */
		for (l = 0; l <= TOP_LEVEL(hat); ++l) {
			ht = htable_lookup(hat, va, l);
			if (ht != NULL) {
				pte = htable_scan(ht, &va, eaddr);
				if (PTE_ISPAGE(pte, l)) {
					VERIFY(!IN_VA_HOLE(va));
					*vaddr = va;
					*htp = ht;
					return (pte);
				}
				htable_release(ht);
				break;
			}

			/*
			 * No htable at this level for the address. If there
			 * is no larger page size that could cover it, we can
			 * skip right to the start of the next page table.
			 */
			ASSERT(l < TOP_LEVEL(hat));
			if (l >= max_mapped_level) {
				va = NEXT_ENTRY_VA(va, l + 1);
				if (va >= eaddr)
					break;
			}
		}
	}

	*vaddr = 0;
	*htp = NULL;
	return (0);
}
/*
 * Find the htable and page table entry index of the given virtual address
 * with pagesize at or below given level.
 * If not found returns NULL. When found, returns the htable, sets
 * entry, and has a hold on the htable.
 */
htable_t *
htable_getpte(
	struct hat *hat,
	uintptr_t vaddr,
	uint_t *entry,
	x86pte_t *pte,
	level_t level)
{
	htable_t	*ht;
	level_t		l;
	uint_t		e;

	ASSERT(level <= mmu.max_page_level);

	for (l = 0; l <= level; ++l) {
		ht = htable_lookup(hat, vaddr, l);
		if (ht == NULL)
			continue;
		e = htable_va2entry(vaddr, ht);
		if (entry != NULL)
			*entry = e;
		if (pte != NULL)
			*pte = x86pte_get(ht, e);
		return (ht);
	}
	return (NULL);
}
/*
 * Find the htable and page table entry index of the given virtual address.
 * There must be a valid page mapped at the given address.
 * If not found returns NULL. When found, returns the htable, sets
 * entry, and has a hold on the htable.
 */
htable_t *
htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry)
{
	htable_t	*ht;
	uint_t		e;
	x86pte_t	pte;

	ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level);
	if (ht == NULL)
		return (NULL);

	if (entry)
		*entry = e;

	if (PTE_ISPAGE(pte, ht->ht_level))
		return (ht);
	htable_release(ht);
	return (NULL);
}
void
htable_init()
{
	/*
	 * To save on kernel VA usage, we avoid debug information in 32 bit
	 * kernels.
	 */
#if defined(__amd64)
	int	kmem_flags = KMC_NOHASH;
#elif defined(__i386)
	int	kmem_flags = KMC_NOHASH | KMC_NODEBUG;
#endif

	/*
	 * initialize kmem caches
	 */
	htable_cache = kmem_cache_create("htable_t",
	    sizeof (htable_t), 0, NULL, NULL,
	    htable_reap, NULL, hat_memload_arena, kmem_flags);
}
/*
 * get the pte index for the virtual address in the given htable's pagetable
 */
uint_t
htable_va2entry(uintptr_t va, htable_t *ht)
{
	level_t	l = ht->ht_level;

	ASSERT(va >= ht->ht_vaddr);
	ASSERT(va <= HTABLE_LAST_PAGE(ht));
	return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1));
}
/*
 * Given an htable and the index of a pte in it, return the virtual address
 * of the page.
 */
uintptr_t
htable_e2va(htable_t *ht, uint_t entry)
{
	level_t	l = ht->ht_level;
	uintptr_t va;

	ASSERT(entry < HTABLE_NUM_PTES(ht));
	va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l));

	/*
	 * Need to skip over any VA hole in top level table
	 */
#if defined(__amd64)
	if (ht->ht_level == mmu.max_level && va >= mmu.hole_start)
		va += ((mmu.hole_end - mmu.hole_start) + 1);
#endif

	return (va);
}
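/*
 * Worked example of the index math above (illustrative numbers only): with
 * 4K pages and 512-entry tables, LEVEL_SHIFT(0) == 12, so for
 * va == 0x7fffe123456 in a level 0 htable
 *
 *	entry = (0x7fffe123456 >> 12) & 511 = 0x123 = 291
 *
 * and htable_e2va() reverses this by adding (entry << 12) back onto the
 * htable's base address ht_vaddr.
 */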
/*
 * The code uses compare and swap instructions to read/write PTE's to
 * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems;
 * on 64 bit systems aligned loads and stores will naturally be atomic.
 *
 * The combination of using kpreempt_disable()/_enable() and the hci_mutex
 * are used to ensure that an interrupt won't overwrite a temporary mapping
 * while it's in use. If an interrupt thread tries to access a PTE, it will
 * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
 */
void
x86pte_cpu_init(cpu_t *cpu)
{
	struct hat_cpu_info *hci;

	hci = kmem_zalloc(sizeof (*hci), KM_SLEEP);
	mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL);
	cpu->cpu_hat_info = hci;
}
void
x86pte_cpu_fini(cpu_t *cpu)
{
	struct hat_cpu_info *hci = cpu->cpu_hat_info;

	kmem_free(hci, sizeof (*hci));
	cpu->cpu_hat_info = NULL;
}
/*
 * On 32 bit kernels, loading a 64 bit PTE is a little tricky
 */
static x86pte_t
get_pte64(x86pte_t *ptr)
{
	volatile uint32_t *p = (uint32_t *)ptr;
	x86pte_t t;

	ASSERT(mmu.pae_hat != 0);
	for (;;) {
		t = p[0];
		t |= (uint64_t)p[1] << 32;
		if ((t & 0xffffffff) == p[0])
			return (t);
	}
}
/*
 * Disable preemption and establish a mapping to the pagetable with the
 * given pfn. This is optimized for the case where it's the same
 * pfn as we last used referenced from this CPU.
 */
static x86pte_t *
x86pte_access_pagetable(htable_t *ht, uint_t index)
{
	/*
	 * VLP pagetables are contained in the hat_t
	 */
	if (ht->ht_flags & HTABLE_VLP)
		return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
	return (x86pte_mapin(ht->ht_pfn, index, ht));
}
/*
 * map the given pfn into the page table window.
 */
/*ARGSUSED*/
x86pte_t *
x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
{
	x86pte_t *pteptr;
	x86pte_t pte = 0;
	x86pte_t newpte;
	int x;

	ASSERT(pfn != PFN_INVALID);

	if (!khat_running) {
		caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
		return (PT_INDEX_PTR(va, index));
	}

	/*
	 * If kpm is available, use it.
	 */
	if (kpm_vbase)
		return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));

	/*
	 * Disable preemption and grab the CPU's hci_mutex
	 */
	kpreempt_disable();
	ASSERT(CPU->cpu_hat_info != NULL);
	mutex_enter(&CPU->cpu_hat_info->hci_mutex);
	x = PWIN_TABLE(CPU->cpu_id);
	pteptr = (x86pte_t *)PWIN_PTE_VA(x);
	if (mmu.pae_hat)
		pte = *pteptr;
	else
		pte = *(x86pte32_t *)pteptr;

	newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;

	/*
	 * For hardware we can use a writable mapping.
	 */
#ifdef __xpv
	if (IN_XPV_PANIC())
#endif
		newpte |= PT_WRITABLE;

	if (!PTE_EQUIV(newpte, pte)) {
#ifdef __xpv
		if (!IN_XPV_PANIC()) {
			xen_map(newpte, PWIN_VA(x));
		} else
#endif
		{
			XPV_ALLOW_PAGETABLE_UPDATES();
			if (mmu.pae_hat)
				*pteptr = newpte;
			else
				*(x86pte32_t *)pteptr = newpte;
			XPV_DISALLOW_PAGETABLE_UPDATES();
			mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
		}
	}
	return (PT_INDEX_PTR(PWIN_VA(x), index));
}
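/*
 * Illustrative sketch (comment only) of how callers pair the mapin with a
 * release; this is the pattern x86pte_get() and friends use below:
 *
 *	ptep = x86pte_access_pagetable(ht, entry);
 *	pte = GET_PTE(ptep);
 *	x86pte_release_pagetable(ht);
 *
 * The release side drops the per-CPU hci_mutex and re-enables preemption
 * whenever the temporary per-CPU window (rather than kpm) was used.
 */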
/*
 * Release access to a page table.
 */
static void
x86pte_release_pagetable(htable_t *ht)
{
	/*
	 * nothing to do for VLP htables
	 */
	if (ht->ht_flags & HTABLE_VLP)
		return;

	if (kpm_vbase != NULL || !khat_running)
		return;

	/*
	 * Drop the CPU's hci_mutex and restore preemption.
	 */
#ifdef __xpv
	if (!IN_XPV_PANIC()) {
		uintptr_t va;

		/*
		 * We need to always clear the mapping in case a page
		 * that was once a page table page is ballooned out.
		 */
		va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id));
		(void) HYPERVISOR_update_va_mapping(va, 0,
		    UVMF_INVLPG | UVMF_LOCAL);
	}
#endif
	mutex_exit(&CPU->cpu_hat_info->hci_mutex);
	kpreempt_enable();
}
2068 x86pte_get(htable_t
*ht
, uint_t entry
)
2074 * Be careful that loading PAE entries in 32 bit kernel is atomic.
2076 ASSERT(entry
< mmu
.ptes_per_table
);
2077 ptep
= x86pte_access_pagetable(ht
, entry
);
2078 pte
= GET_PTE(ptep
);
2079 x86pte_release_pagetable(ht
);
/*
 * Atomic unconditional set of a page table entry, it returns the previous
 * value. For pre-existing mappings if the PFN changes, then we don't care
 * about the old pte's REF / MOD bits. If the PFN remains the same, we leave
 * the MOD/REF bits unchanged.
 *
 * If asked to overwrite a link to a lower page table with a large page
 * mapping, this routine returns the special value of LPAGE_ERROR. This
 * allows the upper HAT layers to retry with a smaller mapping size.
 */
x86pte_t
x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr)
{
	x86pte_t	old;
	x86pte_t	prev;
	x86pte_t	*ptep;
	x86pte_t	n;
	level_t		l = ht->ht_level;
	x86pte_t	pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR;
	uintptr_t	addr = htable_e2va(ht, entry);
	hat_t		*hat = ht->ht_hat;

	ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */
	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
	if (ptr == NULL)
		ptep = x86pte_access_pagetable(ht, entry);
	else
		ptep = ptr;

	/*
	 * Install the new PTE. If remapping the same PFN, then
	 * copy existing REF/MOD bits to new mapping.
	 */
	do {
		prev = GET_PTE(ptep);
		n = new;
		if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
			n |= prev & (PT_REF | PT_MOD);

		/*
		 * Another thread may have installed this mapping already,
		 * flush the local TLB and be done.
		 */
		if (prev == n) {
			old = new;
#ifdef __xpv
			if (!IN_XPV_PANIC())
				xen_flush_va((caddr_t)addr);
			else
#endif
				mmu_tlbflush_entry((caddr_t)addr);
			goto done;
		}

		/*
		 * Detect if we have a collision of installing a large
		 * page mapping where there already is a lower page table.
		 */
		if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
			old = LPAGE_ERROR;
			goto done;
		}

		XPV_ALLOW_PAGETABLE_UPDATES();
		old = CAS_PTE(ptep, prev, n);
		XPV_DISALLOW_PAGETABLE_UPDATES();
	} while (old != prev);

	/*
	 * Do a TLB demap if needed, ie. the old pte was valid.
	 *
	 * Note that a stale TLB writeback to the PTE here either can't happen
	 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST
	 * mappings, but they were created with REF and MOD already set, so
	 * no stale writeback will happen.
	 *
	 * Segmap is the only place where remaps happen on the same pfn and for
	 * that we want to preserve the stale REF/MOD bits.
	 */
	if (old & PT_REF)
		hat_tlb_inval(hat, addr);

done:
	if (ptr == NULL)
		x86pte_release_pagetable(ht);
	return (old);
}
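/*
 * Illustrative sketch (comment only) of how an upper layer is expected to
 * react to LPAGE_ERROR from x86pte_set(): retry with a smaller page size
 * rather than overwrite a live lower-level page table.
 *
 *	if (x86pte_set(ht, entry, new_pte, NULL) == LPAGE_ERROR) {
 *		...drop to a smaller mapping size and retry...
 *	}
 *
 * The retry policy itself lives in the upper HAT layers, not in this file.
 */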
/*
 * Atomic compare and swap of a page table entry. No TLB invalidates are done.
 * This is used for links between pagetables of different levels.
 * Note we always create these links with dirty/access set, so they should
 * never change.
 */
static x86pte_t
x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
{
	x86pte_t	pte;
	x86pte_t	*ptep;
#ifdef __xpv
	/*
	 * We can't use writable pagetables for upper level tables, so fake it.
	 */
	mmu_update_t	t[2];
	int		cnt = 1;
	int		count;
	maddr_t		ma;

	if (!IN_XPV_PANIC()) {
		ASSERT(!(ht->ht_flags & HTABLE_VLP));	/* no VLP yet */
		ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
		t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
		t[0].val = new;

#if defined(__amd64)
		/*
		 * On the 64-bit hypervisor we need to maintain the user mode
		 * top page table too.
		 */
		if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
			ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
			    ht->ht_hat->hat_user_ptable), entry));
			t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
			t[1].val = new;
			++cnt;
		}
#endif	/* __amd64 */

		if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
			panic("HYPERVISOR_mmu_update() failed");
		ASSERT(count == cnt);
		return (old);
	}
#endif
	ptep = x86pte_access_pagetable(ht, entry);
	XPV_ALLOW_PAGETABLE_UPDATES();
	pte = CAS_PTE(ptep, old, new);
	XPV_DISALLOW_PAGETABLE_UPDATES();
	x86pte_release_pagetable(ht);
	return (pte);
}
/*
 * Invalidate a page table entry as long as it currently maps something that
 * matches the value determined by expect.
 *
 * If tlb is set, also invalidates any TLB entries.
 *
 * Returns the previous value of the PTE.
 */
x86pte_t
x86pte_inval(
	htable_t *ht,
	uint_t entry,
	x86pte_t expect,
	x86pte_t *pte_ptr,
	boolean_t tlb)
{
	x86pte_t	*ptep;
	x86pte_t	oldpte;
	x86pte_t	found;

	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
	ASSERT(ht->ht_level <= mmu.max_page_level);

	if (pte_ptr != NULL)
		ptep = pte_ptr;
	else
		ptep = x86pte_access_pagetable(ht, entry);

#if defined(__xpv)
	/*
	 * If exit()ing just use HYPERVISOR_mmu_update(), as we can't be racing
	 * with anything else.
	 */
	if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) {
		int count;
		mmu_update_t t[1];
		maddr_t ma;

		oldpte = GET_PTE(ptep);
		if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
			goto done;
		ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
		t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
		t[0].val = 0;
		if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF))
			panic("HYPERVISOR_mmu_update() failed");
		goto done;
	}
#endif /* __xpv */

	/*
	 * Note that the loop is needed to handle changes due to h/w updating
	 * of PT_MOD/PT_REF.
	 */
	do {
		oldpte = GET_PTE(ptep);
		if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
			goto done;
		XPV_ALLOW_PAGETABLE_UPDATES();
		found = CAS_PTE(ptep, oldpte, 0);
		XPV_DISALLOW_PAGETABLE_UPDATES();
	} while (found != oldpte);
	if (tlb && (oldpte & (PT_REF | PT_MOD)))
		hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));

done:
	if (pte_ptr == NULL)
		x86pte_release_pagetable(ht);
	return (oldpte);
}
/*
 * Change a page table entry if it currently matches the value in expect.
 */
x86pte_t
x86pte_update(
	htable_t *ht,
	uint_t entry,
	x86pte_t expect,
	x86pte_t new)
{
	x86pte_t	*ptep;
	x86pte_t	found;

	ASSERT(new != 0);
	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
	ASSERT(ht->ht_level <= mmu.max_page_level);

	ptep = x86pte_access_pagetable(ht, entry);
	XPV_ALLOW_PAGETABLE_UPDATES();
	found = CAS_PTE(ptep, expect, new);
	XPV_DISALLOW_PAGETABLE_UPDATES();
	if (found == expect) {
		hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));

		/*
		 * When removing write permission *and* clearing the
		 * MOD bit, check if a write happened via a stale
		 * TLB entry before the TLB shootdown finished.
		 *
		 * If it did happen, simply re-enable write permission and
		 * act like the original CAS failed.
		 */
		if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
		    (new & (PT_WRITABLE | PT_MOD)) == 0 &&
		    (GET_PTE(ptep) & PT_MOD) != 0) {
			do {
				found = GET_PTE(ptep);
				XPV_ALLOW_PAGETABLE_UPDATES();
				(void)
				    CAS_PTE(ptep, found, found | PT_WRITABLE);
				XPV_DISALLOW_PAGETABLE_UPDATES();
			} while ((found & PT_WRITABLE) == 0);
		}
	}
	x86pte_release_pagetable(ht);
	return (found);
}
#ifndef __xpv
/*
 * Copy page tables - this is just a little more complicated than the
 * previous routines. Note that it's also not atomic! It also is never
 * used for VLP pagetables.
 */
void
x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
{
	caddr_t	src_va;
	caddr_t dst_va;
	size_t size;
	x86pte_t *pteptr;
	x86pte_t pte;

	ASSERT(khat_running);
	ASSERT(!(dest->ht_flags & HTABLE_VLP));
	ASSERT(!(src->ht_flags & HTABLE_VLP));
	ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));

	/*
	 * Acquire access to the CPU pagetable windows for the dest and source.
	 */
	dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
	if (kpm_vbase) {
		src_va = (caddr_t)
		    PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
	} else {
		uint_t x = PWIN_SRC(CPU->cpu_id);

		/*
		 * Finish defining the src pagetable mapping
		 */
		src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
		pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
		pteptr = (x86pte_t *)PWIN_PTE_VA(x);
		if (mmu.pae_hat)
			*pteptr = pte;
		else
			*(x86pte32_t *)pteptr = pte;
		mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
	}

	/*
	 * now do the copy
	 */
	size = count << mmu.pte_size_shift;
	bcopy(src_va, dst_va, size);

	x86pte_release_pagetable(dest);
}
#else /* __xpv */
/*
 * The hypervisor only supports writable pagetables at level 0, so we have
 * to install these 1 by 1 the slow way.
 */
void
x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
{
	caddr_t	src_va;
	x86pte_t pte;

	ASSERT(!IN_XPV_PANIC());
	src_va = (caddr_t)x86pte_access_pagetable(src, entry);
	while (count) {
		if (mmu.pae_hat)
			pte = *(x86pte_t *)src_va;
		else
			pte = *(x86pte32_t *)src_va;
		if (pte != 0) {
			set_pteval(pfn_to_pa(dest->ht_pfn), entry,
			    dest->ht_level, pte);
#ifdef __amd64
			if (dest->ht_level == mmu.max_level &&
			    htable_e2va(dest, entry) < HYPERVISOR_VIRT_END)
				set_pteval(
				    pfn_to_pa(dest->ht_hat->hat_user_ptable),
				    entry, dest->ht_level, pte);
#endif
		}
		--count;
		++entry;
		src_va += mmu.pte_size;
	}
	x86pte_release_pagetable(src);
}
#endif /* __xpv */
/*
 * Zero page table entries - Note this doesn't use atomic stores!
 */
static void
x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
{
	caddr_t dst_va;
	size_t size;
#ifdef __xpv
	int x = 0;
	x86pte_t newpte;
#endif

	/*
	 * Map in the page table to be zeroed.
	 */
	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
	ASSERT(!(dest->ht_flags & HTABLE_VLP));

	/*
	 * On the hypervisor we don't use x86pte_access_pagetable() since
	 * in this case the page is not pinned yet.
	 */
#ifdef __xpv
	if (kpm_vbase == NULL) {
		kpreempt_disable();
		ASSERT(CPU->cpu_hat_info != NULL);
		mutex_enter(&CPU->cpu_hat_info->hci_mutex);
		x = PWIN_TABLE(CPU->cpu_id);
		newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
		xen_map(newpte, PWIN_VA(x));
		dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
	} else
#endif
		dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);

	size = count << mmu.pte_size_shift;
	ASSERT(size > BLOCKZEROALIGN);
#ifdef __i386
	if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
		bzero(dst_va, size);
	else
#endif
		block_zero_no_xmm(dst_va, size);

#ifdef __xpv
	if (kpm_vbase == NULL) {
		xen_map(0, PWIN_VA(x));
		mutex_exit(&CPU->cpu_hat_info->hci_mutex);
		kpreempt_enable();
	} else
#endif
		x86pte_release_pagetable(dest);
}
/*
 * Called to ensure that all pagetables are in the system dump
 */
void
hat_dump(void)
{
	hat_t *hat;
	uint_t h;
	htable_t *ht;

	/*
	 * Dump all page tables
	 */
	for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
		for (h = 0; h < hat->hat_num_hash; ++h) {
			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
				if ((ht->ht_flags & HTABLE_VLP) == 0)
					dump_page(ht->ht_pfn);
			}
		}
	}
}