/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * VM - Hardware Address Translation management for i386 and amd64
 *
 * Implementation of the interfaces described in <common/vm/hat.h>
 *
 * Nearly all the details of how the hardware is managed should not be
 * visible outside this layer except for misc. machine specific functions
 * that work in conjunction with this code.
 *
 * Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
 */
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/sysmacros.h>
#include <sys/machparam.h>
#include <sys/vmsystm.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/controlregs.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <sys/archsystm.h>

#include <vm/seg_kmem.h>
#include <vm/hat_i86.h>
#include <vm/seg_kp.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#endif
#include <vm/kboot_mmu.h>
#include <vm/seg_spt.h>

#include <sys/cmn_err.h>
/*
 * Basic parameters for hat operation.
 */
struct hat_mmu_info mmu;
/*
 * The page that is the kernel's top level pagetable.
 *
 * For 32 bit PAE support on i86pc, the kernel hat will use the 1st 4 entries
 * on this 4K page for its top level page table. The remaining groups of
 * 4 entries are used for per processor copies of user VLP pagetables for
 * running threads. See hat_switch() and reload_pae32() for details.
 *
 *	vlp_page[0..3]  - level==2 PTEs for kernel HAT
 *	vlp_page[4..7]  - level==2 PTEs for user thread on cpu 0
 *	vlp_page[8..11] - level==2 PTEs for user thread on cpu 1
 */
static x86pte_t *vlp_page;
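/*
 * In general, the 4 VLP PTEs for the user thread running on CPU n live at
 * vlp_page[(n + 1) * 4] through vlp_page[(n + 1) * 4 + 3]; see the dest
 * computation in reload_pae32().
 */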
/*
 * forward declaration of internal utility routines
 */
static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected,
	x86pte_t new);
/*
 * The kernel address space exists in all HATs. To implement this the
 * kernel reserves a fixed number of entries in the topmost level(s) of page
 * tables. The values are setup during startup and then copied to every user
 * hat created by hat_alloc(). This means that kernelbase must be:
 *
 *	  4Meg aligned for 32 bit kernels
 *	512Gig aligned for x86_64 64 bit kernel
 *
 * The hat_kernel_range_ts describe what needs to be copied from kernel hat
 * to user hat.
 */
typedef struct hat_kernel_range {
	level_t		hkr_level;
	uintptr_t	hkr_start_va;
	uintptr_t	hkr_end_va;	/* zero means to end of memory */
} hat_kernel_range_t;
#define	NUM_KERNEL_RANGE 2
static hat_kernel_range_t kernel_ranges[NUM_KERNEL_RANGE];
static int num_kernel_ranges;

uint_t use_boot_reserve = 1;	/* cleared after early boot process */
uint_t can_steal_post_boot = 0;	/* set late in boot to enable stealing */
/*
 * enable_1gpg: controls 1g page support for user applications.
 * By default, 1g pages are exported to user applications. enable_1gpg can
 * be set to 0 to not export.
 */
int	enable_1gpg = 1;

/*
 * AMD shanghai processors provide better management of 1gb ptes in its tlb.
 * By default, 1g page support will be disabled for pre-shanghai AMD
 * processors that don't have optimal tlb support for the 1g page size.
 * chk_optimal_1gtlb can be set to 0 to force 1g page support on sub-optimal
 * processors.
 */
int	chk_optimal_1gtlb = 1;
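/*
 * The pre-shanghai check itself is performed in set_max_page_level() via
 * cpuid_opteron_erratum(CPU, 6671130).
 */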
/*
 * A cpuset for all cpus. This is used for kernel address cross calls, since
 * the kernel addresses apply to all cpus.
 */
cpuset_t khat_cpuset;

/*
 * management stuff for hat structures
 */
kmutex_t	hat_list_lock;
kcondvar_t	hat_list_cv;
kmem_cache_t	*hat_cache;
kmem_cache_t	*hat_hash_cache;
kmem_cache_t	*vlp_hash_cache;

/*
 * Simple statistics
 */
struct hatstats hatstat;
/*
 * Some earlier hypervisor versions do not emulate cmpxchg of PTEs
 * correctly. For such hypervisors we must set PT_USER for kernel
 * entries ourselves (normally the emulation would set PT_USER for
 * kernel entries and PT_USER|PT_GLOBAL for user entries). pt_kern is
 * thus set appropriately.  Note that dboot/kbm is OK, as only the full
 * HAT uses cmpxchg() and the other paths (hypercall etc.) were never
 * supported.
 */
int pt_kern;
/*
 * useful stuff for atomic access/clearing/setting REF/MOD/RO bits in page_t's.
 */
extern void atomic_orb(uchar_t *addr, uchar_t val);
extern void atomic_andb(uchar_t *addr, uchar_t val);

extern pfn_t memseg_get_start(struct memseg *);
#define	PP_GETRM(pp, rmmask)	(pp->p_nrm & rmmask)
#define	PP_ISMOD(pp)		PP_GETRM(pp, P_MOD)
#define	PP_ISREF(pp)		PP_GETRM(pp, P_REF)
#define	PP_ISRO(pp)		PP_GETRM(pp, P_RO)

#define	PP_SETRM(pp, rm)	atomic_orb(&(pp->p_nrm), rm)
#define	PP_SETMOD(pp)		PP_SETRM(pp, P_MOD)
#define	PP_SETREF(pp)		PP_SETRM(pp, P_REF)
#define	PP_SETRO(pp)		PP_SETRM(pp, P_RO)

#define	PP_CLRRM(pp, rm)	atomic_andb(&(pp->p_nrm), ~(rm))
#define	PP_CLRMOD(pp)		PP_CLRRM(pp, P_MOD)
#define	PP_CLRREF(pp)		PP_CLRRM(pp, P_REF)
#define	PP_CLRRO(pp)		PP_CLRRM(pp, P_RO)
#define	PP_CLRALL(pp)		PP_CLRRM(pp, P_MOD | P_REF | P_RO)
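/*
 * p_nrm is a single byte, which is why the byte-wide atomic_orb() and
 * atomic_andb() primitives declared above suffice for lock-free updates
 * of the REF/MOD/RO bits.
 */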
/*
 * kmem cache constructor for struct hat
 */
/*ARGSUSED*/
static int
hati_constructor(void *buf, void *handle, int kmflags)
{
	hat_t	*hat = buf;

	mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
	bzero(hat->hat_pages_mapped,
	    sizeof (pgcnt_t) * (mmu.max_page_level + 1));
	hat->hat_ism_pgcnt = 0;
	hat->hat_stats = 0;
	hat->hat_flags = 0;
	CPUSET_ZERO(hat->hat_cpus);
	hat->hat_htable = NULL;
	hat->hat_ht_hash = NULL;
	return (0);
}
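/*
 * Note that kmem caches objects in constructed state, so the constructor
 * runs only when kmem creates a fresh hat_t for the cache, not on every
 * hat_alloc(); the free path (see hat_free_end()) resets fields such as
 * hat_ht_hash so cached objects return to this state.
 */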
/*
 * Allocate a hat structure for as. We also create the top level
 * htable and initialize it to contain the kernel hat entries.
 */
hat_t *
hat_alloc(struct as *as)
{
	hat_t			*hat;
	htable_t		*ht;	/* top level htable */
	uint_t			use_vlp;
	uint_t			r;
	hat_kernel_range_t	*rp;
	uintptr_t		va;
	uintptr_t		eva;
	uint_t			start;
	uint_t			cnt;
	htable_t		*src;

	/*
	 * Once we start creating user process HATs we can enable
	 * the htable_steal() code.
	 */
	if (can_steal_post_boot == 0)
		can_steal_post_boot = 1;

	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
	hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
	hat->hat_as = as;
	mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
	ASSERT(hat->hat_flags == 0);

#if defined(__xpv)
	/*
	 * No VLP stuff on the hypervisor due to the 64-bit split top level
	 * page tables.  On 32-bit it's not needed as the hypervisor takes
	 * care of copying the top level PTEs to a below 4Gig page.
	 */
	use_vlp = 0;
#else	/* __xpv */
	/* 32 bit processes use a VLP style hat when running with PAE */
#if defined(__amd64)
	use_vlp = (ttoproc(curthread)->p_model == DATAMODEL_ILP32);
#elif defined(__i386)
	use_vlp = mmu.pae_hat;
#endif
#endif	/* __xpv */
	if (use_vlp) {
		hat->hat_flags = HAT_VLP;
		bzero(hat->hat_vlp_ptes, VLP_SIZE);
	}

	/*
	 * Allocate the htable hash
	 */
	if ((hat->hat_flags & HAT_VLP)) {
		hat->hat_num_hash = mmu.vlp_hash_cnt;
		hat->hat_ht_hash = kmem_cache_alloc(vlp_hash_cache, KM_SLEEP);
	} else {
		hat->hat_num_hash = mmu.hash_cnt;
		hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP);
	}
	bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *));

	/*
	 * Initialize Kernel HAT entries at the top of the top level page
	 * tables for the new hat.
	 */
	hat->hat_htable = NULL;
	hat->hat_ht_cached = NULL;
	XPV_DISALLOW_MIGRATE();
	ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL);
	hat->hat_htable = ht;

#if defined(__amd64)
	if (hat->hat_flags & HAT_VLP)
		goto init_done;
#endif

	for (r = 0; r < num_kernel_ranges; ++r) {
		rp = &kernel_ranges[r];
		for (va = rp->hkr_start_va; va != rp->hkr_end_va;
		    va += cnt * LEVEL_SIZE(rp->hkr_level)) {

			if (rp->hkr_level == TOP_LEVEL(hat))
				ht = hat->hat_htable;
			else
				ht = htable_create(hat, va, rp->hkr_level,
				    NULL);

			start = htable_va2entry(va, ht);
			cnt = HTABLE_NUM_PTES(ht) - start;
			eva = va +
			    ((uintptr_t)cnt << LEVEL_SHIFT(rp->hkr_level));
			if (rp->hkr_end_va != 0 &&
			    (eva > rp->hkr_end_va || eva == 0))
				cnt = htable_va2entry(rp->hkr_end_va, ht) -
				    start;

#if defined(__i386) && !defined(__xpv)
			if (ht->ht_flags & HTABLE_VLP) {
				bcopy(&vlp_page[start],
				    &hat->hat_vlp_ptes[start],
				    cnt * sizeof (x86pte_t));
				continue;
			}
#endif
			src = htable_lookup(kas.a_hat, va, rp->hkr_level);
			ASSERT(src != NULL);
			x86pte_copy(src, ht, start, cnt);
			htable_release(src);
		}
	}

init_done:

#if defined(__xpv)
	/*
	 * Pin top level page tables after initializing them
	 */
	xen_pin(hat->hat_htable->ht_pfn, mmu.max_level);
#if defined(__amd64)
	xen_pin(hat->hat_user_ptable, mmu.max_level);
#endif
#endif
	XPV_ALLOW_MIGRATE();

	/*
	 * Put it at the start of the global list of all hats (used by stealing)
	 *
	 * kas.a_hat is not in the list but is instead used to find the
	 * first and last items in the list.
	 *
	 * - kas.a_hat->hat_next points to the start of the user hats.
	 *   The list ends where hat->hat_next == NULL
	 *
	 * - kas.a_hat->hat_prev points to the last of the user hats.
	 *   The list begins where hat->hat_prev == NULL
	 */
	mutex_enter(&hat_list_lock);
	hat->hat_prev = NULL;
	hat->hat_next = kas.a_hat->hat_next;
	if (hat->hat_next)
		hat->hat_next->hat_prev = hat;
	else
		kas.a_hat->hat_prev = hat;
	kas.a_hat->hat_next = hat;
	mutex_exit(&hat_list_lock);

	return (hat);
}
/*
 * process has finished executing but as has not been cleaned up yet.
 */
/*ARGSUSED*/
void
hat_free_start(hat_t *hat)
{
	ASSERT(AS_WRITE_HELD(hat->hat_as, &hat->hat_as->a_lock));

	/*
	 * If the hat is currently a stealing victim, wait for the stealing
	 * to finish.  Once we mark it as HAT_FREEING, htable_steal()
	 * won't look at its pagetables anymore.
	 */
	mutex_enter(&hat_list_lock);
	while (hat->hat_flags & HAT_VICTIM)
		cv_wait(&hat_list_cv, &hat_list_lock);
	hat->hat_flags |= HAT_FREEING;
	mutex_exit(&hat_list_lock);
}
/*
 * An address space is being destroyed, so we destroy the associated hat.
 */
void
hat_free_end(hat_t *hat)
{
	kmem_cache_t *cache;

	ASSERT(hat->hat_flags & HAT_FREEING);

	/*
	 * must not be running on the given hat
	 */
	ASSERT(CPU->cpu_current_hat != hat);

	/*
	 * Remove it from the list of HATs
	 */
	mutex_enter(&hat_list_lock);
	if (hat->hat_prev)
		hat->hat_prev->hat_next = hat->hat_next;
	else
		kas.a_hat->hat_next = hat->hat_next;
	if (hat->hat_next)
		hat->hat_next->hat_prev = hat->hat_prev;
	else
		kas.a_hat->hat_prev = hat->hat_prev;
	mutex_exit(&hat_list_lock);
	hat->hat_next = hat->hat_prev = NULL;

#if defined(__xpv)
	/*
	 * On the hypervisor, unpin top level page table(s)
	 */
	xen_unpin(hat->hat_htable->ht_pfn);
#if defined(__amd64)
	xen_unpin(hat->hat_user_ptable);
#endif
#endif

	/*
	 * Make a pass through the htables freeing them all up.
	 */
	htable_purge_hat(hat);

	/*
	 * Decide which kmem cache the hash table came from, then free it.
	 */
	if (hat->hat_flags & HAT_VLP)
		cache = vlp_hash_cache;
	else
		cache = hat_hash_cache;
	kmem_cache_free(cache, hat->hat_ht_hash);
	hat->hat_ht_hash = NULL;

	hat->hat_flags = 0;
	kmem_cache_free(hat_cache, hat);
}
/*
 * round kernelbase down to a supported value to use for _userlimit
 *
 * userlimit must be aligned down to an entry in the top level htable.
 * The one exception is for 32 bit HAT's running PAE.
 */
uintptr_t
hat_kernelbase(uintptr_t va)
{
#if defined(__i386)
	va &= LEVEL_MASK(1);
#endif
	if (IN_VA_HOLE(va))
		panic("_userlimit %p will fall in VA hole\n", (void *)va);
	return (va);
}

static void
set_max_page_level()
{
	level_t	lvl;

	if (!kbm_largepage_support) {
		lvl = 0;
	} else {
		if (is_x86_feature(x86_featureset, X86FSET_1GPG)) {
			lvl = 2;
			if (chk_optimal_1gtlb &&
			    cpuid_opteron_erratum(CPU, 6671130)) {
				lvl = 1;
			}
			if (plat_mnode_xcheck(LEVEL_SIZE(2) >>
			    LEVEL_SHIFT(0))) {
				lvl = 1;
			}
		} else {
			lvl = 1;
		}
	}
	mmu.max_page_level = lvl;

	if ((lvl == 2) && (enable_1gpg == 0))
		mmu.umax_page_level = 1;
	else
		mmu.umax_page_level = lvl;
}
/*
 * Initialize hat data structures based on processor MMU information.
 */
void
mmu_init(void)
{
	uint_t max_htables;
	uint_t pa_bits;
	uint_t va_bits;
	int i;

	/*
	 * If CPU enabled the page table global bit, use it for the kernel
	 * This is bit 7 in CR4 (PGE - Page Global Enable).
	 */
	if (is_x86_feature(x86_featureset, X86FSET_PGE) &&
	    (getcr4() & CR4_PGE) != 0)
		mmu.pt_global = PT_GLOBAL;

	/*
	 * Detect NX and PAE usage.
	 */
	mmu.pae_hat = kbm_pae_support;
	if (kbm_nx_support)
		mmu.pt_nx = PT_NX;
	else
		mmu.pt_nx = 0;

	/*
	 * Use CPU info to set various MMU parameters
	 */
	cpuid_get_addrsize(CPU, &pa_bits, &va_bits);

	if (va_bits < sizeof (void *) * NBBY) {
		mmu.hole_start = (1ul << (va_bits - 1));
		mmu.hole_end = 0ul - mmu.hole_start - 1;
	} else {
		mmu.hole_end = 0;
		mmu.hole_start = mmu.hole_end - 1;
	}

#if defined(OPTERON_ERRATUM_121)
	/*
	 * If erratum 121 has already been detected at this time, hole_start
	 * contains the value to be subtracted from mmu.hole_start.
	 */
	ASSERT(hole_start == 0 || opteron_erratum_121 != 0);
	hole_start = mmu.hole_start - hole_start;
#else
	hole_start = mmu.hole_start;
#endif
	hole_end = mmu.hole_end;

	mmu.highest_pfn = mmu_btop((1ull << pa_bits) - 1);
	if (mmu.pae_hat == 0 && pa_bits > 32)
		mmu.highest_pfn = PFN_4G - 1;

	if (mmu.pae_hat) {
		mmu.pte_size = 8;	/* 8 byte PTEs */
		mmu.pte_size_shift = 3;
	} else {
		mmu.pte_size = 4;	/* 4 byte PTEs */
		mmu.pte_size_shift = 2;
	}

	if (mmu.pae_hat && !is_x86_feature(x86_featureset, X86FSET_PAE))
		panic("Processor does not support PAE");

	if (!is_x86_feature(x86_featureset, X86FSET_CX8))
		panic("Processor does not support cmpxchg8b instruction");

#if defined(__amd64)

	mmu.num_level = 4;
	mmu.max_level = 3;
	mmu.ptes_per_table = 512;
	mmu.top_level_count = 512;

	mmu.level_shift[0] = 12;
	mmu.level_shift[1] = 21;
	mmu.level_shift[2] = 30;
	mmu.level_shift[3] = 39;

#elif defined(__i386)

	if (mmu.pae_hat) {
		mmu.num_level = 3;
		mmu.max_level = 2;
		mmu.ptes_per_table = 512;
		mmu.top_level_count = 4;

		mmu.level_shift[0] = 12;
		mmu.level_shift[1] = 21;
		mmu.level_shift[2] = 30;

	} else {
		mmu.num_level = 2;
		mmu.max_level = 1;
		mmu.ptes_per_table = 1024;
		mmu.top_level_count = 1024;

		mmu.level_shift[0] = 12;
		mmu.level_shift[1] = 22;
	}

#endif	/* __i386 */

	for (i = 0; i < mmu.num_level; ++i) {
		mmu.level_size[i] = 1UL << mmu.level_shift[i];
		mmu.level_offset[i] = mmu.level_size[i] - 1;
		mmu.level_mask[i] = ~mmu.level_offset[i];
	}

	set_max_page_level();

	mmu_page_sizes = mmu.max_page_level + 1;
	mmu_exported_page_sizes = mmu.umax_page_level + 1;

	/* restrict legacy applications from using pagesizes 1g and above */
	mmu_legacy_page_sizes =
	    (mmu_exported_page_sizes > 2) ? 2 : mmu_exported_page_sizes;

	for (i = 0; i <= mmu.max_page_level; ++i) {
		mmu.pte_bits[i] = PT_VALID | pt_kern;
		if (i > 0)
			mmu.pte_bits[i] |= PT_PAGESIZE;
	}

	/*
	 * NOTE Legacy 32 bit PAE mode only has the P_VALID bit at top level.
	 */
	for (i = 1; i < mmu.num_level; ++i)
		mmu.ptp_bits[i] = PT_PTPBITS;

#if defined(__i386)
	mmu.ptp_bits[2] = PT_VALID;
#endif

	/*
	 * Compute how many hash table entries to have per process for htables.
	 * We start with 1 page's worth of entries.
	 *
	 * If physical memory is small, reduce the amount need to cover it.
	 */
	max_htables = physmax / mmu.ptes_per_table;
	mmu.hash_cnt = MMU_PAGESIZE / sizeof (htable_t *);
	while (mmu.hash_cnt > 16 && mmu.hash_cnt >= max_htables)
		mmu.hash_cnt >>= 1;
	mmu.vlp_hash_cnt = mmu.hash_cnt;

#if defined(__amd64)
	/*
	 * If running in 64 bits and physical memory is large,
	 * increase the size of the cache to cover all of memory for
	 * a 64 bit process.
	 */
#define	HASH_MAX_LENGTH 4
	while (mmu.hash_cnt * HASH_MAX_LENGTH < max_htables)
		mmu.hash_cnt <<= 1;
#endif
}
/*
 * initialize hat data structures
 */
void
hat_init()
{
#if defined(__i386)
	/*
	 * _userlimit must be aligned correctly
	 */
	if ((_userlimit & LEVEL_MASK(1)) != _userlimit) {
		prom_printf("hat_init(): _userlimit=%p, not aligned at %p\n",
		    (void *)_userlimit, (void *)LEVEL_SIZE(1));
		halt("hat_init(): Unable to continue");
	}
#endif	/* __i386 */

	cv_init(&hat_list_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * initialize kmem caches
	 */
	htable_init();
	hment_init();

	hat_cache = kmem_cache_create("hat_t",
	    sizeof (hat_t), 0, hati_constructor, NULL, NULL,
	    NULL, 0, 0);

	hat_hash_cache = kmem_cache_create("HatHash",
	    mmu.hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL,
	    NULL, 0, 0);

	/*
	 * VLP hats can use a smaller hash table size on large memory machines
	 */
	if (mmu.hash_cnt == mmu.vlp_hash_cnt) {
		vlp_hash_cache = hat_hash_cache;
	} else {
		vlp_hash_cache = kmem_cache_create("HatVlpHash",
		    mmu.vlp_hash_cnt * sizeof (htable_t *), 0, NULL, NULL,
		    NULL, NULL, 0, 0);
	}

	/*
	 * Set up the kernel's hat
	 */
	AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER);
	kas.a_hat = kmem_cache_alloc(hat_cache, KM_NOSLEEP);
	mutex_init(&kas.a_hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
	kas.a_hat->hat_as = &kas;
	kas.a_hat->hat_flags = 0;
	AS_LOCK_EXIT(&kas, &kas.a_lock);

	CPUSET_ZERO(khat_cpuset);
	CPUSET_ADD(khat_cpuset, CPU->cpu_id);

	/*
	 * The kernel hat's next pointer serves as the head of the hat list.
	 * The kernel hat's prev pointer tracks the last hat on the list for
	 * htable_steal() to use.
	 */
	kas.a_hat->hat_next = NULL;
	kas.a_hat->hat_prev = NULL;

	/*
	 * Allocate an htable hash bucket for the kernel
	 * XX64 - tune for 64 bit procs
	 */
	kas.a_hat->hat_num_hash = mmu.hash_cnt;
	kas.a_hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_NOSLEEP);
	bzero(kas.a_hat->hat_ht_hash, mmu.hash_cnt * sizeof (htable_t *));

	/*
	 * zero out the top level and cached htable pointers
	 */
	kas.a_hat->hat_ht_cached = NULL;
	kas.a_hat->hat_htable = NULL;

	/*
	 * Pre-allocate hrm_hashtab before enabling the collection of
	 * refmod statistics.  Allocating on the fly would mean us
	 * running the risk of suffering recursive mutex enters or
	 * deadlocks.
	 */
	hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
	    KM_SLEEP);
}
/*
 * Prepare CPU specific pagetables for VLP processes on 64 bit kernels.
 *
 * Each CPU has a set of 2 pagetables that are reused for any 32 bit
 * process it runs. They are the top level pagetable, hci_vlp_l3ptes, and
 * the next to top level table for the bottom 512 Gig, hci_vlp_l2ptes.
 */
/*ARGSUSED*/
static void
hat_vlp_setup(struct cpu *cpu)
{
#if defined(__amd64) && !defined(__xpv)
	struct hat_cpu_info *hci = cpu->cpu_hat_info;
	pfn_t pfn;

	/*
	 * allocate the level==2 page table for the bottom most
	 * 512Gig of address space (this is where 32 bit apps live)
	 */
	ASSERT(hci != NULL);
	hci->hci_vlp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);

	/*
	 * Allocate a top level pagetable and copy the kernel's
	 * entries into it. Then link in hci_vlp_l2ptes in the 1st entry.
	 */
	hci->hci_vlp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
	hci->hci_vlp_pfn =
	    hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l3ptes);
	ASSERT(hci->hci_vlp_pfn != PFN_INVALID);
	bcopy(vlp_page, hci->hci_vlp_l3ptes, MMU_PAGESIZE);

	pfn = hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l2ptes);
	ASSERT(pfn != PFN_INVALID);
	hci->hci_vlp_l3ptes[0] = MAKEPTP(pfn, 2);
#endif /* __amd64 && !__xpv */
}
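/*
 * The resulting hierarchy: hci_vlp_l3ptes is a per-CPU top level table
 * whose entry 0 points at hci_vlp_l2ptes. At context switch time,
 * hat_switch() copies the process's 4 VLP PTEs into hci_vlp_l2ptes via
 * VLP_COPY and points cr3 at hci_vlp_pfn.
 */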
static void
hat_vlp_teardown(cpu_t *cpu)
{
#if defined(__amd64) && !defined(__xpv)
	struct hat_cpu_info *hci;

	if ((hci = cpu->cpu_hat_info) == NULL)
		return;
	if (hci->hci_vlp_l2ptes)
		kmem_free(hci->hci_vlp_l2ptes, MMU_PAGESIZE);
	if (hci->hci_vlp_l3ptes)
		kmem_free(hci->hci_vlp_l3ptes, MMU_PAGESIZE);
#endif
}
#define	NEXT_HKR(r, l, s, e) {			\
	kernel_ranges[r].hkr_level = l;		\
	kernel_ranges[r].hkr_start_va = s;	\
	kernel_ranges[r].hkr_end_va = e;	\
	++r;					\
}
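/*
 * Note that NEXT_HKR() advances r itself, so hat_init_finish() can simply
 * assign num_kernel_ranges = r after its sequence of NEXT_HKR() calls.
 */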
/*
 * Finish filling in the kernel hat.
 * Pre fill in all top level kernel page table entries for the kernel's
 * part of the address range. From this point on we can't use any new
 * kernel large pages if they need PTE's at max_level
 *
 * create the kmap mappings.
 */
void
hat_init_finish(void)
{
	size_t		size;
	uint_t		r = 0;
	uintptr_t	va;
	hat_kernel_range_t *rp;

	/*
	 * We are now effectively running on the kernel hat.
	 * Clearing use_boot_reserve shuts off using the pre-allocated boot
	 * reserve for all HAT allocations.  From here on, the reserves are
	 * only used when avoiding recursion in kmem_alloc().
	 */
	use_boot_reserve = 0;
	htable_adjust_reserve();

	/*
	 * User HATs are initialized with copies of all kernel mappings in
	 * higher level page tables. Ensure that those entries exist.
	 */
#if defined(__amd64)

	NEXT_HKR(r, 3, kernelbase, 0);
#if defined(__xpv)
	NEXT_HKR(r, 3, HYPERVISOR_VIRT_START, HYPERVISOR_VIRT_END);
#endif

#elif defined(__i386)

#if !defined(__xpv)
	if (mmu.pae_hat) {
		va = kernelbase;
		if ((va & LEVEL_MASK(2)) != va) {
			va = P2ROUNDUP(va, LEVEL_SIZE(2));
			NEXT_HKR(r, 1, kernelbase, va);
		}
		if (va != 0)
			NEXT_HKR(r, 2, va, 0);
	} else
#endif /* __xpv */
		NEXT_HKR(r, 1, kernelbase, 0);

#endif /* __i386 */

	num_kernel_ranges = r;

	/*
	 * Create all the kernel pagetables that will have entries
	 * shared to user HATs.
	 */
	for (r = 0; r < num_kernel_ranges; ++r) {
		rp = &kernel_ranges[r];
		for (va = rp->hkr_start_va; va != rp->hkr_end_va;
		    va += LEVEL_SIZE(rp->hkr_level)) {
			htable_t *ht;

			if (IN_HYPERVISOR_VA(va))
				continue;

			/* can/must skip if a page mapping already exists */
			if (rp->hkr_level <= mmu.max_page_level &&
			    (ht = htable_getpage(kas.a_hat, va, NULL)) !=
			    NULL) {
				htable_release(ht);
				continue;
			}

			(void) htable_create(kas.a_hat, va, rp->hkr_level - 1,
			    NULL);
		}
	}

	/*
	 * 32 bit PAE metal kernels use only 4 of the 512 entries in the
	 * page holding the top level pagetable. We use the remainder for
	 * the "per CPU" page tables for VLP processes.
	 * Map the top level kernel pagetable into the kernel to make
	 * it easy to use bcopy to access these tables.
	 */
	if (mmu.pae_hat) {
		vlp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
		hat_devload(kas.a_hat, (caddr_t)vlp_page, MMU_PAGESIZE,
		    kas.a_hat->hat_htable->ht_pfn,
#if !defined(__xpv)
		    PROT_WRITE |
#endif
		    PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
		    HAT_LOAD | HAT_LOAD_NOCONSIST);
	}
	hat_vlp_setup(CPU);

	/*
	 * Create kmap (cached mappings of kernel PTEs)
	 * for 32 bit we map from segmap_start .. ekernelheap
	 * for 64 bit we map from segmap_start .. segmap_start + segmapsize;
	 */
#if defined(__i386)
	size = (uintptr_t)ekernelheap - segmap_start;
#elif defined(__amd64)
	size = segmapsize;
#endif
	hat_kmap_init((uintptr_t)segmap_start, size);
}
/*
 * On 32 bit PAE mode, PTE's are 64 bits, but ordinary atomic memory references
 * are 32 bit, so for safety we must use atomic_cas_64() to install these.
 */
#ifdef __i386
static void
reload_pae32(hat_t *hat, cpu_t *cpu)
{
	x86pte_t *src;
	x86pte_t *dest;
	x86pte_t pte;
	int i;

	/*
	 * Load the 4 entries of the level 2 page table into this
	 * cpu's range of the vlp_page and point cr3 at them.
	 */
	ASSERT(mmu.pae_hat);
	src = hat->hat_vlp_ptes;
	dest = vlp_page + (cpu->cpu_id + 1) * VLP_NUM_PTES;
	for (i = 0; i < VLP_NUM_PTES; ++i) {
		pte = dest[i];
		while (pte != src[i]) {
			if (atomic_cas_64(dest + i, pte, src[i]) != src[i])
				pte = dest[i];
			else
				break;
		}
	}
}
#endif
/*
 * Switch to a new active hat, maintaining bit masks to track active CPUs.
 *
 * On the 32-bit PAE hypervisor, %cr3 is a 64-bit value, on metal it
 * remains a 32-bit value.
 */
void
hat_switch(hat_t *hat)
{
	uint64_t	newcr3;
	cpu_t		*cpu = CPU;
	hat_t		*old = cpu->cpu_current_hat;

	/*
	 * set up this information first, so we don't miss any cross calls
	 */
	if (old != NULL) {
		if (old == hat)
			return;
		if (old != kas.a_hat)
			CPUSET_ATOMIC_DEL(old->hat_cpus, cpu->cpu_id);
	}

	/*
	 * Add this CPU to the active set for this HAT.
	 */
	if (hat != kas.a_hat) {
		CPUSET_ATOMIC_ADD(hat->hat_cpus, cpu->cpu_id);
	}
	cpu->cpu_current_hat = hat;

	/*
	 * now go ahead and load cr3
	 */
	if (hat->hat_flags & HAT_VLP) {
#if defined(__amd64)
		x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes;

		VLP_COPY(hat->hat_vlp_ptes, vlpptep);
		newcr3 = MAKECR3(cpu->cpu_hat_info->hci_vlp_pfn);
#elif defined(__i386)
		reload_pae32(hat, cpu);
		newcr3 = MAKECR3(kas.a_hat->hat_htable->ht_pfn) +
		    (cpu->cpu_id + 1) * VLP_SIZE;
#endif
	} else {
		newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn);
	}
#ifdef __xpv
	{
		struct mmuext_op t[2];
		uint_t retcnt;
		uint_t opcnt = 1;

		t[0].cmd = MMUEXT_NEW_BASEPTR;
		t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
#if defined(__amd64)
		/*
		 * There's an interesting problem here, as to what to
		 * actually specify when switching to the kernel hat.
		 * For now we'll reuse the kernel hat again.
		 */
		t[1].cmd = MMUEXT_NEW_USER_BASEPTR;
		if (hat == kas.a_hat)
			t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
		else
			t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable);
		++opcnt;
#endif	/* __amd64 */
		if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0)
			panic("HYPERVISOR_mmu_update() failed");
		ASSERT(retcnt == opcnt);
	}
#else
	setcr3(newcr3);
#endif
}
/*
 * Utility to return a valid x86pte_t from protections, pfn, and level number
 */
static x86pte_t
hati_mkpte(pfn_t pfn, uint_t attr, level_t level, uint_t flags)
{
	x86pte_t	pte;
	uint_t		cache_attr = attr & HAT_ORDER_MASK;

	pte = MAKEPTE(pfn, level);

	if (attr & PROT_WRITE)
		PTE_SET(pte, PT_WRITABLE);

	if (attr & PROT_USER)
		PTE_SET(pte, PT_USER);

	if (!(attr & PROT_EXEC))
		PTE_SET(pte, mmu.pt_nx);

	/*
	 * Set the software bits used to track ref/mod sync's and hments.
	 * If not using REF/MOD, set them to avoid h/w rewriting PTEs.
	 */
	if (flags & HAT_LOAD_NOCONSIST)
		PTE_SET(pte, PT_NOCONSIST | PT_REF | PT_MOD);
	else if (attr & HAT_NOSYNC)
		PTE_SET(pte, PT_NOSYNC | PT_REF | PT_MOD);

	/*
	 * Set the caching attributes in the PTE. The combination
	 * of attributes is poorly defined, so we pay attention
	 * to them in the given order.
	 *
	 * The test for HAT_STRICTORDER is different because it's defined
	 * as "0" - which was a stupid thing to do, but is too late to change!
	 */
	if (cache_attr == HAT_STRICTORDER) {
		PTE_SET(pte, PT_NOCACHE);
	/*LINTED [Lint hates empty ifs, but it's the obvious way to do this] */
	} else if (cache_attr & (HAT_UNORDERED_OK | HAT_STORECACHING_OK)) {
		/* nothing to set */;
	} else if (cache_attr & (HAT_MERGING_OK | HAT_LOADCACHING_OK)) {
		PTE_SET(pte, PT_NOCACHE);
		if (is_x86_feature(x86_featureset, X86FSET_PAT))
			PTE_SET(pte, (level == 0) ? PT_PAT_4K : PT_PAT_LARGE);
		else
			PTE_SET(pte, PT_WRITETHRU);
	} else {
		panic("hati_mkpte(): bad caching attributes: %x\n",
		    cache_attr);
	}

	return (pte);
}
/*
 * Duplicate address translations of the parent to the child.
 * This function really isn't used anymore.
 */
/*ARGSUSED*/
int
hat_dup(hat_t *old, hat_t *new, caddr_t addr, size_t len, uint_t flag)
{
	ASSERT((uintptr_t)addr < kernelbase);
	ASSERT(new != kas.a_hat);
	ASSERT(old != kas.a_hat);
	return (0);
}
/*
 * Allocate any hat resources required for a process being swapped in.
 */
/*ARGSUSED*/
void
hat_swapin(hat_t *hat)
{
	/* do nothing - we let everything fault back in */
}
/*
 * Unload all translations associated with an address space of a process
 * that is being swapped out.
 */
void
hat_swapout(hat_t *hat)
{
	uintptr_t	vaddr = (uintptr_t)0;
	uintptr_t	eaddr = _userlimit;
	htable_t	*ht = NULL;
	level_t		l;

	XPV_DISALLOW_MIGRATE();
	/*
	 * We can't just call hat_unload(hat, 0, _userlimit...)  here, because
	 * seg_spt and shared pagetables can't be swapped out.
	 * Take a look at segspt_shmswapout() - it's a big no-op.
	 *
	 * Instead we'll walk through all the address space and unload
	 * any mappings which we are sure are not shared, not locked.
	 */
	ASSERT(IS_PAGEALIGNED(vaddr));
	ASSERT(IS_PAGEALIGNED(eaddr));
	ASSERT(AS_LOCK_HELD(hat->hat_as, &hat->hat_as->a_lock));
	if ((uintptr_t)hat->hat_as->a_userlimit < eaddr)
		eaddr = (uintptr_t)hat->hat_as->a_userlimit;

	while (vaddr < eaddr) {
		(void) htable_walk(hat, &ht, &vaddr, eaddr);
		if (ht == NULL)
			break;

		ASSERT(!IN_VA_HOLE(vaddr));

		/*
		 * If the page table is shared skip its entire range.
		 */
		l = ht->ht_level;
		if (ht->ht_flags & HTABLE_SHARED_PFN) {
			vaddr = ht->ht_vaddr + LEVEL_SIZE(l + 1);
			htable_release(ht);
			ht = NULL;
			continue;
		}

		/*
		 * If the page table has no locked entries, unload this one.
		 */
		if (ht->ht_lock_cnt == 0)
			hat_unload(hat, (caddr_t)vaddr, LEVEL_SIZE(l),
			    HAT_UNLOAD_UNMAP);

		/*
		 * If we have a level 0 page table with locked entries,
		 * skip the entire page table, otherwise skip just one entry.
		 */
		if (ht->ht_lock_cnt > 0 && l == 0)
			vaddr = ht->ht_vaddr + LEVEL_SIZE(1);
		else
			vaddr += LEVEL_SIZE(l);
	}
	if (ht)
		htable_release(ht);

	/*
	 * We're in swapout because the system is low on memory, so
	 * go back and flush all the htables off the cached list.
	 */
	htable_purge_hat(hat);
	XPV_ALLOW_MIGRATE();
}
/*
 * returns number of bytes that have valid mappings in hat.
 */
size_t
hat_get_mapped_size(hat_t *hat)
{
	size_t	total = 0;
	int	l;

	for (l = 0; l <= mmu.max_page_level; l++)
		total += (hat->hat_pages_mapped[l] << LEVEL_SHIFT(l));
	total += hat->hat_ism_pgcnt;

	return (total);
}
/*
 * enable/disable collection of stats for hat.
 */
int
hat_stats_enable(hat_t *hat)
{
	atomic_inc_32(&hat->hat_stats);
	return (1);
}

void
hat_stats_disable(hat_t *hat)
{
	atomic_dec_32(&hat->hat_stats);
}
/*
 * Utility to sync the ref/mod bits from a page table entry to the page_t
 * We must be holding the mapping list lock when this is called.
 */
static void
hati_sync_pte_to_page(page_t *pp, x86pte_t pte, level_t level)
{
	uint_t	rm = 0;
	pgcnt_t	pgcnt;

	if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC)
		return;

	if (PTE_GET(pte, PT_REF))
		rm |= P_REF;

	if (PTE_GET(pte, PT_MOD))
		rm |= P_MOD;

	if (rm == 0)
		return;

	/*
	 * sync to all constituent pages of a large page
	 */
	ASSERT(x86_hm_held(pp));
	pgcnt = page_get_pagecnt(level);
	ASSERT(IS_P2ALIGNED(pp->p_pagenum, pgcnt));
	for (; pgcnt > 0; --pgcnt) {
		/*
		 * hat_page_demote() can't decrease
		 * pszc below this mapping size
		 * since this large mapping existed after we
		 * took mlist lock.
		 */
		ASSERT(pp->p_szc >= level);
		hat_page_setattr(pp, rm);
		++pp;
	}
}
/*
 * This is the set of PTE bits for PFN, permissions and caching
 * that are allowed to change on a HAT_LOAD_REMAP
 */
#define	PT_REMAP_BITS							\
	(PT_PADDR | PT_NX | PT_WRITABLE | PT_WRITETHRU |		\
	PT_NOCACHE | PT_PAT_4K | PT_PAT_LARGE | PT_IGNORE | PT_REF | PT_MOD)

#define	REMAPASSERT(EX)	if (!(EX)) panic("hati_pte_map: " #EX)
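/*
 * Any PTE bit outside of PT_REMAP_BITS must be identical in the old and
 * new PTEs, otherwise hati_pte_map() panics with "remap bits changed".
 */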
/*
 * Do the low-level work to get a mapping entered into a HAT's pagetables
 * and in the mapping list of the associated page_t.
 */
static int
hati_pte_map(
	htable_t	*ht,
	uint_t		entry,
	page_t		*pp,
	x86pte_t	pte,
	int		flags,
	void		*pte_ptr)
{
	hat_t		*hat = ht->ht_hat;
	x86pte_t	old_pte;
	level_t		l = ht->ht_level;
	hment_t		*hm;
	uint_t		is_consist;
	uint_t		is_locked;
	int		rv = 0;

	/*
	 * Is this a consistent (ie. need mapping list lock) mapping?
	 */
	is_consist = (pp != NULL && (flags & HAT_LOAD_NOCONSIST) == 0);

	/*
	 * Track locked mapping count in the htable.  Do this first,
	 * as we track locking even if there already is a mapping present.
	 */
	is_locked = (flags & HAT_LOAD_LOCK) != 0 && hat != kas.a_hat;
	if (is_locked)
		HTABLE_LOCK_INC(ht);

	/*
	 * Acquire the page's mapping list lock and get an hment to use.
	 * Note that hment_prepare() might return NULL.
	 */
	if (is_consist) {
		x86_hm_enter(pp);
		hm = hment_prepare(ht, entry, pp);
	}

	/*
	 * Set the new pte, retrieving the old one at the same time.
	 */
	old_pte = x86pte_set(ht, entry, pte, pte_ptr);

	/*
	 * Did we get a large page / page table collision?
	 */
	if (old_pte == LPAGE_ERROR) {
		if (is_locked)
			HTABLE_LOCK_DEC(ht);
		rv = -1;
		goto done;
	}

	/*
	 * If the mapping didn't change there is nothing more to do.
	 */
	if (PTE_EQUIV(pte, old_pte))
		goto done;

	/*
	 * Install a new mapping in the page's mapping list
	 */
	if (!PTE_ISVALID(old_pte)) {
		if (is_consist) {
			hment_assign(ht, entry, pp, hm);
			x86_hm_exit(pp);
		} else {
			ASSERT(flags & HAT_LOAD_NOCONSIST);
		}
#if defined(__amd64)
		if (ht->ht_flags & HTABLE_VLP) {
			cpu_t *cpu = CPU;
			x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes;

			VLP_COPY(hat->hat_vlp_ptes, vlpptep);
		}
#endif
		HTABLE_INC(ht->ht_valid_cnt);
		PGCNT_INC(hat, l);
		return (rv);
	}

	/*
	 * Remap's are more complicated:
	 * - HAT_LOAD_REMAP must be specified if changing the pfn.
	 *   We also require that NOCONSIST be specified.
	 * - Otherwise only permission or caching bits may change.
	 */
	if (!PTE_ISPAGE(old_pte, l))
		panic("non-null/page mapping pte=" FMT_PTE, old_pte);

	if (PTE2PFN(old_pte, l) != PTE2PFN(pte, l)) {
		REMAPASSERT(flags & HAT_LOAD_REMAP);
		REMAPASSERT(flags & HAT_LOAD_NOCONSIST);
		REMAPASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST);
		REMAPASSERT(pf_is_memory(PTE2PFN(old_pte, l)) ==
		    pf_is_memory(PTE2PFN(pte, l)));
		REMAPASSERT(!is_consist);
	}

	/*
	 * We only let remaps change certain bits in the PTE.
	 */
	if (PTE_GET(old_pte, ~PT_REMAP_BITS) != PTE_GET(pte, ~PT_REMAP_BITS))
		panic("remap bits changed: old_pte="FMT_PTE", pte="FMT_PTE"\n",
		    old_pte, pte);

	/*
	 * We don't create any mapping list entries on a remap, so release
	 * any allocated hment after we drop the mapping list lock.
	 */
done:
	if (is_consist) {
		x86_hm_exit(pp);
		if (hm != NULL)
			hment_free(hm);
	}
	return (rv);
}
/*
 * Internal routine to load a single page table entry. This only fails if
 * we attempt to overwrite a page table link with a large page.
 */
static int
hati_load_common(
	hat_t		*hat,
	uintptr_t	va,
	page_t		*pp,
	uint_t		attr,
	uint_t		flags,
	level_t		level,
	pfn_t		pfn)
{
	htable_t	*ht;
	uint_t		entry;
	x86pte_t	pte;
	int		rv = 0;

	/*
	 * The number 16 is arbitrary and here to catch a recursion problem
	 * early before we blow out the kernel stack.
	 */
	++curthread->t_hatdepth;
	ASSERT(curthread->t_hatdepth < 16);

	ASSERT(hat == kas.a_hat ||
	    AS_LOCK_HELD(hat->hat_as, &hat->hat_as->a_lock));

	if (flags & HAT_LOAD_SHARE)
		hat->hat_flags |= HAT_SHARED;

	/*
	 * Find the page table that maps this page if it already exists.
	 */
	ht = htable_lookup(hat, va, level);

	/*
	 * We must have HAT_LOAD_NOCONSIST if page_t is NULL.
	 */
	if (pp == NULL)
		flags |= HAT_LOAD_NOCONSIST;

	if (ht == NULL) {
		ht = htable_create(hat, va, level, NULL);
		ASSERT(ht != NULL);
	}
	entry = htable_va2entry(va, ht);

	/*
	 * a bunch of paranoid error checking
	 */
	ASSERT(ht->ht_busy > 0);
	if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht))
		panic("hati_load_common: bad htable %p, va %p",
		    (void *)ht, (void *)va);
	ASSERT(ht->ht_level == level);

	/*
	 * construct the new PTE
	 */
	if (hat == kas.a_hat)
		attr &= ~PROT_USER;
	pte = hati_mkpte(pfn, attr, level, flags);
	if (hat == kas.a_hat && va >= kernelbase)
		PTE_SET(pte, mmu.pt_global);

	/*
	 * establish the mapping
	 */
	rv = hati_pte_map(ht, entry, pp, pte, flags, NULL);

	/*
	 * release the htable and any reserves
	 */
	htable_release(ht);
	--curthread->t_hatdepth;
	return (rv);
}
/*
 * special case of hat_memload to deal with some kernel addrs for performance
 */
static void
hat_kmap_load(
	caddr_t		addr,
	page_t		*pp,
	uint_t		attr,
	uint_t		flags)
{
	uintptr_t	va = (uintptr_t)addr;
	x86pte_t	pte;
	pfn_t		pfn = page_pptonum(pp);
	pgcnt_t		pg_off = mmu_btop(va - mmu.kmap_addr);
	htable_t	*ht;
	uint_t		entry;
	void		*pte_ptr;

	/*
	 * construct the requested PTE
	 */
	attr &= ~PROT_USER;
	attr |= HAT_STORECACHING_OK;
	pte = hati_mkpte(pfn, attr, 0, flags);
	PTE_SET(pte, mmu.pt_global);

	/*
	 * Figure out the pte_ptr and htable and use common code to finish up
	 */
	if (mmu.pae_hat)
		pte_ptr = mmu.kmap_ptes + pg_off;
	else
		pte_ptr = (x86pte32_t *)mmu.kmap_ptes + pg_off;
	ht = mmu.kmap_htables[(va - mmu.kmap_htables[0]->ht_vaddr) >>
	    LEVEL_SHIFT(1)];
	entry = htable_va2entry(va, ht);
	++curthread->t_hatdepth;
	ASSERT(curthread->t_hatdepth < 16);
	(void) hati_pte_map(ht, entry, pp, pte, flags, pte_ptr);
	--curthread->t_hatdepth;
}
/*
 * hat_memload() - load a translation to the given page struct
 *
 * Flags for hat_memload/hat_devload/hat_*attr.
 *
 *	HAT_LOAD	Default flags to load a translation to the page.
 *
 *	HAT_LOAD_LOCK	Lock down mapping resources; hat_map(), hat_memload(),
 *			and hat_devload().
 *
 *	HAT_LOAD_NOCONSIST	Do not add mapping to page_t mapping list.
 *
 *	HAT_LOAD_SHARE	A flag to hat_memload() to indicate h/w page tables
 *			that map some user pages (not kas) are shared by more
 *			than one process (eg. ISM).
 *
 *	HAT_LOAD_REMAP	Reload a valid pte with a different page frame.
 *
 *	HAT_NO_KALLOC	Do not kmem_alloc while creating the mapping; at this
 *			point, it's setting up mapping to allocate internal
 *			hat layer data structures. This flag forces hat layer
 *			to tap its reserves in order to prevent infinite
 *			recursion.
 *
 * The following is a protection attribute (like PROT_READ, etc.)
 *
 *	HAT_NOSYNC	set PT_NOSYNC - this mapping's ref/mod bits
 *			are never cleared.
 *
 * Installing new valid PTE's and creation of the mapping list
 * entry are controlled under the same lock. It's derived from the
 * page_t being mapped.
 */
static uint_t supported_memload_flags =
	HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_ADV | HAT_LOAD_NOCONSIST |
	HAT_LOAD_SHARE | HAT_NO_KALLOC | HAT_LOAD_REMAP | HAT_LOAD_TEXT;
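/*
 * Illustrative (hypothetical) usage: a segment driver, with the address
 * space lock held and the page locked, mapping a page read/write would do:
 *
 *	hat_memload(as->a_hat, addr, pp, PROT_READ | PROT_WRITE, HAT_LOAD);
 */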
void
hat_memload(hat_t *hat, caddr_t addr, page_t *pp, uint_t attr, uint_t flags)
{
	uintptr_t	va = (uintptr_t)addr;
	level_t		level = 0;
	pfn_t		pfn = page_pptonum(pp);

	XPV_DISALLOW_MIGRATE();
	ASSERT(IS_PAGEALIGNED(va));
	ASSERT(hat == kas.a_hat || va < _userlimit);
	ASSERT(hat == kas.a_hat ||
	    AS_LOCK_HELD(hat->hat_as, &hat->hat_as->a_lock));
	ASSERT((flags & supported_memload_flags) == flags);

	ASSERT(!IN_VA_HOLE(va));
	ASSERT(!PP_ISFREE(pp));

	/*
	 * kernel address special case for performance.
	 */
	if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) {
		ASSERT(hat == kas.a_hat);
		hat_kmap_load(addr, pp, attr, flags);
		XPV_ALLOW_MIGRATE();
		return;
	}

	/*
	 * This is used for memory with normal caching enabled, so
	 * always set HAT_STORECACHING_OK.
	 */
	attr |= HAT_STORECACHING_OK;
	if (hati_load_common(hat, va, pp, attr, flags, level, pfn) != 0)
		panic("unexpected hati_load_common() failure");
	XPV_ALLOW_MIGRATE();
}
/* ARGSUSED */
void
hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp,
    uint_t attr, uint_t flags, hat_region_cookie_t rcookie)
{
	hat_memload(hat, addr, pp, attr, flags);
}
/*
 * Load the given array of page structs using large pages when possible
 */
void
hat_memload_array(
	hat_t		*hat,
	caddr_t		addr,
	size_t		len,
	page_t		**pages,
	uint_t		attr,
	uint_t		flags)
{
	uintptr_t	va = (uintptr_t)addr;
	uintptr_t	eaddr = va + len;
	level_t		level;
	size_t		pgsize;
	pgcnt_t		pgindx = 0;
	pfn_t		pfn;
	pgcnt_t		i;

	XPV_DISALLOW_MIGRATE();
	ASSERT(IS_PAGEALIGNED(va));
	ASSERT(hat == kas.a_hat || va + len <= _userlimit);
	ASSERT(hat == kas.a_hat ||
	    AS_LOCK_HELD(hat->hat_as, &hat->hat_as->a_lock));
	ASSERT((flags & supported_memload_flags) == flags);

	/*
	 * memload is used for memory with full caching enabled, so
	 * set HAT_STORECACHING_OK.
	 */
	attr |= HAT_STORECACHING_OK;

	/*
	 * handle all pages using largest possible pagesize
	 */
	while (va < eaddr) {
		/*
		 * decide what level mapping to use (ie. pagesize)
		 */
		pfn = page_pptonum(pages[pgindx]);
		for (level = mmu.max_page_level; ; --level) {
			pgsize = LEVEL_SIZE(level);
			if (level == 0)
				break;

			if (!IS_P2ALIGNED(va, pgsize) ||
			    (eaddr - va) < pgsize ||
			    !IS_P2ALIGNED(pfn_to_pa(pfn), pgsize))
				continue;

			/*
			 * To use a large mapping of this size, all the
			 * pages we are passed must be sequential subpages
			 * of the large page.
			 * hat_page_demote() can't change p_szc because
			 * all pages are locked.
			 */
			if (pages[pgindx]->p_szc >= level) {
				for (i = 0; i < mmu_btop(pgsize); ++i) {
					if (pfn + i !=
					    page_pptonum(pages[pgindx + i]))
						break;
					ASSERT(pages[pgindx + i]->p_szc >=
					    level);
					ASSERT(pages[pgindx] + i ==
					    pages[pgindx + i]);
				}
				if (i == mmu_btop(pgsize))
					break;
			}
		}

		/*
		 * Load this page mapping. If the load fails, try a smaller
		 * pagesize.
		 */
		ASSERT(!IN_VA_HOLE(va));
		while (hati_load_common(hat, va, pages[pgindx], attr,
		    flags, level, pfn) != 0) {
			if (level == 0)
				panic("unexpected hati_load_common() failure");
			--level;
			pgsize = LEVEL_SIZE(level);
		}

		/*
		 * move to next page
		 */
		va += pgsize;
		pgindx += mmu_btop(pgsize);
	}
	XPV_ALLOW_MIGRATE();
}

/* ARGSUSED */
void
hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
    struct page **pps, uint_t attr, uint_t flags,
    hat_region_cookie_t rcookie)
{
	hat_memload_array(hat, addr, len, pps, attr, flags);
}
/*
 * void hat_devload(hat, addr, len, pf, attr, flags)
 *	load/lock the given page frame number
 *
 * Advisory ordering attributes. Apply only to device mappings.
 *
 * HAT_STRICTORDER: the CPU must issue the references in order, as the
 *	programmer specified.  This is the default.
 * HAT_UNORDERED_OK: the CPU may reorder the references (this is all kinds
 *	of reordering; store or load with store or load).
 * HAT_MERGING_OK: merging and batching: the CPU may merge individual stores
 *	to consecutive locations (for example, turn two consecutive byte
 *	stores into one halfword store), and it may batch individual loads
 *	(for example, turn two consecutive byte loads into one halfword load).
 *	This also implies re-ordering.
 * HAT_LOADCACHING_OK: the CPU may cache the data it fetches and reuse it
 *	until another store occurs.  The default is to fetch new data
 *	on every load.  This also implies merging.
 * HAT_STORECACHING_OK: the CPU may keep the data in the cache and push it to
 *	the device (perhaps with other data) at a later time.  The default is
 *	to push the data right away.  This also implies load caching.
 *
 * Equivalent of hat_memload(), but can be used for device memory where
 * there are no page_t's and we support additional flags (write merging, etc).
 * Note that we can have large page mappings with this interface.
 */
int supported_devload_flags = HAT_LOAD | HAT_LOAD_LOCK |
	HAT_LOAD_NOCONSIST | HAT_STRICTORDER | HAT_UNORDERED_OK |
	HAT_MERGING_OK | HAT_LOADCACHING_OK | HAT_STORECACHING_OK;
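/*
 * Illustrative (hypothetical) usage: a driver mapping a page of device
 * registers uncached, with strict ordering, might do:
 *
 *	hat_devload(kas.a_hat, va, MMU_PAGESIZE, pfn,
 *	    PROT_READ | PROT_WRITE | HAT_STRICTORDER,
 *	    HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
 */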
void
hat_devload(
	hat_t		*hat,
	caddr_t		addr,
	size_t		len,
	pfn_t		pfn,
	uint_t		attr,
	int		flags)
{
	uintptr_t	va = ALIGN2PAGE(addr);
	uintptr_t	eva = va + len;
	level_t		level;
	size_t		pgsize;
	page_t		*pp;
	int		f;	/* per PTE copy of flags  - maybe modified */
	uint_t		a;	/* per PTE copy of attr */

	XPV_DISALLOW_MIGRATE();
	ASSERT(IS_PAGEALIGNED(va));
	ASSERT(hat == kas.a_hat || eva <= _userlimit);
	ASSERT(hat == kas.a_hat ||
	    AS_LOCK_HELD(hat->hat_as, &hat->hat_as->a_lock));
	ASSERT((flags & supported_devload_flags) == flags);

	/*
	 * handle all pages
	 */
	while (va < eva) {

		/*
		 * decide what level mapping to use (ie. pagesize)
		 */
		for (level = mmu.max_page_level; ; --level) {
			pgsize = LEVEL_SIZE(level);
			if (level == 0)
				break;
			if (IS_P2ALIGNED(va, pgsize) &&
			    (eva - va) >= pgsize &&
			    IS_P2ALIGNED(pfn, mmu_btop(pgsize)))
				break;
		}

		/*
		 * If this is just memory then allow caching (this happens
		 * for the nucleus pages) - though HAT_PLAT_NOCACHE can be used
		 * to override that. If we don't have a page_t then make sure
		 * NOCONSIST is set.
		 */
		a = attr;
		f = flags;
		if (!pf_is_memory(pfn))
			f |= HAT_LOAD_NOCONSIST;
		else if (!(a & HAT_PLAT_NOCACHE))
			a |= HAT_STORECACHING_OK;

		if (f & HAT_LOAD_NOCONSIST)
			pp = NULL;
		else
			pp = page_numtopp_nolock(pfn);

		/*
		 * Check to make sure we are really trying to map a valid
		 * memory page. The caller wishing to intentionally map
		 * free memory pages will have passed the HAT_LOAD_NOCONSIST
		 * flag, then pp will be NULL.
		 */
		if (pp != NULL) {
			if (PP_ISFREE(pp)) {
				panic("hat_devload: loading "
				    "a mapping to free page %p", (void *)pp);
			}

			if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) {
				panic("hat_devload: loading a mapping "
				    "to an unlocked page %p",
				    (void *)pp);
			}
		}

		/*
		 * load this page mapping
		 */
		ASSERT(!IN_VA_HOLE(va));
		while (hati_load_common(hat, va, pp, a, f, level, pfn) != 0) {
			if (level == 0)
				panic("unexpected hati_load_common() failure");
			--level;
			pgsize = LEVEL_SIZE(level);
		}

		/*
		 * move to next page
		 */
		va += pgsize;
		pfn += mmu_btop(pgsize);
	}
	XPV_ALLOW_MIGRATE();
}
/*
 * void hat_unlock(hat, addr, len)
 *	unlock the mappings to a given range of addresses
 *
 * Locks are tracked by ht_lock_cnt in the htable.
 */
void
hat_unlock(hat_t *hat, caddr_t addr, size_t len)
{
	uintptr_t	vaddr = (uintptr_t)addr;
	uintptr_t	eaddr = vaddr + len;
	htable_t	*ht = NULL;

	/*
	 * kernel entries are always locked, we don't track lock counts
	 */
	ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
	ASSERT(IS_PAGEALIGNED(vaddr));
	ASSERT(IS_PAGEALIGNED(eaddr));
	if (hat == kas.a_hat)
		return;
	if (eaddr > _userlimit)
		panic("hat_unlock() address out of range - above _userlimit");

	XPV_DISALLOW_MIGRATE();
	ASSERT(AS_LOCK_HELD(hat->hat_as, &hat->hat_as->a_lock));
	while (vaddr < eaddr) {
		(void) htable_walk(hat, &ht, &vaddr, eaddr);
		if (ht == NULL)
			break;

		ASSERT(!IN_VA_HOLE(vaddr));

		if (ht->ht_lock_cnt < 1)
			panic("hat_unlock(): lock_cnt < 1, "
			    "htable=%p, vaddr=%p\n", (void *)ht,
			    (void *)vaddr);
		HTABLE_LOCK_DEC(ht);

		vaddr += LEVEL_SIZE(ht->ht_level);
	}
	if (ht)
		htable_release(ht);
	XPV_ALLOW_MIGRATE();
}
/* ARGSUSED */
void
hat_unlock_region(struct hat *hat, caddr_t addr, size_t len,
    hat_region_cookie_t rcookie)
{
	panic("No shared region support on x86");
}
/*
 * Cross call service routine to demap a virtual page on
 * the current CPU or flush all mappings in TLB.
 */
/*ARGSUSED*/
static int
hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
{
	hat_t	*hat = (hat_t *)a1;
	caddr_t	addr = (caddr_t)a2;

	/*
	 * If the target hat isn't the kernel and this CPU isn't operating
	 * in the target hat, we can ignore the cross call.
	 */
	if (hat != kas.a_hat && hat != CPU->cpu_current_hat)
		return (0);

	/*
	 * For a normal address, we just flush one page mapping
	 */
	if ((uintptr_t)addr != DEMAP_ALL_ADDR) {
		mmu_tlbflush_entry(addr);
		return (0);
	}

	/*
	 * Otherwise we reload cr3 to effect a complete TLB flush.
	 *
	 * A reload of cr3 on a VLP process also means we must also recopy in
	 * the pte values from the struct hat
	 */
	if (hat->hat_flags & HAT_VLP) {
#if defined(__amd64)
		x86pte_t *vlpptep = CPU->cpu_hat_info->hci_vlp_l2ptes;

		VLP_COPY(hat->hat_vlp_ptes, vlpptep);
#elif defined(__i386)
		reload_pae32(hat, CPU);
#endif
	}
	reload_cr3();
	return (0);
}
/*
 * Flush all TLB entries, including global (ie. kernel) ones.
 */
static void
flush_all_tlb_entries(void)
{
	ulong_t cr4 = getcr4();

	if (cr4 & CR4_PGE) {
		setcr4(cr4 & ~(ulong_t)CR4_PGE);
		setcr4(cr4);

		/*
		 * 32 bit PAE also needs to always reload_cr3()
		 */
		if (mmu.max_level == 2)
			reload_cr3();
	} else {
		reload_cr3();
	}
}
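/*
 * Toggling CR4.PGE off and on is what invalidates PT_GLOBAL entries; a
 * plain reload of cr3 leaves global mappings cached in the TLB.
 */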
#define	TLB_CPU_HALTED	(01ul)
#define	TLB_INVAL_ALL	(02ul)
#define	CAS_TLB_INFO(cpu, old, new)	\
	atomic_cas_ulong((ulong_t *)&(cpu)->cpu_m.mcpu_tlb_info, (old), (new))
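/*
 * The protocol: a CPU going idle sets TLB_CPU_HALTED. hat_tlb_inval()
 * then ORs in TLB_INVAL_ALL rather than cross calling the idle CPU, and
 * tlb_service() performs the deferred flush when the CPU leaves idle.
 */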
/*
 * Record that a CPU is going idle
 */
void
tlb_going_idle(void)
{
	atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info, TLB_CPU_HALTED);
}
/*
 * Service a delayed TLB flush if coming out of being idle.
 * It will be called from cpu idle notification with interrupts disabled.
 */
void
tlb_service(void)
{
	ulong_t tlb_info;
	ulong_t found;

	/*
	 * We only have to do something if coming out of being idle.
	 */
	tlb_info = CPU->cpu_m.mcpu_tlb_info;
	if (tlb_info & TLB_CPU_HALTED) {
		ASSERT(CPU->cpu_current_hat == kas.a_hat);

		/*
		 * Atomic clear and fetch of old state.
		 */
		while ((found = CAS_TLB_INFO(CPU, tlb_info, 0)) != tlb_info) {
			ASSERT(found & TLB_CPU_HALTED);
			tlb_info = found;
			SMT_PAUSE();
		}
		if (tlb_info & TLB_INVAL_ALL)
			flush_all_tlb_entries();
	}
}
/*
 * Internal routine to do cross calls to invalidate a range of pages on
 * all CPUs using a given hat.
 */
void
hat_tlb_inval(hat_t *hat, uintptr_t va)
{
	extern int	flushes_require_xcalls;	/* from mp_startup.c */
	cpuset_t	justme;
	cpuset_t	cpus_to_shootdown;
#ifndef __xpv
	cpuset_t	check_cpus;
	cpu_t		*cpup;
	int		c;
#endif

	/*
	 * If the hat is being destroyed, there are no more users, so
	 * demap need not do anything.
	 */
	if (hat->hat_flags & HAT_FREEING)
		return;

	/*
	 * If demapping from a shared pagetable, we best demap the
	 * entire set of user TLBs, since we don't know what addresses
	 * these were shared at.
	 */
	if (hat->hat_flags & HAT_SHARED) {
		hat = kas.a_hat;
		va = DEMAP_ALL_ADDR;
	}

	/*
	 * if not running with multiple CPUs, don't use cross calls
	 */
	if (panicstr || !flushes_require_xcalls) {
#ifdef __xpv
		if (va == DEMAP_ALL_ADDR)
			xen_flush_tlb();
		else
			xen_flush_va((caddr_t)va);
#else
		(void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)va, NULL);
#endif
		return;
	}

	/*
	 * Determine CPUs to shootdown. Kernel changes always do all CPUs.
	 * Otherwise it's just CPUs currently executing in this hat.
	 */
	kpreempt_disable();
	CPUSET_ONLY(justme, CPU->cpu_id);
	if (hat == kas.a_hat)
		cpus_to_shootdown = khat_cpuset;
	else
		cpus_to_shootdown = hat->hat_cpus;

#ifndef __xpv
	/*
	 * If any CPUs in the set are idle, just request a delayed flush
	 * and avoid waking them up.
	 */
	check_cpus = cpus_to_shootdown;
	for (c = 0; c < NCPU && !CPUSET_ISNULL(check_cpus); ++c) {
		ulong_t tlb_info;

		if (!CPU_IN_SET(check_cpus, c))
			continue;
		CPUSET_DEL(check_cpus, c);
		cpup = cpu[c];
		if (cpup == NULL)
			continue;

		tlb_info = cpup->cpu_m.mcpu_tlb_info;
		while (tlb_info == TLB_CPU_HALTED) {
			(void) CAS_TLB_INFO(cpup, TLB_CPU_HALTED,
			    TLB_CPU_HALTED | TLB_INVAL_ALL);
			SMT_PAUSE();
			tlb_info = cpup->cpu_m.mcpu_tlb_info;
		}
		if (tlb_info == (TLB_CPU_HALTED | TLB_INVAL_ALL)) {
			HATSTAT_INC(hs_tlb_inval_delayed);
			CPUSET_DEL(cpus_to_shootdown, c);
		}
	}
#endif

	if (CPUSET_ISNULL(cpus_to_shootdown) ||
	    CPUSET_ISEQUAL(cpus_to_shootdown, justme)) {

#ifdef __xpv
		if (va == DEMAP_ALL_ADDR)
			xen_flush_tlb();
		else
			xen_flush_va((caddr_t)va);
#else
		(void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)va, NULL);
#endif

	} else {

		CPUSET_ADD(cpus_to_shootdown, CPU->cpu_id);
#ifdef __xpv
		if (va == DEMAP_ALL_ADDR)
			xen_gflush_tlb(cpus_to_shootdown);
		else
			xen_gflush_va((caddr_t)va, cpus_to_shootdown);
#else
		xc_call((xc_arg_t)hat, (xc_arg_t)va, NULL,
		    CPUSET2BV(cpus_to_shootdown), hati_demap_func);
#endif

	}
	kpreempt_enable();
}
/*
 * Interior routine for HAT_UNLOADs from hat_unload_callback(),
 * hat_kmap_unload() OR from hat_steal() code.  This routine doesn't
 * handle releasing of the htables.
 */
void
hat_pte_unmap(
	htable_t	*ht,
	uint_t		entry,
	uint_t		flags,
	x86pte_t	old_pte,
	void		*pte_ptr)
{
	hat_t		*hat = ht->ht_hat;
	hment_t		*hm = NULL;
	page_t		*pp = NULL;
	level_t		l = ht->ht_level;
	pfn_t		pfn;

	/*
	 * We always track the locking counts, even if nothing is unmapped
	 */
	if ((flags & HAT_UNLOAD_UNLOCK) != 0 && hat != kas.a_hat) {
		ASSERT(ht->ht_lock_cnt > 0);
		HTABLE_LOCK_DEC(ht);
	}

	/*
	 * Figure out which page's mapping list lock to acquire using the PFN
	 * passed in "old" PTE. We then attempt to invalidate the PTE.
	 * If another thread, probably a hat_pageunload, has asynchronously
	 * unmapped/remapped this address we'll loop here.
	 */
	ASSERT(ht->ht_busy > 0);
	while (PTE_ISVALID(old_pte)) {
		pfn = PTE2PFN(old_pte, l);
		if (PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST) {
			pp = NULL;
		} else {
			if (pfn == PFN_INVALID)
				panic("Invalid PFN, but not PT_NOCONSIST");
			pp = page_numtopp_nolock(pfn);
			if (pp == NULL) {
				panic("no page_t, not NOCONSIST: old_pte="
				    FMT_PTE " ht=%lx entry=0x%x pte_ptr=%lx",
				    old_pte, (uintptr_t)ht, entry,
				    (uintptr_t)pte_ptr);
			}
			x86_hm_enter(pp);
		}

		old_pte = x86pte_inval(ht, entry, old_pte, pte_ptr);

		/*
		 * If the page hadn't changed we've unmapped it and can proceed
		 */
		if (PTE_ISVALID(old_pte) && PTE2PFN(old_pte, l) == pfn)
			break;

		/*
		 * Otherwise, we'll have to retry with the current old_pte.
		 * Drop the hment lock, since the pfn may have changed.
		 */
		if (pp != NULL) {
			x86_hm_exit(pp);
			pp = NULL;
		} else {
			ASSERT(PTE_GET(old_pte, PT_SOFTWARE) >= PT_NOCONSIST);
		}
	}

	/*
	 * If the old mapping wasn't valid, there's nothing more to do
	 */
	if (!PTE_ISVALID(old_pte)) {
		if (pp != NULL)
			x86_hm_exit(pp);
		return;
	}

	/*
	 * Take care of syncing any MOD/REF bits and removing the hment.
	 */
	if (pp != NULL) {
		if (!(flags & HAT_UNLOAD_NOSYNC))
			hati_sync_pte_to_page(pp, old_pte, l);
		hm = hment_remove(pp, ht, entry);
		x86_hm_exit(pp);
		if (hm != NULL)
			hment_free(hm);
	}

	/*
	 * Handle book keeping in the htable and hat
	 */
	ASSERT(ht->ht_valid_cnt > 0);
	HTABLE_DEC(ht->ht_valid_cnt);
	PGCNT_DEC(hat, l);
}
/*
 * very cheap unload implementation to special case some kernel addresses
 */
static void
hat_kmap_unload(caddr_t addr, size_t len, uint_t flags)
{
	uintptr_t	va = (uintptr_t)addr;
	uintptr_t	eva = va + len;
	pgcnt_t		pg_index;
	htable_t	*ht;
	uint_t		entry;
	x86pte_t	*pte_ptr;
	x86pte_t	old_pte;

	for (; va < eva; va += MMU_PAGESIZE) {
		/*
		 * Get the PTE
		 */
		pg_index = mmu_btop(va - mmu.kmap_addr);
		pte_ptr = PT_INDEX_PTR(mmu.kmap_ptes, pg_index);
		old_pte = GET_PTE(pte_ptr);

		/*
		 * get the htable / entry
		 */
		ht = mmu.kmap_htables[(va - mmu.kmap_htables[0]->ht_vaddr)
		    >> LEVEL_SHIFT(1)];
		entry = htable_va2entry(va, ht);

		/*
		 * use mostly common code to unmap it.
		 */
		hat_pte_unmap(ht, entry, flags, old_pte, pte_ptr);
	}
}
/*
 * unload a range of virtual address space (no callback)
 */
void
hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
{
	uintptr_t va = (uintptr_t)addr;

	XPV_DISALLOW_MIGRATE();
	ASSERT(hat == kas.a_hat || va + len <= _userlimit);

	/*
	 * special case for performance.
	 */
	if (mmu.kmap_addr <= va && va < mmu.kmap_eaddr) {
		ASSERT(hat == kas.a_hat);
		hat_kmap_unload(addr, len, flags);
	} else {
		hat_unload_callback(hat, addr, len, flags, NULL);
	}
	XPV_ALLOW_MIGRATE();
}
/*
 * Do the callbacks for ranges being unloaded.
 */
typedef struct range_info {
	uintptr_t	rng_va;
	ulong_t		rng_cnt;
	level_t		rng_level;
} range_info_t;

static void
handle_ranges(hat_callback_t *cb, uint_t cnt, range_info_t *range)
{
	/*
	 * do callbacks to upper level VM system
	 */
	while (cb != NULL && cnt > 0) {
		--cnt;
		cb->hcb_start_addr = (caddr_t)range[cnt].rng_va;
		cb->hcb_end_addr = cb->hcb_start_addr;
		cb->hcb_end_addr +=
		    range[cnt].rng_cnt << LEVEL_SHIFT(range[cnt].rng_level);
		cb->hcb_function(cb);
	}
}
/*
 * Unload a given range of addresses (has optional callback)
 *
 * Flags:
 * define	HAT_UNLOAD		0x00
 * define	HAT_UNLOAD_NOSYNC	0x02
 * define	HAT_UNLOAD_UNLOCK	0x04
 * define	HAT_UNLOAD_OTHER	0x08 - not used
 * define	HAT_UNLOAD_UNMAP	0x10 - same as HAT_UNLOAD
 */
#define	MAX_UNLOAD_CNT (8)
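/*
 * Up to MAX_UNLOAD_CNT contiguous ranges are batched in r[] below and
 * flushed to the caller via handle_ranges() whenever the array fills,
 * plus once more for any remainder at the end.
 */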
void
hat_unload_callback(
	hat_t		*hat,
	caddr_t		addr,
	size_t		len,
	uint_t		flags,
	hat_callback_t	*cb)
{
	uintptr_t	vaddr = (uintptr_t)addr;
	uintptr_t	eaddr = vaddr + len;
	htable_t	*ht = NULL;
	uint_t		entry;
	uintptr_t	contig_va = (uintptr_t)-1L;
	range_info_t	r[MAX_UNLOAD_CNT];
	uint_t		r_cnt = 0;
	x86pte_t	old_pte;

	XPV_DISALLOW_MIGRATE();
	ASSERT(hat == kas.a_hat || eaddr <= _userlimit);
	ASSERT(IS_PAGEALIGNED(vaddr));
	ASSERT(IS_PAGEALIGNED(eaddr));

	/*
	 * Special case a single page being unloaded for speed. This happens
	 * quite frequently, COW faults after a fork() for example.
	 */
	if (cb == NULL && len == MMU_PAGESIZE) {
		ht = htable_getpte(hat, vaddr, &entry, &old_pte, 0);
		if (ht != NULL) {
			if (PTE_ISVALID(old_pte))
				hat_pte_unmap(ht, entry, flags, old_pte, NULL);
			htable_release(ht);
		}
		XPV_ALLOW_MIGRATE();
		return;
	}

	while (vaddr < eaddr) {
		old_pte = htable_walk(hat, &ht, &vaddr, eaddr);
		if (ht == NULL)
			break;

		ASSERT(!IN_VA_HOLE(vaddr));

		if (vaddr < (uintptr_t)addr)
			panic("hat_unload_callback(): unmap inside large page");

		/*
		 * We'll do the call backs for contiguous ranges
		 */
		if (vaddr != contig_va ||
		    (r_cnt > 0 && r[r_cnt - 1].rng_level != ht->ht_level)) {
			if (r_cnt == MAX_UNLOAD_CNT) {
				handle_ranges(cb, r_cnt, r);
				r_cnt = 0;
			}
			r[r_cnt].rng_va = vaddr;
			r[r_cnt].rng_cnt = 0;
			r[r_cnt].rng_level = ht->ht_level;
			++r_cnt;
		}

		/*
		 * Unload one mapping from the page tables.
		 */
		entry = htable_va2entry(vaddr, ht);
		hat_pte_unmap(ht, entry, flags, old_pte, NULL);
		ASSERT(ht->ht_level <= mmu.max_page_level);
		vaddr += LEVEL_SIZE(ht->ht_level);
		contig_va = vaddr;
		++r[r_cnt - 1].rng_cnt;
	}
	if (ht)
		htable_release(ht);

	/*
	 * handle last range for callbacks
	 */
	if (r_cnt > 0)
		handle_ranges(cb, r_cnt, r);
	XPV_ALLOW_MIGRATE();
}
/*
 * Invalidate a virtual address translation on a slave CPU during
 * panic() dumps.
 */
void
hat_flush_range(hat_t *hat, caddr_t va, size_t size)
{
	ssize_t	sz;
	caddr_t	endva = va + size;

	while (va < endva) {
		sz = hat_getpagesize(hat, va);
		if (sz < 0) {
#ifdef __xpv
			xen_flush_tlb();
#else
			flush_all_tlb_entries();
#endif
			break;
		}
#ifdef __xpv
		xen_flush_va(va);
#else
		mmu_tlbflush_entry(va);
#endif
		va += sz;
	}
}
/*
 * synchronize mapping with software data structures
 *
 * This interface is currently only used by the working set monitor
 * driver.
 */
/*ARGSUSED*/
void
hat_sync(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
{
	uintptr_t	vaddr = (uintptr_t)addr;
	uintptr_t	eaddr = vaddr + len;
	htable_t	*ht = NULL;
	uint_t		entry;
	x86pte_t	pte;
	x86pte_t	save_pte;
	x86pte_t	new;
	page_t		*pp;

	ASSERT(!IN_VA_HOLE(vaddr));
	ASSERT(IS_PAGEALIGNED(vaddr));
	ASSERT(IS_PAGEALIGNED(eaddr));
	ASSERT(hat == kas.a_hat || eaddr <= _userlimit);

	XPV_DISALLOW_MIGRATE();
	for (; vaddr < eaddr; vaddr += LEVEL_SIZE(ht->ht_level)) {
try_again:
		pte = htable_walk(hat, &ht, &vaddr, eaddr);
		if (ht == NULL)
			break;
		entry = htable_va2entry(vaddr, ht);

		if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC ||
		    PTE_GET(pte, PT_REF | PT_MOD) == 0)
			continue;

		/*
		 * We need to acquire the mapping list lock to protect
		 * against hat_pageunload(), hat_unload(), etc.
		 */
		pp = page_numtopp_nolock(PTE2PFN(pte, ht->ht_level));
		if (pp == NULL)
			break;
		x86_hm_enter(pp);
		save_pte = pte;
		pte = x86pte_get(ht, entry);
		if (pte != save_pte) {
			x86_hm_exit(pp);
			goto try_again;
		}
		if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC ||
		    PTE_GET(pte, PT_REF | PT_MOD) == 0) {
			x86_hm_exit(pp);
			continue;
		}

		/*
		 * Need to clear ref or mod bits. We may compete with
		 * hardware updating the R/M bits and have to try again.
		 */
		if (flags == HAT_SYNC_ZERORM) {
			new = pte;
			PTE_CLR(new, PT_REF | PT_MOD);
			pte = hati_update_pte(ht, entry, pte, new);
			if (pte != 0) {
				x86_hm_exit(pp);
				goto try_again;
			}
		} else {
			/*
			 * sync the PTE to the page_t
			 */
			hati_sync_pte_to_page(pp, save_pte, ht->ht_level);
		}
		x86_hm_exit(pp);
	}
	if (ht)
		htable_release(ht);
	XPV_ALLOW_MIGRATE();
}
/*
 * void	hat_map(hat, addr, len, flags)
 */
/*ARGSUSED*/
void
hat_map(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
{
	/* does nothing */
}
/*
 * uint_t hat_getattr(hat, addr, *attr)
 *	returns attr for <hat,addr> in *attr.  returns 0 if there was a
 *	mapping and *attr is valid, nonzero if there was no mapping and
 *	*attr is not valid.
 */
uint_t
hat_getattr(hat_t *hat, caddr_t addr, uint_t *attr)
{
	uintptr_t	vaddr = ALIGN2PAGE(addr);
	htable_t	*ht = NULL;
	x86pte_t	pte;

	ASSERT(hat == kas.a_hat || vaddr <= _userlimit);

	if (IN_VA_HOLE(vaddr))
		return ((uint_t)-1);

	ht = htable_getpte(hat, vaddr, NULL, &pte, mmu.max_page_level);
	if (ht == NULL)
		return ((uint_t)-1);

	if (!PTE_ISVALID(pte) || !PTE_ISPAGE(pte, ht->ht_level)) {
		htable_release(ht);
		return ((uint_t)-1);
	}

	*attr = PROT_READ;
	if (PTE_GET(pte, PT_WRITABLE))
		*attr |= PROT_WRITE;
	if (PTE_GET(pte, PT_USER))
		*attr |= PROT_USER;
	if (!PTE_GET(pte, mmu.pt_nx))
		*attr |= PROT_EXEC;
	if (PTE_GET(pte, PT_SOFTWARE) >= PT_NOSYNC)
		*attr |= HAT_NOSYNC;
	htable_release(ht);
	return (0);
}
/*
 * hat_updateattr() applies the given attribute change to an existing mapping
 */
#define	HAT_LOAD_ATTR		1
#define	HAT_SET_ATTR		2
#define	HAT_CLR_ATTR		3

static void
hat_updateattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr, int what)
{
	uintptr_t	vaddr = (uintptr_t)addr;
	uintptr_t	eaddr = (uintptr_t)addr + len;
	htable_t	*ht = NULL;
	uint_t		entry;
	x86pte_t	oldpte, newpte;
	page_t		*pp;

	XPV_DISALLOW_MIGRATE();
	ASSERT(IS_PAGEALIGNED(vaddr));
	ASSERT(IS_PAGEALIGNED(eaddr));
	ASSERT(hat == kas.a_hat ||
	    AS_LOCK_HELD(hat->hat_as, &hat->hat_as->a_lock));
	for (; vaddr < eaddr; vaddr += LEVEL_SIZE(ht->ht_level)) {
try_again:
		oldpte = htable_walk(hat, &ht, &vaddr, eaddr);
		if (ht == NULL)
			break;
		if (PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOCONSIST)
			continue;

		pp = page_numtopp_nolock(PTE2PFN(oldpte, ht->ht_level));
		if (pp == NULL)
			continue;
		x86_hm_enter(pp);

		newpte = oldpte;
		/*
		 * We found a page table entry in the desired range,
		 * figure out the new attributes.
		 */
		if (what == HAT_SET_ATTR || what == HAT_LOAD_ATTR) {
			if ((attr & PROT_WRITE) &&
			    !PTE_GET(oldpte, PT_WRITABLE))
				newpte |= PT_WRITABLE;

			if ((attr & HAT_NOSYNC) &&
			    PTE_GET(oldpte, PT_SOFTWARE) < PT_NOSYNC)
				newpte |= PT_NOSYNC;

			if ((attr & PROT_EXEC) && PTE_GET(oldpte, mmu.pt_nx))
				newpte &= ~mmu.pt_nx;
		}

		if (what == HAT_LOAD_ATTR) {
			if (!(attr & PROT_WRITE) &&
			    PTE_GET(oldpte, PT_WRITABLE))
				newpte &= ~PT_WRITABLE;

			if (!(attr & HAT_NOSYNC) &&
			    PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOSYNC)
				newpte &= ~PT_SOFTWARE;

			if (!(attr & PROT_EXEC) && !PTE_GET(oldpte, mmu.pt_nx))
				newpte |= mmu.pt_nx;
		}

		if (what == HAT_CLR_ATTR) {
			if ((attr & PROT_WRITE) && PTE_GET(oldpte, PT_WRITABLE))
				newpte &= ~PT_WRITABLE;

			if ((attr & HAT_NOSYNC) &&
			    PTE_GET(oldpte, PT_SOFTWARE) >= PT_NOSYNC)
				newpte &= ~PT_SOFTWARE;

			if ((attr & PROT_EXEC) && !PTE_GET(oldpte, mmu.pt_nx))
				newpte |= mmu.pt_nx;
		}

		/*
		 * Ensure NOSYNC/NOCONSIST mappings have REF and MOD set.
		 * x86pte_set() depends on this.
		 */
		if (PTE_GET(newpte, PT_SOFTWARE) >= PT_NOSYNC)
			newpte |= PT_REF | PT_MOD;

		/*
		 * what about PROT_READ or others? this code only handles:
		 * EXEC, WRITE, NOSYNC
		 */

		/*
		 * If new PTE really changed, update the table.
		 */
		if (newpte != oldpte) {
			entry = htable_va2entry(vaddr, ht);
			oldpte = hati_update_pte(ht, entry, oldpte, newpte);
			if (oldpte != 0) {
				x86_hm_exit(pp);
				goto try_again;
			}
		}
		x86_hm_exit(pp);
	}
	if (ht)
		htable_release(ht);
	XPV_ALLOW_MIGRATE();
}
/*
 * Various wrappers for hat_updateattr()
 */
void
hat_setattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr)
{
	ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit);
	hat_updateattr(hat, addr, len, attr, HAT_SET_ATTR);
}

void
hat_clrattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr)
{
	ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit);
	hat_updateattr(hat, addr, len, attr, HAT_CLR_ATTR);
}

void
hat_chgattr(hat_t *hat, caddr_t addr, size_t len, uint_t attr)
{
	ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit);
	hat_updateattr(hat, addr, len, attr, HAT_LOAD_ATTR);
}

void
hat_chgprot(hat_t *hat, caddr_t addr, size_t len, uint_t vprot)
{
	ASSERT(hat == kas.a_hat || (uintptr_t)addr + len <= _userlimit);
	hat_updateattr(hat, addr, len, vprot & HAT_PROT_MASK, HAT_LOAD_ATTR);
}

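/*
 * Illustrative sketch (not part of the original source): making a
 * page-aligned kernel range read-only and executable funnels through
 * hat_chgprot(), which hat_updateattr() applies as HAT_LOAD_ATTR,
 * clearing PT_WRITABLE and leaving the NX bit clear:
 *
 *	hat_chgprot(kas.a_hat, addr, MMU_PAGESIZE, PROT_READ | PROT_EXEC);
 */
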
/*
 * size_t hat_getpagesize(hat, addr)
 *	returns pagesize in bytes for <hat, addr>. returns -1 if there is
 *	no mapping. This is an advisory call.
 */
ssize_t
hat_getpagesize(hat_t *hat, caddr_t addr)
{
	uintptr_t	vaddr = ALIGN2PAGE(addr);
	htable_t	*ht;
	size_t		pagesize;

	ASSERT(hat == kas.a_hat || vaddr <= _userlimit);
	if (IN_VA_HOLE(vaddr))
		return (-1);
	ht = htable_getpage(hat, vaddr, NULL);
	if (ht == NULL)
		return (-1);
	pagesize = LEVEL_SIZE(ht->ht_level);
	htable_release(ht);
	return (pagesize);
}

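/*
 * Illustrative sketch (not part of the original source): callers walk a
 * range mapping by mapping, using the returned size as the stride, much
 * as hat_flush_range() above does:
 *
 *	ssize_t sz;
 *	caddr_t va;
 *
 *	for (va = start; va < end; va += sz) {
 *		if ((sz = hat_getpagesize(hat, va)) < 0)
 *			break;		... no mapping at va ...
 *	}
 */
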
/*
 * pfn_t hat_getpfnum(hat, addr)
 *	returns pfn for <hat, addr> or PFN_INVALID if mapping is invalid.
 */
pfn_t
hat_getpfnum(hat_t *hat, caddr_t addr)
{
	uintptr_t	vaddr = ALIGN2PAGE(addr);
	htable_t	*ht;
	uint_t		entry;
	pfn_t		pfn = PFN_INVALID;

	ASSERT(hat == kas.a_hat || vaddr <= _userlimit);
	if (khat_running == 0)
		return (PFN_INVALID);

	if (IN_VA_HOLE(vaddr))
		return (PFN_INVALID);

	XPV_DISALLOW_MIGRATE();
	/*
	 * A very common use of hat_getpfnum() is from the DDI for kernel pages.
	 * Use the kmap_ptes (which also covers the 32 bit heap) to speed
	 * this up.
	 */
	if (mmu.kmap_addr <= vaddr && vaddr < mmu.kmap_eaddr) {
		x86pte_t pte;
		pgcnt_t pg_index;

		pg_index = mmu_btop(vaddr - mmu.kmap_addr);
		pte = GET_PTE(PT_INDEX_PTR(mmu.kmap_ptes, pg_index));
		if (PTE_ISVALID(pte))
			/*LINTED [use of constant 0 causes a lint warning] */
			pfn = PTE2PFN(pte, 0);
		XPV_ALLOW_MIGRATE();
		return (pfn);
	}

	ht = htable_getpage(hat, vaddr, &entry);
	if (ht == NULL) {
		XPV_ALLOW_MIGRATE();
		return (PFN_INVALID);
	}
	ASSERT(vaddr >= ht->ht_vaddr);
	ASSERT(vaddr <= HTABLE_LAST_PAGE(ht));
	pfn = PTE2PFN(x86pte_get(ht, entry), ht->ht_level);
	if (ht->ht_level > 0)
		pfn += mmu_btop(vaddr & LEVEL_OFFSET(ht->ht_level));
	htable_release(ht);
	XPV_ALLOW_MIGRATE();
	return (pfn);
}

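/*
 * Illustrative sketch (not part of the original source): the common DDI
 * pattern the kmap fast path above serves looks like:
 *
 *	pfn_t pfn = hat_getpfnum(kas.a_hat, kaddr);
 *
 *	if (pfn == PFN_INVALID)
 *		return (DDI_FAILURE);
 *	paddr = mmu_ptob(pfn) | ((uintptr_t)kaddr & MMU_PAGEOFFSET);
 */
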
/*
 * int hat_probe(hat, addr)
 *	return 0 if no valid mapping is present.  Faster version
 *	of hat_getattr in certain architectures.
 */
int
hat_probe(hat_t *hat, caddr_t addr)
{
	uintptr_t	vaddr = ALIGN2PAGE(addr);
	uint_t		entry;
	htable_t	*ht;
	pgcnt_t		pg_off;

	ASSERT(hat == kas.a_hat || vaddr <= _userlimit);
	ASSERT(hat == kas.a_hat ||
	    AS_LOCK_HELD(hat->hat_as, &hat->hat_as->a_lock));
	if (IN_VA_HOLE(vaddr))
		return (0);

	/*
	 * Most common use of hat_probe is from segmap. We special case it
	 * for performance.
	 */
	if (mmu.kmap_addr <= vaddr && vaddr < mmu.kmap_eaddr) {
		pg_off = mmu_btop(vaddr - mmu.kmap_addr);
		if (mmu.pae_hat)
			return (PTE_ISVALID(mmu.kmap_ptes[pg_off]));
		else
			return (PTE_ISVALID(
			    ((x86pte32_t *)mmu.kmap_ptes)[pg_off]));
	}

	ht = htable_getpage(hat, vaddr, &entry);
	htable_release(ht);
	return (ht != NULL);
}

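/*
 * Illustrative sketch (not part of the original source): segmap-style
 * callers use hat_probe() as a cheap "is anything mapped here?" test:
 *
 *	if (hat_probe(as->a_hat, addr))
 *		... reuse the existing translation ...
 *	else
 *		... load a new mapping ...
 */
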
/*
 * Find out if the segment for hat_share()/hat_unshare() is DISM or locked ISM.
 */
static int
is_it_dism(hat_t *hat, caddr_t va)
{
	struct seg *seg;
	struct shm_data *shmd;
	struct spt_data *sptd;

	seg = as_findseg(hat->hat_as, va, 0);
	ASSERT(seg != NULL);
	ASSERT(seg->s_base <= va);
	shmd = (struct shm_data *)seg->s_data;
	ASSERT(shmd != NULL);
	sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
	ASSERT(sptd != NULL);
	if (sptd->spt_flags & SHM_PAGEABLE)
		return (1);
	return (0);
}

/*
 * Simple implementation of ISM. hat_share() is similar to hat_memload_array(),
 * except that we use the ism_hat's existing mappings to determine the pages
 * and protections to use for this hat. If we find a full properly aligned
 * and sized pagetable, we will attempt to share the pagetable itself.
 */
/*ARGSUSED*/
int
hat_share(
	hat_t		*hat,
	caddr_t		addr,
	hat_t		*ism_hat,
	caddr_t		src_addr,
	size_t		len,	/* almost useless value, see below.. */
	uint_t		ismszc)
{
	uintptr_t	vaddr_start = (uintptr_t)addr;
	uintptr_t	vaddr;
	uintptr_t	eaddr = vaddr_start + len;
	uintptr_t	ism_addr_start = (uintptr_t)src_addr;
	uintptr_t	ism_addr = ism_addr_start;
	uintptr_t	e_ism_addr = ism_addr + len;
	htable_t	*ism_ht = NULL;
	htable_t	*ht;
	x86pte_t	pte;
	page_t		*pp;
	pfn_t		pfn;
	level_t		l;
	pgcnt_t		pgcnt;
	uint_t		prot;
	int		is_dism;
	int		flags;

	/*
	 * We might be asked to share an empty DISM hat by as_dup()
	 */
	ASSERT(hat != kas.a_hat);
	ASSERT(eaddr <= _userlimit);
	if (!(ism_hat->hat_flags & HAT_SHARED)) {
		ASSERT(hat_get_mapped_size(ism_hat) == 0);
		return (0);
	}
	XPV_DISALLOW_MIGRATE();

	/*
	 * The SPT segment driver often passes us a size larger than there are
	 * valid mappings. That's because it rounds the segment size up to a
	 * large pagesize, even if the actual memory mapped by ism_hat is less.
	 */
	ASSERT(IS_PAGEALIGNED(vaddr_start));
	ASSERT(IS_PAGEALIGNED(ism_addr_start));
	ASSERT(ism_hat->hat_flags & HAT_SHARED);
	is_dism = is_it_dism(hat, addr);
	while (ism_addr < e_ism_addr) {
		/*
		 * use htable_walk to get the next valid ISM mapping
		 */
		pte = htable_walk(ism_hat, &ism_ht, &ism_addr, e_ism_addr);
		if (ism_ht == NULL)
			break;

		/*
		 * First check to see if we already share the page table.
		 */
		l = ism_ht->ht_level;
		vaddr = vaddr_start + (ism_addr - ism_addr_start);
		ht = htable_lookup(hat, vaddr, l);
		if (ht != NULL) {
			if (ht->ht_flags & HTABLE_SHARED_PFN)
				goto shared;
			htable_release(ht);
			goto not_shared;
		}

		/*
		 * Can't ever share top table.
		 */
		if (l == mmu.max_level)
			goto not_shared;

		/*
		 * Avoid level mismatches later due to DISM faults.
		 */
		if (is_dism && l > 0)
			goto not_shared;

		/*
		 * addresses and lengths must align
		 * table must be fully populated
		 * no lower level page tables
		 */
		if (ism_addr != ism_ht->ht_vaddr ||
		    (vaddr & LEVEL_OFFSET(l + 1)) != 0)
			goto not_shared;

		/*
		 * The range of address space must cover a full table.
		 */
		if (e_ism_addr - ism_addr < LEVEL_SIZE(l + 1))
			goto not_shared;

		/*
		 * All entries in the ISM page table must be leaf PTEs.
		 */
		if (l > 0) {
			int e;

			/*
			 * We know the 0th is from htable_walk() above.
			 */
			for (e = 1; e < HTABLE_NUM_PTES(ism_ht); ++e) {
				x86pte_t pte;

				pte = x86pte_get(ism_ht, e);
				if (!PTE_ISPAGE(pte, l))
					goto not_shared;
			}
		}

		/*
		 * share the page table
		 */
		ht = htable_create(hat, vaddr, l, ism_ht);
shared:
		ASSERT(ht->ht_flags & HTABLE_SHARED_PFN);
		ASSERT(ht->ht_shares == ism_ht);
		hat->hat_ism_pgcnt +=
		    (ism_ht->ht_valid_cnt - ht->ht_valid_cnt) <<
		    (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT);
		ht->ht_valid_cnt = ism_ht->ht_valid_cnt;
		htable_release(ht);
		ism_addr = ism_ht->ht_vaddr + LEVEL_SIZE(l + 1);
		htable_release(ism_ht);
		ism_ht = NULL;
		continue;

not_shared:
		/*
		 * Unable to share the page table. Instead we will
		 * create new mappings from the values in the ISM mappings.
		 * Figure out what level size mappings to use;
		 */
		for (l = ism_ht->ht_level; l > 0; --l) {
			if (LEVEL_SIZE(l) <= eaddr - vaddr &&
			    (vaddr & LEVEL_OFFSET(l)) == 0)
				break;
		}

		/*
		 * The ISM mapping might be larger than the share area,
		 * be careful to truncate it if needed.
		 */
		if (eaddr - vaddr >= LEVEL_SIZE(ism_ht->ht_level)) {
			pgcnt = mmu_btop(LEVEL_SIZE(ism_ht->ht_level));
		} else {
			pgcnt = mmu_btop(eaddr - vaddr);
			l = 0;
		}

		pfn = PTE2PFN(pte, ism_ht->ht_level);
		ASSERT(pfn != PFN_INVALID);
		while (pgcnt > 0) {
			/*
			 * Make a new pte for the PFN for this level.
			 * Copy protections for the pte from the ISM pte.
			 */
			pp = page_numtopp_nolock(pfn);
			ASSERT(pp != NULL);

			prot = PROT_USER | PROT_READ | HAT_UNORDERED_OK;
			if (PTE_GET(pte, PT_WRITABLE))
				prot |= PROT_WRITE;
			if (!PTE_GET(pte, PT_NX))
				prot |= PROT_EXEC;

			flags = HAT_LOAD;
			if (!is_dism)
				flags |= HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST;
			while (hati_load_common(hat, vaddr, pp, prot, flags,
			    l, pfn) != 0) {
				if (l == 0)
					panic("hati_load_common() failure");
				--l;
			}

			vaddr += LEVEL_SIZE(l);
			ism_addr += LEVEL_SIZE(l);
			pfn += mmu_btop(LEVEL_SIZE(l));
			pgcnt -= mmu_btop(LEVEL_SIZE(l));
		}
	}
	if (ism_ht != NULL)
		htable_release(ism_ht);
	XPV_ALLOW_MIGRATE();
	return (0);
}

/*
 * hat_unshare() is similar to hat_unload_callback(), but
 * we have to look for empty shared pagetables. Note that
 * hat_unshare() is always invoked against an entire segment.
 */
/*ARGSUSED*/
void
hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc)
{
	uint64_t	vaddr = (uintptr_t)addr;
	uintptr_t	eaddr = vaddr + len;
	htable_t	*ht = NULL;
	uint_t		need_demaps = 0;
	int		flags = HAT_UNLOAD_UNMAP;
	level_t		l;

	ASSERT(hat != kas.a_hat);
	ASSERT(eaddr <= _userlimit);
	ASSERT(IS_PAGEALIGNED(vaddr));
	ASSERT(IS_PAGEALIGNED(eaddr));
	XPV_DISALLOW_MIGRATE();

	/*
	 * First go through and remove any shared pagetables.
	 *
	 * Note that it's ok to delay the TLB shootdown till the entire range is
	 * finished, because if hat_pageunload() were to unload a shared
	 * pagetable page, its hat_tlb_inval() will do a global TLB invalidate.
	 */
	l = mmu.max_page_level;
	if (l == mmu.max_level)
		--l;
	for (; l >= 0; --l) {
		for (vaddr = (uintptr_t)addr; vaddr < eaddr;
		    vaddr = (vaddr & LEVEL_MASK(l + 1)) + LEVEL_SIZE(l + 1)) {
			ASSERT(!IN_VA_HOLE(vaddr));
			/*
			 * find a pagetable that maps the current address
			 */
			ht = htable_lookup(hat, vaddr, l);
			if (ht == NULL)
				continue;
			if (ht->ht_flags & HTABLE_SHARED_PFN) {
				/*
				 * clear page count, set valid_cnt to 0,
				 * let htable_release() finish the job
				 */
				hat->hat_ism_pgcnt -= ht->ht_valid_cnt <<
				    (LEVEL_SHIFT(ht->ht_level) - MMU_PAGESHIFT);
				ht->ht_valid_cnt = 0;
				need_demaps = 1;
			}
			htable_release(ht);
		}
	}

	/*
	 * flush the TLBs - since we're probably dealing with MANY mappings
	 * we do just one CR3 reload.
	 */
	if (!(hat->hat_flags & HAT_FREEING) && need_demaps)
		hat_tlb_inval(hat, DEMAP_ALL_ADDR);

	/*
	 * Now go back and clean up any unaligned mappings that
	 * couldn't share pagetables.
	 */
	if (!is_it_dism(hat, addr))
		flags |= HAT_UNLOAD_UNLOCK;
	hat_unload(hat, addr, len, flags);
	XPV_ALLOW_MIGRATE();
}

/*
 * hat_reserve() does nothing
 */
/*ARGSUSED*/
void
hat_reserve(struct as *as, caddr_t addr, size_t len)
{
}

/*
 * Called when all mappings to a page should have write permission removed.
 * Mostly stolen from hat_pagesync()
 */
static void
hati_page_clrwrt(struct page *pp)
{
	hment_t		*hm = NULL;
	htable_t	*ht;
	uint_t		entry;
	x86pte_t	old;
	x86pte_t	new;
	uint_t		pszc = 0;

	XPV_DISALLOW_MIGRATE();
next_size:
	/*
	 * walk thru the mapping list clearing write permission
	 */
	x86_hm_enter(pp);
	while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL) {
		if (ht->ht_level < pszc)
			continue;
		old = x86pte_get(ht, entry);

		for (;;) {
			/*
			 * Is this mapping of interest?
			 */
			if (PTE2PFN(old, ht->ht_level) != pp->p_pagenum ||
			    PTE_GET(old, PT_WRITABLE) == 0)
				break;

			/*
			 * Clear ref/mod writable bits. This requires cross
			 * calls to ensure any executing TLBs see cleared bits.
			 */
			new = old;
			PTE_CLR(new, PT_REF | PT_MOD | PT_WRITABLE);
			old = hati_update_pte(ht, entry, old, new);
			if (old != 0)
				continue;

			break;
		}
	}
	x86_hm_exit(pp);
	while (pszc < pp->p_szc) {
		page_t *tpp;

		pszc++;
		tpp = PP_GROUPLEADER(pp, pszc);
		if (pp != tpp) {
			pp = tpp;
			goto next_size;
		}
	}
	XPV_ALLOW_MIGRATE();
}

/*
 * void hat_page_setattr(pp, flag)
 * void hat_page_clrattr(pp, flag)
 *	used to set/clr ref/mod bits.
 */
void
hat_page_setattr(struct page *pp, uint_t flag)
{
	vnode_t		*vp = pp->p_vnode;
	kmutex_t	*vphm = NULL;
	page_t		**listp;
	int		noshuffle;

	noshuffle = flag & P_NSH;
	flag &= ~P_NSH;

	if (PP_GETRM(pp, flag) == flag)
		return;

	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
	    !noshuffle) {
		vphm = page_vnode_mutex(vp);
		mutex_enter(vphm);
	}

	PP_SETRM(pp, flag);

	if (vphm != NULL) {
		/*
		 * Some File Systems examine v_pages for NULL w/o
		 * grabbing the vphm mutex. Must not let it become NULL when
		 * pp is the only page on the list.
		 */
		if (pp->p_vpnext != pp) {
			page_vpsub(&vp->v_pages, pp);
			if (vp->v_pages != NULL)
				listp = &vp->v_pages->p_vpprev->p_vpnext;
			else
				listp = &vp->v_pages;
			page_vpadd(listp, pp);
		}
		mutex_exit(vphm);
	}
}

void
hat_page_clrattr(struct page *pp, uint_t flag)
{
	vnode_t		*vp = pp->p_vnode;

	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));

	/*
	 * Caller is expected to hold page's io lock for VMODSORT to work
	 * correctly with pvn_vplist_dirty() and pvn_getdirty() when the mod
	 * bit is cleared.
	 * We don't have an assert here, to avoid tripping some existing
	 * third party code. The dirty page is moved back to the top of the
	 * v_pages list after IO is done in pvn_write_done().
	 */
	PP_CLRRM(pp, flag);

	if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {
		/*
		 * VMODSORT works by removing write permissions and getting
		 * a fault when a page is made dirty. At this point
		 * we need to remove write permission from all mappings
		 * to this page.
		 */
		hati_page_clrwrt(pp);
	}
}

/*
 *	If flag is specified, returns 0 if attribute is disabled
 *	and nonzero if enabled. If flag specifies multiple attributes
 *	then returns 0 if ALL attributes are disabled. This is an advisory
 *	call.
 */
uint_t
hat_page_getattr(struct page *pp, uint_t flag)
{
	return (PP_GETRM(pp, flag));
}

/*
 * common code used by hat_pageunload() and hment_steal()
 */
hment_t *
hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry)
{
	x86pte_t old_pte;
	pfn_t pfn = pp->p_pagenum;
	hment_t *hm;

	/*
	 * We need to acquire a hold on the htable in order to
	 * do the invalidate. We know the htable must exist, since
	 * unmap's don't release the htable until after removing any
	 * hment. Having x86_hm_enter() keeps that from proceeding.
	 */
	htable_acquire(ht);

	/*
	 * Invalidate the PTE and remove the hment.
	 */
	old_pte = x86pte_inval(ht, entry, 0, NULL);
	if (PTE2PFN(old_pte, ht->ht_level) != pfn) {
		panic("x86pte_inval() failure found PTE = " FMT_PTE
		    " pfn being unmapped is %lx ht=0x%lx entry=0x%x",
		    old_pte, pfn, (uintptr_t)ht, entry);
	}

	/*
	 * Clean up all the htable information for this mapping
	 */
	ASSERT(ht->ht_valid_cnt > 0);
	HTABLE_DEC(ht->ht_valid_cnt);
	PGCNT_DEC(ht->ht_hat, ht->ht_level);

	/*
	 * sync ref/mod bits to the page_t
	 */
	if (PTE_GET(old_pte, PT_SOFTWARE) < PT_NOSYNC)
		hati_sync_pte_to_page(pp, old_pte, ht->ht_level);

	/*
	 * Remove the mapping list entry for this page.
	 */
	hm = hment_remove(pp, ht, entry);

	/*
	 * drop the mapping list lock so that we might free the
	 * hment and htable.
	 */
	x86_hm_exit(pp);
	htable_release(ht);
	return (hm);
}

extern int	vpm_enable;

/*
 * Unload all translations to a page. If the page is a subpage of a large
 * page, the large page mappings are also removed.
 *
 * The forceflags are unused.
 */
/*ARGSUSED*/
static int
hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag)
{
	page_t		*cur_pp = pp;
	hment_t		*hm;
	hment_t		*prev;
	htable_t	*ht;
	uint_t		entry;
	level_t		level;

	XPV_DISALLOW_MIGRATE();

	/*
	 * prevent recursion due to kmem_free()
	 */
	++curthread->t_hatdepth;
	ASSERT(curthread->t_hatdepth < 16);

#if defined(__amd64)
	/*
	 * clear the vpm ref.
	 */
	if (vpm_enable) {
		pp->p_vpmref = 0;
	}
#endif
	/*
	 * The loop with next_size handles pages with multiple pagesize mappings
	 */
next_size:
	for (;;) {

		/*
		 * Get a mapping list entry
		 */
		x86_hm_enter(cur_pp);
		for (prev = NULL; ; prev = hm) {
			hm = hment_walk(cur_pp, &ht, &entry, prev);
			if (hm == NULL) {
				x86_hm_exit(cur_pp);

				/*
				 * If not part of a larger page, we're done.
				 */
				if (cur_pp->p_szc <= pg_szcd) {
					ASSERT(curthread->t_hatdepth > 0);
					--curthread->t_hatdepth;
					XPV_ALLOW_MIGRATE();
					return (0);
				}

				/*
				 * Else check the next larger page size.
				 * hat_page_demote() may decrease p_szc,
				 * but that's ok; we'll just take an extra
				 * trip, discover there are no larger
				 * mappings, and return.
				 */
				++pg_szcd;
				cur_pp = PP_GROUPLEADER(cur_pp, pg_szcd);
				goto next_size;
			}

			/*
			 * If this mapping size matches, remove it.
			 */
			level = ht->ht_level;
			if (level == pg_szcd)
				break;
		}

		/*
		 * Remove the mapping list entry for this page.
		 * Note this does the x86_hm_exit() for us.
		 */
		hm = hati_page_unmap(cur_pp, ht, entry);
		if (hm != NULL)
			hment_free(hm);
	}
}

int
hat_pageunload(struct page *pp, uint_t forceflag)
{
	ASSERT(PAGE_EXCL(pp));
	return (hati_pageunload(pp, 0, forceflag));
}

/*
 * Unload all large mappings to pp and reduce by 1 p_szc field of every large
 * page level that included pp.
 *
 * pp must be locked EXCL. Even though no other constituent pages are locked
 * it's legal to unload large mappings to pp because all constituent pages of
 * large locked mappings have to be locked SHARED.  therefore if we have EXCL
 * lock on one of constituent pages none of the large mappings to pp are
 * locked.
 *
 * Change (always decrease) p_szc field starting from the last constituent
 * page and ending with root constituent page so that root's pszc always shows
 * the area where hat_page_demote() may be active.
 *
 * This mechanism is only used for file system pages where it's not always
 * possible to get EXCL locks on all constituent pages to demote the size code
 * (as is done for anonymous or kernel large pages).
 */
void
hat_page_demote(page_t *pp)
{
	uint_t		pszc;
	uint_t		rszc;
	uint_t		szc;
	page_t		*rootpp;
	page_t		*firstpp;
	page_t		*lastpp;
	pgcnt_t		pgcnt;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(page_szc_lock_assert(pp));

	if (pp->p_szc == 0)
		return;

	rootpp = PP_GROUPLEADER(pp, 1);
	(void) hati_pageunload(rootpp, 1, HAT_FORCE_PGUNLOAD);

	/*
	 * all large mappings to pp are gone
	 * and no new can be setup since pp is locked exclusively.
	 *
	 * Lock the root to make sure there's only one hat_page_demote()
	 * outstanding within the area of this root's pszc.
	 *
	 * Second potential hat_page_demote() is already eliminated by upper
	 * VM layer via page_szc_lock() but we don't rely on it and use our
	 * own locking (so that upper layer locking can be changed without
	 * assumptions that hat depends on upper layer VM to prevent multiple
	 * hat_page_demote() to be issued simultaneously to the same large
	 * page).
	 */
again:
	pszc = pp->p_szc;
	if (pszc == 0)
		return;
	rootpp = PP_GROUPLEADER(pp, pszc);
	x86_hm_enter(rootpp);
	/*
	 * If root's p_szc is different from pszc we raced with another
	 * hat_page_demote().  Drop the lock and try to find the root again.
	 * If root's p_szc is greater than pszc previous hat_page_demote() is
	 * not done yet.  Take and release mlist lock of root's root to wait
	 * for previous hat_page_demote() to complete.
	 */
	if ((rszc = rootpp->p_szc) != pszc) {
		x86_hm_exit(rootpp);
		if (rszc > pszc) {
			/* p_szc of a locked non free page can't increase */
			ASSERT(pp != rootpp);

			rootpp = PP_GROUPLEADER(rootpp, rszc);
			x86_hm_enter(rootpp);
			x86_hm_exit(rootpp);
		}
		goto again;
	}
	ASSERT(pp->p_szc == pszc);

	/*
	 * Decrement by 1 p_szc of every constituent page of a region that
	 * covered pp. For example if original szc is 3 it gets changed to 2
	 * everywhere except in region 2 that covered pp. Region 2 that
	 * covered pp gets demoted to 1 everywhere except in region 1 that
	 * covered pp. The region 1 that covered pp is demoted to region
	 * 0. It's done this way because from region 3 we removed level 3
	 * mappings, from region 2 that covered pp we removed level 2 mappings
	 * and from region 1 that covered pp we removed level 1 mappings.  All
	 * changes are done from high pfns to low pfns so that roots are
	 * changed last, allowing one to know the largest region where
	 * hat_page_demote() is still active by only looking at the root page.
	 *
	 * This algorithm is implemented in 2 while loops. First loop changes
	 * p_szc of pages to the right of pp's level 1 region and second
	 * loop changes p_szc of pages of level 1 region that covers pp
	 * and all pages to the left of level 1 region that covers pp.
	 * In the first loop p_szc keeps dropping with every iteration
	 * and in the second loop it keeps increasing with every iteration.
	 *
	 * First loop description: Demote pages to the right of pp outside of
	 * level 1 region that covers pp. In every iteration of the while
	 * loop below find the last page of szc region and the first page of
	 * (szc - 1) region that is immediately to the right of (szc - 1)
	 * region that covers pp.  From last such page to first such page
	 * change every page's szc to szc - 1. Decrement szc and continue
	 * looping until szc is 1. If pp belongs to the last (szc - 1) region
	 * of szc region skip to the next iteration.
	 */
	szc = pszc;
	while (szc > 1) {
		lastpp = PP_GROUPLEADER(pp, szc);
		pgcnt = page_get_pagecnt(szc);
		lastpp += pgcnt - 1;
		firstpp = PP_GROUPLEADER(pp, (szc - 1));
		pgcnt = page_get_pagecnt(szc - 1);
		if (lastpp - firstpp < pgcnt) {
			szc--;
			continue;
		}
		firstpp += pgcnt;
		while (lastpp != firstpp) {
			ASSERT(lastpp->p_szc == pszc);
			lastpp->p_szc = szc - 1;
			lastpp--;
		}
		firstpp->p_szc = szc - 1;
		szc--;
	}

	/*
	 * Second loop description:
	 * First iteration changes p_szc to 0 of every
	 * page of level 1 region that covers pp.
	 * Subsequent iterations find last page of szc region
	 * immediately to the left of szc region that covered pp
	 * and first page of (szc + 1) region that covers pp.
	 * From last to first page change p_szc of every page to szc.
	 * Increment szc and continue looping until szc is pszc.
	 * If pp belongs to the first szc region of (szc + 1) region
	 * skip to the next iteration.
	 */
	szc = 0;
	while (szc < pszc) {
		firstpp = PP_GROUPLEADER(pp, (szc + 1));
		if (szc == 0) {
			pgcnt = page_get_pagecnt(1);
			lastpp = firstpp + (pgcnt - 1);
		} else {
			lastpp = PP_GROUPLEADER(pp, szc);
			if (firstpp == lastpp) {
				szc++;
				continue;
			}
			lastpp--;
			pgcnt = page_get_pagecnt(szc);
		}
		while (lastpp != firstpp) {
			ASSERT(lastpp->p_szc == pszc);
			lastpp->p_szc = szc;
			lastpp--;
		}
		firstpp->p_szc = szc;
		if (firstpp == rootpp)
			break;
		szc++;
	}
	x86_hm_exit(rootpp);
}

/*
 * get hw stats from hardware into page struct and reset hw stats
 * returns attributes of page
 * Flags for hat_pagesync, hat_getstat, hat_sync
 *
 * define	HAT_SYNC_ZERORM		0x01
 *
 * Additional flags for hat_pagesync
 *
 * define	HAT_SYNC_STOPON_REF	0x02
 * define	HAT_SYNC_STOPON_MOD	0x04
 * define	HAT_SYNC_STOPON_RM	0x06
 * define	HAT_SYNC_STOPON_SHARED	0x08
 */
uint_t
hat_pagesync(struct page *pp, uint_t flags)
{
	hment_t		*hm = NULL;
	htable_t	*ht;
	uint_t		entry;
	x86pte_t	old, save_old;
	x86pte_t	new;
	uchar_t		nrmbits = P_REF|P_MOD|P_RO;
	extern ulong_t	po_share;
	page_t		*save_pp = pp;
	uint_t		pszc = 0;

	ASSERT(PAGE_LOCKED(pp) || panicstr);

	if (PP_ISRO(pp) && (flags & HAT_SYNC_STOPON_MOD))
		return (pp->p_nrm & nrmbits);

	if ((flags & HAT_SYNC_ZERORM) == 0) {

		if ((flags & HAT_SYNC_STOPON_REF) != 0 && PP_ISREF(pp))
			return (pp->p_nrm & nrmbits);

		if ((flags & HAT_SYNC_STOPON_MOD) != 0 && PP_ISMOD(pp))
			return (pp->p_nrm & nrmbits);

		if ((flags & HAT_SYNC_STOPON_SHARED) != 0 &&
		    hat_page_getshare(pp) > po_share) {
			if (PP_ISRO(pp))
				PP_SETREF(pp);
			return (pp->p_nrm & nrmbits);
		}
	}

	XPV_DISALLOW_MIGRATE();
next_size:
	/*
	 * walk thru the mapping list syncing (and clearing) ref/mod bits.
	 */
	x86_hm_enter(pp);
	while ((hm = hment_walk(pp, &ht, &entry, hm)) != NULL) {
		if (ht->ht_level < pszc)
			continue;
		old = x86pte_get(ht, entry);
try_again:

		ASSERT(PTE2PFN(old, ht->ht_level) == pp->p_pagenum);

		if (PTE_GET(old, PT_REF | PT_MOD) == 0)
			continue;

		save_old = old;
		if ((flags & HAT_SYNC_ZERORM) != 0) {

			/*
			 * Need to clear ref or mod bits. Need to demap
			 * to make sure any executing TLBs see cleared bits.
			 */
			new = old;
			PTE_CLR(new, PT_REF | PT_MOD);
			old = hati_update_pte(ht, entry, old, new);
			if (old != 0)
				goto try_again;

			old = save_old;
		}

		/*
		 * Sync the PTE
		 */
		if (!(flags & HAT_SYNC_ZERORM) &&
		    PTE_GET(old, PT_SOFTWARE) <= PT_NOSYNC)
			hati_sync_pte_to_page(pp, old, ht->ht_level);

		/*
		 * can stop short if we found a ref'd or mod'd page
		 */
		if ((flags & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp) ||
		    (flags & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)) {
			x86_hm_exit(pp);
			goto done;
		}
	}
	x86_hm_exit(pp);
	while (pszc < pp->p_szc) {
		page_t *tpp;

		pszc++;
		tpp = PP_GROUPLEADER(pp, pszc);
		if (pp != tpp) {
			pp = tpp;
			goto next_size;
		}
	}
done:
	XPV_ALLOW_MIGRATE();
	return (save_pp->p_nrm & nrmbits);
}

/*
 * returns approx number of mappings to this pp.  A return of 0 implies
 * there are no mappings to the page.
 */
ulong_t
hat_page_getshare(page_t *pp)
{
	uint_t cnt;

	cnt = hment_mapcnt(pp);
#if defined(__amd64)
	if (vpm_enable && pp->p_vpmref) {
		cnt += 1;
	}
#endif
	return (cnt);
}

/*
 * Return 1 if the number of mappings exceeds sh_thresh. Return 0
 * otherwise.
 */
int
hat_page_checkshare(page_t *pp, ulong_t sh_thresh)
{
	return (hat_page_getshare(pp) > sh_thresh);
}

/*
 * hat_softlock isn't supported anymore
 */
/*ARGSUSED*/
faultcode_t
hat_softlock(
	hat_t *hat,
	caddr_t addr,
	size_t *len,
	struct page **page_array,
	uint_t flags)
{
	return (FC_NOSUPPORT);
}

/*
 * Routine to expose supported HAT features to platform independent code.
 */
/*ARGSUSED*/
int
hat_supported(enum hat_features feature, void *arg)
{
	switch (feature) {

	case HAT_SHARED_PT:	/* this is really ISM */
		return (1);

	case HAT_DYNAMIC_ISM_UNMAP:
		return (0);

	case HAT_VMODSORT:
		return (1);

	case HAT_SHARED_REGIONS:
		return (0);

	default:
		panic("hat_supported() - unknown feature");
	}
	return (0);
}

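/*
 * Illustrative sketch (not part of the original source): platform
 * independent code keys off these feature queries, e.g.:
 *
 *	if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
 *		... not taken on x86, which returns 0 for this feature ...
 */
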
/*
 * Called when a thread is exiting and has been switched to the kernel AS
 */
void
hat_thread_exit(kthread_t *thd)
{
	ASSERT(thd->t_procp->p_as == &kas);
	XPV_DISALLOW_MIGRATE();
	hat_switch(thd->t_procp->p_as->a_hat);
	XPV_ALLOW_MIGRATE();
}

/*
 * Setup the given brand new hat structure as the new HAT on this cpu's mmu.
 */
/*ARGSUSED*/
void
hat_setup(hat_t *hat, int flags)
{
	XPV_DISALLOW_MIGRATE();
	kpreempt_disable();

	hat_switch(hat);

	kpreempt_enable();
	XPV_ALLOW_MIGRATE();
}

/*
 * Prepare for a CPU private mapping for the given address.
 *
 * The address can only be used from a single CPU and can be remapped
 * using hat_mempte_remap().  Return the address of the PTE.
 *
 * We do the htable_create() if necessary and increment the valid count so
 * the htable can't disappear.  We also hat_devload() the page table into
 * kernel so that the PTE is quickly accessed.
 */
hat_mempte_t
hat_mempte_setup(caddr_t addr)
{
	uintptr_t	va = (uintptr_t)addr;
	htable_t	*ht;
	uint_t		entry;
	x86pte_t	oldpte;
	hat_mempte_t	p;

	ASSERT(IS_PAGEALIGNED(va));
	ASSERT(!IN_VA_HOLE(va));
	++curthread->t_hatdepth;
	XPV_DISALLOW_MIGRATE();
	ht = htable_getpte(kas.a_hat, va, &entry, &oldpte, 0);
	if (ht == NULL) {
		ht = htable_create(kas.a_hat, va, 0, NULL);
		entry = htable_va2entry(va, ht);
		ASSERT(ht->ht_level == 0);
		oldpte = x86pte_get(ht, entry);
	}
	if (PTE_ISVALID(oldpte))
		panic("hat_mempte_setup(): address already mapped "
		    "ht=%p, entry=%d, pte=" FMT_PTE, (void *)ht, entry, oldpte);

	/*
	 * increment ht_valid_cnt so that the pagetable can't disappear
	 */
	HTABLE_INC(ht->ht_valid_cnt);

	/*
	 * return the PTE physical address to the caller.
	 */
	htable_release(ht);
	XPV_ALLOW_MIGRATE();
	p = PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry);
	--curthread->t_hatdepth;
	return (p);
}

/*
 * Release a CPU private mapping for the given address.
 * We decrement the htable valid count so it might be destroyed.
 */
/*ARGSUSED1*/
void
hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa)
{
	htable_t	*ht;

	XPV_DISALLOW_MIGRATE();
	/*
	 * invalidate any left over mapping and decrement the htable valid count
	 */
#ifdef __xpv
	if (HYPERVISOR_update_va_mapping((uintptr_t)addr, 0,
	    UVMF_INVLPG | UVMF_LOCAL))
		panic("HYPERVISOR_update_va_mapping() failed");
#else
	{
		x86pte_t *pteptr;

		pteptr = x86pte_mapin(mmu_btop(pte_pa),
		    (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL);
		if (mmu.pae_hat)
			*pteptr = 0;
		else
			*(x86pte32_t *)pteptr = 0;
		mmu_tlbflush_entry(addr);
		x86pte_mapout();
	}
#endif
	ht = htable_getpte(kas.a_hat, ALIGN2PAGE(addr), NULL, NULL, 0);
	if (ht == NULL)
		panic("hat_mempte_release(): invalid address");
	ASSERT(ht->ht_level == 0);
	HTABLE_DEC(ht->ht_valid_cnt);
	htable_release(ht);
	XPV_ALLOW_MIGRATE();
}

/*
 * Apply a temporary CPU private mapping to a page. We flush the TLB only
 * on this CPU, so this ought to have been called with preemption disabled.
 */
void
hat_mempte_remap(
	pfn_t		pfn,
	caddr_t		addr,
	hat_mempte_t	pte_pa,
	uint_t		attr,
	uint_t		flags)
{
	uintptr_t	va = (uintptr_t)addr;
	x86pte_t	pte;

	/*
	 * Remap the given PTE to the new page's PFN. Invalidate only
	 * on this CPU.
	 */
#ifdef DEBUG
	htable_t	*ht;
	uint_t		entry;

	ASSERT(IS_PAGEALIGNED(va));
	ASSERT(!IN_VA_HOLE(va));
	ht = htable_getpte(kas.a_hat, va, &entry, NULL, 0);
	ASSERT(ht != NULL);
	ASSERT(ht->ht_level == 0);
	ASSERT(ht->ht_valid_cnt > 0);
	ASSERT(ht->ht_pfn == mmu_btop(pte_pa));
	htable_release(ht);
#endif
	XPV_DISALLOW_MIGRATE();
	pte = hati_mkpte(pfn, attr, 0, flags);
#ifdef __xpv
	if (HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG | UVMF_LOCAL))
		panic("HYPERVISOR_update_va_mapping() failed");
#else
	{
		x86pte_t *pteptr;

		pteptr = x86pte_mapin(mmu_btop(pte_pa),
		    (pte_pa & MMU_PAGEOFFSET) >> mmu.pte_size_shift, NULL);
		if (mmu.pae_hat)
			*(x86pte_t *)pteptr = pte;
		else
			*(x86pte32_t *)pteptr = (x86pte32_t)pte;
		mmu_tlbflush_entry(addr);
		x86pte_mapout();
	}
#endif
	XPV_ALLOW_MIGRATE();
}

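/*
 * Illustrative sketch (not part of the original source) of the intended
 * hat_mempte_*() lifecycle for a CPU private page mapping:
 *
 *	hat_mempte_t pte_pa = hat_mempte_setup(cpu_private_va);
 *
 *	kpreempt_disable();
 *	hat_mempte_remap(pfn, cpu_private_va, pte_pa,
 *	    PROT_READ | PROT_WRITE, HAT_STORECACHING_OK);
 *	... access the frame through cpu_private_va on this CPU only ...
 *	kpreempt_enable();
 *
 *	hat_mempte_release(cpu_private_va, pte_pa);
 */
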
/*
 * Hat locking functions
 * XXX - these two functions are currently being used by hatstats
 *	they can be removed by using a per-as mutex for hatstats.
 */
void
hat_enter(hat_t *hat)
{
	mutex_enter(&hat->hat_mutex);
}

void
hat_exit(hat_t *hat)
{
	mutex_exit(&hat->hat_mutex);
}

/*
 * HAT part of cpu initialization.
 */
void
hat_cpu_online(struct cpu *cpup)
{
	if (cpup != CPU) {
		x86pte_cpu_init(cpup);
		hat_vlp_setup(cpup);
	}
	CPUSET_ATOMIC_ADD(khat_cpuset, cpup->cpu_id);
}

/*
 * HAT part of cpu deletion.
 * (currently, we only call this after the cpu is safely passivated.)
 */
void
hat_cpu_offline(struct cpu *cpup)
{
	ASSERT(cpup != CPU);

	CPUSET_ATOMIC_DEL(khat_cpuset, cpup->cpu_id);
	hat_vlp_teardown(cpup);
	x86pte_cpu_fini(cpup);
}

/*
 * Function called after all CPUs are brought online.
 * Used to remove low address boot mappings.
 */
void
clear_boot_mappings(uintptr_t low, uintptr_t high)
{
	uintptr_t vaddr = low;
	htable_t *ht = NULL;
	level_t level;
	uint_t entry;
	x86pte_t pte;

	/*
	 * On 1st CPU we can unload the prom mappings, basically we blow away
	 * all virtual mappings under _userlimit.
	 */
	while (vaddr < high) {
		pte = htable_walk(kas.a_hat, &ht, &vaddr, high);
		if (ht == NULL)
			break;

		level = ht->ht_level;
		entry = htable_va2entry(vaddr, ht);
		ASSERT(level <= mmu.max_page_level);
		ASSERT(PTE_ISPAGE(pte, level));

		/*
		 * Unload the mapping from the page tables.
		 */
		(void) x86pte_inval(ht, entry, 0, NULL);
		ASSERT(ht->ht_valid_cnt > 0);
		HTABLE_DEC(ht->ht_valid_cnt);
		PGCNT_DEC(ht->ht_hat, ht->ht_level);

		vaddr += LEVEL_SIZE(ht->ht_level);
	}
	if (ht)
		htable_release(ht);
}

/*
 * Atomically update a new translation for a single page.  If the
 * currently installed PTE doesn't match the value we expect to find,
 * it's not updated and we return the PTE we found.
 *
 * If activating nosync or NOWRITE and the page was modified we need to sync
 * with the page_t. Also sync with page_t if clearing ref/mod bits.
 */
static x86pte_t
hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected, x86pte_t new)
{
	page_t		*pp;
	uint_t		rm = 0;
	x86pte_t	replaced;

	if (PTE_GET(expected, PT_SOFTWARE) < PT_NOSYNC &&
	    PTE_GET(expected, PT_MOD | PT_REF) &&
	    (PTE_GET(new, PT_NOSYNC) || !PTE_GET(new, PT_WRITABLE) ||
	    !PTE_GET(new, PT_MOD | PT_REF))) {

		ASSERT(!pfn_is_foreign(PTE2PFN(expected, ht->ht_level)));
		pp = page_numtopp_nolock(PTE2PFN(expected, ht->ht_level));
		ASSERT(pp != NULL);
		if (PTE_GET(expected, PT_MOD))
			rm |= P_MOD;
		if (PTE_GET(expected, PT_REF))
			rm |= P_REF;
		PTE_CLR(new, PT_MOD | PT_REF);
	}

	replaced = x86pte_update(ht, entry, expected, new);
	if (replaced != expected)
		return (replaced);

	if (rm) {
		/*
		 * sync to all constituent pages of a large page
		 */
		pgcnt_t pgcnt = page_get_pagecnt(ht->ht_level);

		ASSERT(IS_P2ALIGNED(pp->p_pagenum, pgcnt));
		while (pgcnt-- > 0) {
			/*
			 * hat_page_demote() can't decrease
			 * pszc below this mapping size
			 * since large mapping existed after we
			 * took mlist lock.
			 */
			ASSERT(pp->p_szc >= ht->ht_level);
			hat_page_setattr(pp, rm);
			++pp;
		}
	}

	return (0);
}

/* ARGSUSED */
void
hat_join_srd(struct hat *hat, vnode_t *evp)
{
}

/* ARGSUSED */
hat_region_cookie_t
hat_join_region(struct hat *hat,
    caddr_t r_saddr,
    size_t r_size,
    void *r_obj,
    u_offset_t r_objoff,
    uchar_t r_perm,
    uchar_t r_pgszc,
    hat_rgn_cb_func_t r_cb_function,
    uint_t flags)
{
	panic("No shared region support on x86");
	return (HAT_INVALID_REGION_COOKIE);
}

/* ARGSUSED */
void
hat_leave_region(struct hat *hat, hat_region_cookie_t rcookie, uint_t flags)
{
	panic("No shared region support on x86");
}

/* ARGSUSED */
void
hat_dup_region(struct hat *hat, hat_region_cookie_t rcookie)
{
	panic("No shared region support on x86");
}

/*
 * Kernel Physical Mapping (kpm) facility
 *
 * Most of the routines needed to support segkpm are almost no-ops on the
 * x86 platform.  We map in the entire segment when it is created and leave
 * it mapped in, so there is no additional work required to set up and tear
 * down individual mappings.  All of these routines were created to support
 * SPARC platforms that have to avoid aliasing in their virtually indexed
 * caches.
 *
 * Most of the routines have sanity checks in them (e.g. verifying that the
 * passed-in page is locked).  We don't actually care about most of these
 * checks on x86, but we leave them in place to identify problems in the
 * upper layer code.
 */

/*
 * Map in a locked page and return the vaddr.
 */
/*ARGSUSED*/
caddr_t
hat_kpm_mapin(struct page *pp, struct kpme *kpme)
{
	caddr_t		vaddr;

	if (kpm_enable == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapin: kpm_enable not set\n");
		return ((caddr_t)NULL);
	}

	if (pp == NULL || PAGE_LOCKED(pp) == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapin: pp zero or not locked\n");
		return ((caddr_t)NULL);
	}

	vaddr = hat_kpm_page2va(pp, 1);

	return (vaddr);
}

/*
 * Mapout a locked page.
 */
/*ARGSUSED*/
void
hat_kpm_mapout(struct page *pp, struct kpme *kpme, caddr_t vaddr)
{
	if (kpm_enable == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapout: kpm_enable not set\n");
		return;
	}

	if (IS_KPM_ADDR(vaddr) == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapout: no kpm address\n");
		return;
	}

	if (pp == NULL || PAGE_LOCKED(pp) == 0) {
		cmn_err(CE_WARN, "hat_kpm_mapout: page zero or not locked\n");
		return;
	}
}

/*
 * hat_kpm_mapin_pfn is used to obtain a kpm mapping for physical
 * memory addresses that are not described by a page_t.  It can
 * also be used for normal pages that are not locked, but beware
 * this is dangerous - no locking is performed, so the identity of
 * the page could change.  hat_kpm_mapin_pfn is not supported when
 * vac_colors > 1, because the chosen va depends on the page identity,
 * which could change.
 * The caller must only pass pfn's for valid physical addresses; violation
 * of this rule will cause panic.
 */
caddr_t
hat_kpm_mapin_pfn(pfn_t pfn)
{
	caddr_t paddr, vaddr;

	if (kpm_enable == 0)
		return ((caddr_t)NULL);

	paddr = (caddr_t)ptob(pfn);
	vaddr = (uintptr_t)kpm_vbase + paddr;

	return ((caddr_t)vaddr);
}

/*ARGSUSED*/
void
hat_kpm_mapout_pfn(pfn_t pfn)
{
}

/*
 * Return the kpm virtual address for a specific pfn
 */
caddr_t
hat_kpm_pfn2va(pfn_t pfn)
{
	uintptr_t vaddr = (uintptr_t)kpm_vbase + mmu_ptob(pfn);

	ASSERT(!pfn_is_foreign(pfn));
	return ((caddr_t)vaddr);
}

/*
 * Return the kpm virtual address for the page at pp.
 */
/*ARGSUSED*/
caddr_t
hat_kpm_page2va(struct page *pp, int checkswap)
{
	return (hat_kpm_pfn2va(pp->p_pagenum));
}

/*
 * Return the page frame number for the kpm virtual address vaddr.
 */
pfn_t
hat_kpm_va2pfn(caddr_t vaddr)
{
	pfn_t		pfn;

	ASSERT(IS_KPM_ADDR(vaddr));

	pfn = (pfn_t)btop(vaddr - kpm_vbase);

	return (pfn);
}

/*
 * Return the page for the kpm virtual address vaddr.
 */
page_t *
hat_kpm_vaddr2page(caddr_t vaddr)
{
	pfn_t		pfn;

	ASSERT(IS_KPM_ADDR(vaddr));

	pfn = hat_kpm_va2pfn(vaddr);

	return (page_numtopp_nolock(pfn));
}

/*
 * hat_kpm_fault is called from segkpm_fault when we take a page fault on a
 * KPM page.  This should never happen on x86
 */
int
hat_kpm_fault(hat_t *hat, caddr_t vaddr)
{
	panic("pagefault in seg_kpm.  hat: 0x%p  vaddr: 0x%p",
	    (void *)hat, (void *)vaddr);
	return (0);
}

/*ARGSUSED*/
void
hat_kpm_mseghash_clear(int nentries)
{
}

/*ARGSUSED*/
void
hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp)
{
}

void
hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs,
	offset_t kpm_pages_off)
{
	_NOTE(ARGUNUSED(nkpmpgs, kpm_pages_off));
	pfn_t base, end;

	/*
	 * kphysm_add_memory_dynamic() does not set nkpmpgs
	 * when page_t memory is externally allocated.  That
	 * code must properly calculate nkpmpgs in all cases
	 * if nkpmpgs needs to be used at some point.
	 */

	/*
	 * The meta (page_t) pages for dynamically added memory are allocated
	 * either from the incoming memory itself or from existing memory.
	 * In the former case the base of the incoming pages will be different
	 * than the base of the dynamic segment so call memseg_get_start() to
	 * get the actual base of the incoming memory for each case.
	 */

	base = memseg_get_start(msp);
	end = msp->pages_end;

	hat_devload(kas.a_hat, kpm_vbase + mmu_ptob(base),
	    mmu_ptob(end - base), base, PROT_READ | PROT_WRITE,
	    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
}

void
hat_kpm_addmem_mseg_insert(struct memseg *msp)
{
	_NOTE(ARGUNUSED(msp));
}

void
hat_kpm_addmem_memsegs_update(struct memseg *msp)
{
	_NOTE(ARGUNUSED(msp));
}

/*
 * Return end of metadata for an already setup memseg.
 * X86 platforms don't need per-page meta data to support kpm.
 */
caddr_t
hat_kpm_mseg_reuse(struct memseg *msp)
{
	return ((caddr_t)msp->epages);
}

void
hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp)
{
	_NOTE(ARGUNUSED(msp, mspp));
	ASSERT(0);
}

void
hat_kpm_split_mseg_update(struct memseg *msp, struct memseg **mspp,
	struct memseg *lo, struct memseg *mid, struct memseg *hi)
{
	_NOTE(ARGUNUSED(msp, mspp, lo, mid, hi));
	ASSERT(0);
}

/*
 * Walk the memsegs chain, applying func to each memseg span.
 */
void
hat_kpm_walk(void (*func)(void *, void *, size_t), void *arg)
{
	pfn_t	pbase, pend;
	void	*base;
	size_t	size;
	struct memseg *msp;

	for (msp = memsegs; msp; msp = msp->next) {
		pbase = msp->pages_base;
		pend = msp->pages_end;
		base = ptob(pbase) + kpm_vbase;
		size = ptob(pend - pbase);
		func(arg, base, size);
	}
}

#ifdef __xpv
/*
 * There are specific Hypervisor calls to establish and remove mappings
 * to grant table references and the privcmd driver. We have to ensure
 * that a page table actually exists.
 */
void
hat_prepare_mapping(hat_t *hat, caddr_t addr, uint64_t *pte_ma)
{
	maddr_t base_ma;
	htable_t *ht;
	uint_t entry;

	ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
	XPV_DISALLOW_MIGRATE();
	ht = htable_create(hat, (uintptr_t)addr, 0, NULL);

	/*
	 * if an address for pte_ma is passed in, return the MA of the pte
	 * for this specific address.  This address is only valid as long
	 * as the htable stays locked.
	 */
	if (pte_ma != NULL) {
		entry = htable_va2entry((uintptr_t)addr, ht);
		base_ma = pa_to_ma(ptob(ht->ht_pfn));
		*pte_ma = base_ma + (entry << mmu.pte_size_shift);
	}
	XPV_ALLOW_MIGRATE();
}

void
hat_release_mapping(hat_t *hat, caddr_t addr)
{
	htable_t *ht;

	ASSERT(IS_P2ALIGNED((uintptr_t)addr, MMU_PAGESIZE));
	XPV_DISALLOW_MIGRATE();
	ht = htable_lookup(hat, (uintptr_t)addr, 0);
	ASSERT(ht != NULL);
	ASSERT(ht->ht_busy >= 2);
	htable_release(ht);
	htable_release(ht);
	XPV_ALLOW_MIGRATE();
}
#endif	/* __xpv */