/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Yaniv Kamay  <yaniv@qumranet.com>
 * Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/highmem.h>
#include <linux/module.h>

#include <asm/cmpxchg.h>
#ifdef AUDIT
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
#endif

#ifdef MMU_DEBUG
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#endif
#if defined(MMU_DEBUG) || defined(AUDIT)
static int dbg = 1;
#endif

#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
#define ASSERT(x)							\
	if (!(x)) {							\
		printk(KERN_WARNING "assertion failed %s:%d: %s\n",	\
		       __FILE__, __LINE__, #x);				\
	}
#endif
#define PT64_PT_BITS 9
#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
#define PT32_PT_BITS 10
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)

#define PT_WRITABLE_SHIFT 1

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_MASK (1ULL << 63)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK \
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)

#define PT64_LEVEL_MASK(level) \
		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level) \
	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LEVEL_MASK(level) \
		(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level) \
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#define PT64_DIR_BASE_ADDR_MASK \
	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)
#define PFERR_FETCH_MASK (1U << 4)

#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
#define RMAP_EXT 4

struct kvm_rmap_desc {
	u64 *shadow_ptes[RMAP_EXT];
	struct kvm_rmap_desc *more;
};

static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_cache;
static struct kmem_cache *mmu_page_header_cache;
static int is_write_protection(struct kvm_vcpu *vcpu)
{
	return vcpu->cr0 & CR0_WP_MASK;
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static int is_nx(struct kvm_vcpu *vcpu)
{
	return vcpu->shadow_efer & EFER_NX;
}

static int is_present_pte(unsigned long pte)
{
	return pte & PT_PRESENT_MASK;
}

static int is_writeble_pte(unsigned long pte)
{
	return pte & PT_WRITABLE_MASK;
}

static int is_io_pte(unsigned long pte)
{
	return pte & PT_SHADOW_IO_MARK;
}

static int is_rmap_pte(u64 pte)
{
	return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
		== (PT_WRITABLE_MASK | PT_PRESENT_MASK);
}
static void set_shadow_pte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
	set_64bit((unsigned long *)sptep, spte);
#else
	set_64bit((unsigned long long *)sptep, spte);
#endif
}
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				  struct kmem_cache *base_cache, int min,
				  gfp_t gfp_flags)
{
	void *obj;

	if (cache->nobjs >= min)
		return 0;
	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
		obj = kmem_cache_zalloc(base_cache, gfp_flags);
		if (!obj)
			return -ENOMEM;
		cache->objects[cache->nobjs++] = obj;
	}
	return 0;
}

static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs)
		kfree(mc->objects[--mc->nobjs]);
}
static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
{
	int r;

	r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
				   pte_chain_cache, 4, gfp_flags);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
				   rmap_desc_cache, 1, gfp_flags);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->mmu_page_cache,
				   mmu_page_cache, 4, gfp_flags);
	if (r)
		goto out;
	r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
				   mmu_page_header_cache, 4, gfp_flags);
out:
	return r;
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
	int r;

	r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT);
	if (r < 0) {
		spin_unlock(&vcpu->kvm->lock);
		kvm_arch_ops->vcpu_put(vcpu);
		r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
		kvm_arch_ops->vcpu_load(vcpu);
		spin_lock(&vcpu->kvm->lock);
	}
	return r;
}
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
	mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
	mmu_free_memory_cache(&vcpu->mmu_page_cache);
	mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
}
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
				    size_t size)
{
	void *p;

	p = mc->objects[--mc->nobjs];
	return p;
}

static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
				      sizeof(struct kvm_pte_chain));
}

static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
	kfree(pc);
}

static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
	return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
				      sizeof(struct kvm_rmap_desc));
}

static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
	kfree(rd);
}
/*
 * Reverse mapping data structures:
 *
 * If page->private bit zero is zero, then page->private points to the
 * shadow page table entry that points to page_address(page).
 *
 * If page->private bit zero is one, (then page->private & ~1) points
 * to a struct kvm_rmap_desc containing more mappings.
 */
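
/*
 * Illustrative sketch (not part of the original file): the tagged-pointer
 * encoding described above could be decoded with helpers along these lines.
 * The names rmap_is_desc() and rmap_desc_of() are hypothetical and are shown
 * only to make the bit-zero convention concrete.
 */
static inline int rmap_is_desc(struct page *page)
{
	/* bit zero set: private points to a kvm_rmap_desc, not a single spte */
	return page_private(page) & 1;
}

static inline struct kvm_rmap_desc *rmap_desc_of(struct page *page)
{
	/* mask off the tag bit to recover the descriptor pointer */
	return (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
}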
static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
{
	struct page *page;
	struct kvm_rmap_desc *desc;
	int i;

	if (!is_rmap_pte(*spte))
		return;
	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
	if (!page_private(page)) {
		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
		set_page_private(page, (unsigned long)spte);
	} else if (!(page_private(page) & 1)) {
		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_rmap_desc(vcpu);
		desc->shadow_ptes[0] = (u64 *)page_private(page);
		desc->shadow_ptes[1] = spte;
		set_page_private(page, (unsigned long)desc | 1);
	} else {
		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
		while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
			desc = desc->more;
		if (desc->shadow_ptes[RMAP_EXT-1]) {
			desc->more = mmu_alloc_rmap_desc(vcpu);
			desc = desc->more;
		}
		for (i = 0; desc->shadow_ptes[i]; ++i)
			;
		desc->shadow_ptes[i] = spte;
	}
}
static void rmap_desc_remove_entry(struct page *page,
				   struct kvm_rmap_desc *desc,
				   int i,
				   struct kvm_rmap_desc *prev_desc)
{
	int j;

	for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
		;
	desc->shadow_ptes[i] = desc->shadow_ptes[j];
	desc->shadow_ptes[j] = NULL;
	if (j != 0)
		return;
	if (!prev_desc && !desc->more)
		set_page_private(page, (unsigned long)desc->shadow_ptes[0]);
	else if (prev_desc)
		prev_desc->more = desc->more;
	else
		set_page_private(page, (unsigned long)desc->more | 1);
	mmu_free_rmap_desc(desc);
}
static void rmap_remove(u64 *spte)
{
	struct page *page;
	struct kvm_rmap_desc *desc;
	struct kvm_rmap_desc *prev_desc;
	int i;

	if (!is_rmap_pte(*spte))
		return;
	page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
	if (!page_private(page)) {
		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
	} else if (!(page_private(page) & 1)) {
		rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
		if ((u64 *)page_private(page) != spte) {
			printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
			       spte, *spte);
		}
		set_page_private(page, 0);
	} else {
		rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
		desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
				if (desc->shadow_ptes[i] == spte) {
					rmap_desc_remove_entry(page, desc,
							       i, prev_desc);
					return;
				}
			prev_desc = desc;
			desc = desc->more;
		}
	}
}
static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page;
	struct kvm_rmap_desc *desc;
	u64 *spte;

	page = gfn_to_page(kvm, gfn);

	while (page_private(page)) {
		if (!(page_private(page) & 1))
			spte = (u64 *)page_private(page);
		else {
			desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
			spte = desc->shadow_ptes[0];
		}
		BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
		       != page_to_pfn(page));
		BUG_ON(!(*spte & PT_PRESENT_MASK));
		BUG_ON(!(*spte & PT_WRITABLE_MASK));
		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
		rmap_remove(spte);
		set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
		kvm_flush_remote_tlbs(vcpu->kvm);
	}
}
static int is_empty_shadow_page(u64 *spt)
{
	u64 *pos;
	u64 *end;

	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
		if (*pos != 0) {
			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
			       pos, *pos);
			return 0;
		}
	return 1;
}
static void kvm_mmu_free_page(struct kvm *kvm,
			      struct kvm_mmu_page *page_head)
{
	ASSERT(is_empty_shadow_page(page_head->spt));
	list_del(&page_head->link);
	kfree(page_head->spt);
	kfree(page_head);
	++kvm->n_free_mmu_pages;
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return gfn;
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
					       u64 *parent_pte)
{
	struct kvm_mmu_page *page;

	if (!vcpu->kvm->n_free_mmu_pages)
		return NULL;

	page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
				      sizeof *page);
	page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
	set_page_private(virt_to_page(page->spt), (unsigned long)page);
	list_add(&page->link, &vcpu->kvm->active_mmu_pages);
	ASSERT(is_empty_shadow_page(page->spt));
	page->slot_bitmap = 0;
	page->multimapped = 0;
	page->parent_pte = parent_pte;
	--vcpu->kvm->n_free_mmu_pages;
	return page;
}
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *page, u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!page->multimapped) {
		u64 *old = page->parent_pte;

		if (!old) {
			page->parent_pte = parent_pte;
			return;
		}
		page->multimapped = 1;
		pte_chain = mmu_alloc_pte_chain(vcpu);
		INIT_HLIST_HEAD(&page->parent_ptes);
		hlist_add_head(&pte_chain->link, &page->parent_ptes);
		pte_chain->parent_ptes[0] = old;
	}
	hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
			continue;
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
			if (!pte_chain->parent_ptes[i]) {
				pte_chain->parent_ptes[i] = parent_pte;
				return;
			}
	}
	pte_chain = mmu_alloc_pte_chain(vcpu);
	hlist_add_head(&pte_chain->link, &page->parent_ptes);
	pte_chain->parent_ptes[0] = parent_pte;
}
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
				       u64 *parent_pte)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!page->multimapped) {
		BUG_ON(page->parent_pte != parent_pte);
		page->parent_pte = NULL;
		return;
	}
	hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
			if (!pte_chain->parent_ptes[i])
				break;
			if (pte_chain->parent_ptes[i] != parent_pte)
				continue;
			while (i + 1 < NR_PTE_CHAIN_ENTRIES
			       && pte_chain->parent_ptes[i + 1]) {
				pte_chain->parent_ptes[i]
					= pte_chain->parent_ptes[i + 1];
				++i;
			}
			pte_chain->parent_ptes[i] = NULL;
			if (i == 0) {
				hlist_del(&pte_chain->link);
				mmu_free_pte_chain(pte_chain);
				if (hlist_empty(&page->parent_ptes)) {
					page->multimapped = 0;
					page->parent_pte = NULL;
				}
			}
			return;
		}
}
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
						gfn_t gfn)
{
	unsigned index;
	struct hlist_head *bucket;
	struct kvm_mmu_page *page;
	struct hlist_node *node;

	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &vcpu->kvm->mmu_page_hash[index];
	hlist_for_each_entry(page, node, bucket, hash_link)
		if (page->gfn == gfn && !page->role.metaphysical) {
			pgprintk("%s: found role %x\n",
				 __FUNCTION__, page->role.word);
			return page;
		}
	return NULL;
}
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					     gfn_t gfn,
					     gva_t gaddr,
					     unsigned level,
					     int metaphysical,
					     unsigned hugepage_access,
					     u64 *parent_pte)
{
	union kvm_mmu_page_role role;
	unsigned index;
	unsigned quadrant;
	struct hlist_head *bucket;
	struct kvm_mmu_page *page;
	struct hlist_node *node;

	role.word = 0;
	role.glevels = vcpu->mmu.root_level;
	role.level = level;
	role.metaphysical = metaphysical;
	role.hugepage_access = hugepage_access;
	if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
		role.quadrant = quadrant;
	}
	pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
		 gfn, role.word);
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &vcpu->kvm->mmu_page_hash[index];
	hlist_for_each_entry(page, node, bucket, hash_link)
		if (page->gfn == gfn && page->role.word == role.word) {
			mmu_page_add_parent_pte(vcpu, page, parent_pte);
			pgprintk("%s: found\n", __FUNCTION__);
			return page;
		}
	page = kvm_mmu_alloc_page(vcpu, parent_pte);
	if (!page)
		return page;
	pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
	page->gfn = gfn;
	page->role = role;
	hlist_add_head(&page->hash_link, bucket);
	if (!metaphysical)
		rmap_write_protect(vcpu, gfn);
	return page;
}
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
					 struct kvm_mmu_page *page)
{
	unsigned i;
	u64 *pt;
	u64 ent;

	pt = page->spt;

	if (page->role.level == PT_PAGE_TABLE_LEVEL) {
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
			if (pt[i] & PT_PRESENT_MASK)
				rmap_remove(&pt[i]);
			pt[i] = 0;
		}
		kvm_flush_remote_tlbs(kvm);
		return;
	}

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

		pt[i] = 0;
		if (!(ent & PT_PRESENT_MASK))
			continue;
		ent &= PT64_BASE_ADDR_MASK;
		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
	}
	kvm_flush_remote_tlbs(kvm);
}
static void kvm_mmu_put_page(struct kvm_mmu_page *page,
			     u64 *parent_pte)
{
	mmu_page_remove_parent_pte(page, parent_pte);
}

static void kvm_mmu_zap_page(struct kvm *kvm,
			     struct kvm_mmu_page *page)
{
	u64 *parent_pte;

	while (page->multimapped || page->parent_pte) {
		if (!page->multimapped)
			parent_pte = page->parent_pte;
		else {
			struct kvm_pte_chain *chain;

			chain = container_of(page->parent_ptes.first,
					     struct kvm_pte_chain, link);
			parent_pte = chain->parent_ptes[0];
		}
		kvm_mmu_put_page(page, parent_pte);
		set_shadow_pte(parent_pte, 0);
	}
	kvm_mmu_page_unlink_children(kvm, page);
	if (!page->root_count) {
		hlist_del(&page->hash_link);
		kvm_mmu_free_page(kvm, page);
	} else
		list_move(&page->link, &kvm->active_mmu_pages);
}
static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	unsigned index;
	struct hlist_head *bucket;
	struct kvm_mmu_page *page;
	struct hlist_node *node, *n;
	int r;

	pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
	r = 0;
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &vcpu->kvm->mmu_page_hash[index];
	hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
		if (page->gfn == gfn && !page->role.metaphysical) {
			pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
				 page->role.word);
			kvm_mmu_zap_page(vcpu->kvm, page);
			r = 1;
		}
	return r;
}
static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_mmu_page *page;

	while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
		pgprintk("%s: zap %lx %x\n",
			 __FUNCTION__, gfn, page->role.word);
		kvm_mmu_zap_page(vcpu->kvm, page);
	}
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{
	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
	struct kvm_mmu_page *page_head = page_header(__pa(pte));

	__set_bit(slot, &page_head->slot_bitmap);
}
hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	hpa_t hpa = gpa_to_hpa(vcpu, gpa);

	return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
}

hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	struct page *page;

	ASSERT((gpa & HPA_ERR_MASK) == 0);
	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
	if (!page)
		return gpa | HPA_ERR_MASK;
	return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
		| (gpa & (PAGE_SIZE-1));
}
hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);

	if (gpa == UNMAPPED_GVA)
		return UNMAPPED_GVA;
	return gpa_to_hpa(vcpu, gpa);
}

struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);

	if (gpa == UNMAPPED_GVA)
		return NULL;
	return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
{
	int level = PT32E_ROOT_LEVEL;
	hpa_t table_addr = vcpu->mmu.root_hpa;

	for (; ; level--) {
		u32 index = PT64_INDEX(v, level);
		u64 *table;
		u64 pte;

		ASSERT(VALID_PAGE(table_addr));
		table = __va(table_addr);

		if (level == 1) {
			pte = table[index];
			if (is_present_pte(pte) && is_writeble_pte(pte))
				return 0;
			mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
			page_header_update_slot(vcpu->kvm, table, v);
			table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
								PT_USER_MASK;
			rmap_add(vcpu, &table[index]);
			return 0;
		}

		if (table[index] == 0) {
			struct kvm_mmu_page *new_table;
			gfn_t pseudo_gfn;

			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
				>> PAGE_SHIFT;
			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
						     v, level - 1,
						     1, 0, &table[index]);
			if (!new_table) {
				pgprintk("nonpaging_map: ENOMEM\n");
				return -ENOMEM;
			}

			table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
				| PT_WRITABLE_MASK | PT_USER_MASK;
		}
		table_addr = table[index] & PT64_BASE_ADDR_MASK;
	}
}
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_mmu_page *page;

	if (!VALID_PAGE(vcpu->mmu.root_hpa))
		return;
	if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->mmu.root_hpa;

		page = page_header(root);
		--page->root_count;
		vcpu->mmu.root_hpa = INVALID_PAGE;
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->mmu.pae_root[i];

		if (root) {
			root &= PT64_BASE_ADDR_MASK;
			page = page_header(root);
			--page->root_count;
		}
		vcpu->mmu.pae_root[i] = INVALID_PAGE;
	}
	vcpu->mmu.root_hpa = INVALID_PAGE;
}
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
	int i;
	gfn_t root_gfn;
	struct kvm_mmu_page *page;

	root_gfn = vcpu->cr3 >> PAGE_SHIFT;

	if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
		hpa_t root = vcpu->mmu.root_hpa;

		ASSERT(!VALID_PAGE(root));
		page = kvm_mmu_get_page(vcpu, root_gfn, 0,
					PT64_ROOT_LEVEL, 0, 0, NULL);
		root = __pa(page->spt);
		++page->root_count;
		vcpu->mmu.root_hpa = root;
		return;
	}
	for (i = 0; i < 4; ++i) {
		hpa_t root = vcpu->mmu.pae_root[i];

		ASSERT(!VALID_PAGE(root));
		if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
			if (!is_present_pte(vcpu->pdptrs[i])) {
				vcpu->mmu.pae_root[i] = 0;
				continue;
			}
			root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
		} else if (vcpu->mmu.root_level == 0)
			root_gfn = 0;
		page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
					PT32_ROOT_LEVEL, !is_paging(vcpu),
					0, NULL);
		root = __pa(page->spt);
		++page->root_count;
		vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
	}
	vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
	return vaddr;
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
				u32 error_code)
{
	gpa_t addr = gva;
	hpa_t paddr;
	int r;

	r = mmu_topup_memory_caches(vcpu);
	if (r)
		return r;

	ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));

	paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);

	if (is_error_hpa(paddr))
		return 1;

	return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
}
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->mmu;

	context->new_cr3 = nonpaging_new_cr3;
	context->page_fault = nonpaging_page_fault;
	context->gva_to_gpa = nonpaging_gva_to_gpa;
	context->free = nonpaging_free;
	context->root_level = 0;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	return 0;
}
static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.tlb_flush;
	kvm_arch_ops->tlb_flush(vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
	pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
	mmu_free_roots(vcpu);
}
static void inject_page_fault(struct kvm_vcpu *vcpu,
			      u64 addr,
			      u32 err_code)
{
	kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
}

static void paging_free(struct kvm_vcpu *vcpu)
{
	nonpaging_free(vcpu);
}
971 #include "paging_tmpl.h"
975 #include "paging_tmpl.h"
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
	struct kvm_mmu *context = &vcpu->mmu;

	ASSERT(is_pae(vcpu));
	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging64_page_fault;
	context->gva_to_gpa = paging64_gva_to_gpa;
	context->free = paging_free;
	context->root_level = level;
	context->shadow_root_level = level;
	context->root_hpa = INVALID_PAGE;
	return 0;
}
static int paging64_init_context(struct kvm_vcpu *vcpu)
{
	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu *context = &vcpu->mmu;

	context->new_cr3 = paging_new_cr3;
	context->page_fault = paging32_page_fault;
	context->gva_to_gpa = paging32_gva_to_gpa;
	context->free = paging_free;
	context->root_level = PT32_ROOT_LEVEL;
	context->shadow_root_level = PT32E_ROOT_LEVEL;
	context->root_hpa = INVALID_PAGE;
	return 0;
}
static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
	ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));

	if (!is_paging(vcpu))
		return nonpaging_init_context(vcpu);
	else if (is_long_mode(vcpu))
		return paging64_init_context(vcpu);
	else if (is_pae(vcpu))
		return paging32E_init_context(vcpu);
	else
		return paging32_init_context(vcpu);
}
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
	if (VALID_PAGE(vcpu->mmu.root_hpa)) {
		vcpu->mmu.free(vcpu);
		vcpu->mmu.root_hpa = INVALID_PAGE;
	}
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	return init_kvm_mmu(vcpu);
}
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
	int r;

	spin_lock(&vcpu->kvm->lock);
	r = mmu_topup_memory_caches(vcpu);
	if (r)
		goto out;
	mmu_alloc_roots(vcpu);
	kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
	kvm_mmu_flush_tlb(vcpu);
out:
	spin_unlock(&vcpu->kvm->lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
	mmu_free_roots(vcpu);
}
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *page,
				  u64 *spte)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
	if (is_present_pte(pte)) {
		if (page->role.level == PT_PAGE_TABLE_LEVEL)
			rmap_remove(spte);
		else {
			child = page_header(pte & PT64_BASE_ADDR_MASK);
			mmu_page_remove_parent_pte(child, spte);
		}
	}
	*spte = 0;
	kvm_flush_remote_tlbs(vcpu->kvm);
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *page,
				  u64 *spte,
				  const void *new, int bytes)
{
	if (page->role.level != PT_PAGE_TABLE_LEVEL)
		return;

	if (page->role.glevels == PT32_ROOT_LEVEL)
		paging32_update_pte(vcpu, page, spte, new, bytes);
	else
		paging64_update_pte(vcpu, page, spte, new, bytes);
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		       const u8 *old, const u8 *new, int bytes)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm_mmu_page *page;
	struct hlist_node *node, *n;
	struct hlist_head *bucket;
	unsigned index;
	u64 *spte;
	unsigned offset = offset_in_page(gpa);
	unsigned pte_size;
	unsigned page_offset;
	unsigned misaligned;
	unsigned quadrant;
	int level;
	int flooded = 0;
	int npte;

	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
	if (gfn == vcpu->last_pt_write_gfn) {
		++vcpu->last_pt_write_count;
		if (vcpu->last_pt_write_count >= 3)
			flooded = 1;
	} else {
		vcpu->last_pt_write_gfn = gfn;
		vcpu->last_pt_write_count = 1;
	}
	index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
	bucket = &vcpu->kvm->mmu_page_hash[index];
	hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
		if (page->gfn != gfn || page->role.metaphysical)
			continue;
		pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
		misaligned |= bytes < 4;
		if (misaligned || flooded) {
			/*
			 * Misaligned accesses are too much trouble to fix
			 * up; also, they usually indicate a page is not used
			 * as a page table.
			 *
			 * If we're seeing too many writes to a page,
			 * it may no longer be a page table, or we may be
			 * forking, in which case it is better to unmap the
			 * page.
			 */
			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
				 gpa, bytes, page->role.word);
			kvm_mmu_zap_page(vcpu->kvm, page);
			continue;
		}
		page_offset = offset;
		level = page->role.level;
		npte = 1;
		if (page->role.glevels == PT32_ROOT_LEVEL) {
			page_offset <<= 1;	/* 32->64 */
			/*
			 * A 32-bit pde maps 4MB while the shadow pdes map
			 * only 2MB.  So we need to double the offset again
			 * and zap two pdes instead of one.
			 */
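			/*
			 * Editorial note (not in the original source), as a
			 * worked example: a guest 32-bit pde at byte offset
			 * 0x10 (the 5th pde) covers guest VA 16MB..20MB.
			 * After the first doubling it sits at 0x20 in 64-bit
			 * units, and after the second doubling the write
			 * lands on the pair of shadow pdes at 0x40 and 0x48,
			 * which together cover that same 16MB..20MB range.
			 */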
			if (level == PT32_ROOT_LEVEL) {
				page_offset &= ~7; /* kill rounding error */
				page_offset <<= 1;
				npte = 2;
			}
			quadrant = page_offset >> PAGE_SHIFT;
			page_offset &= ~PAGE_MASK;
			if (quadrant != page->role.quadrant)
				continue;
		}
		spte = &page->spt[page_offset / sizeof(*spte)];
		while (npte--) {
			mmu_pte_write_zap_pte(vcpu, page, spte);
			mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
			++spte;
		}
	}
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);

	return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
}
void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
	while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
		struct kvm_mmu_page *page;

		page = container_of(vcpu->kvm->active_mmu_pages.prev,
				    struct kvm_mmu_page, link);
		kvm_mmu_zap_page(vcpu->kvm, page);
	}
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *page;

	while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
		page = container_of(vcpu->kvm->active_mmu_pages.next,
				    struct kvm_mmu_page, link);
		kvm_mmu_zap_page(vcpu->kvm, page);
	}
	free_page((unsigned long)vcpu->mmu.pae_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
	struct page *page;
	int i;

	vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;

	/*
	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
	 * Therefore we need to allocate shadow page tables in the first
	 * 4GB of memory, which happens to fit the DMA32 zone.
	 */
	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
	if (!page)
		goto error_1;
	vcpu->mmu.pae_root = page_address(page);
	for (i = 0; i < 4; ++i)
		vcpu->mmu.pae_root[i] = INVALID_PAGE;

	return 0;

error_1:
	free_mmu_pages(vcpu);
	return -ENOMEM;
}
*vcpu
)
1245 ASSERT(!VALID_PAGE(vcpu
->mmu
.root_hpa
));
1247 return alloc_mmu_pages(vcpu
);
1250 int kvm_mmu_setup(struct kvm_vcpu
*vcpu
)
1253 ASSERT(!VALID_PAGE(vcpu
->mmu
.root_hpa
));
1255 return init_kvm_mmu(vcpu
);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
	destroy_kvm_mmu(vcpu);
	free_mmu_pages(vcpu);
	mmu_free_memory_caches(vcpu);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
	struct kvm_mmu_page *page;

	list_for_each_entry(page, &kvm->active_mmu_pages, link) {
		int i;
		u64 *pt;

		if (!test_bit(slot, &page->slot_bitmap))
			continue;

		pt = page->spt;
		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
			if (pt[i] & PT_WRITABLE_MASK) {
				rmap_remove(&pt[i]);
				pt[i] &= ~PT_WRITABLE_MASK;
			}
	}
}
void kvm_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *page, *node;

	list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
		kvm_mmu_zap_page(kvm, page);

	kvm_flush_remote_tlbs(kvm);
}
void kvm_mmu_module_exit(void)
{
	if (pte_chain_cache)
		kmem_cache_destroy(pte_chain_cache);
	if (rmap_desc_cache)
		kmem_cache_destroy(rmap_desc_cache);
	if (mmu_page_cache)
		kmem_cache_destroy(mmu_page_cache);
	if (mmu_page_header_cache)
		kmem_cache_destroy(mmu_page_header_cache);
}
int kvm_mmu_module_init(void)
{
	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
					    sizeof(struct kvm_pte_chain),
					    0, 0, NULL);
	if (!pte_chain_cache)
		goto nomem;
	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
					    sizeof(struct kvm_rmap_desc),
					    0, 0, NULL);
	if (!rmap_desc_cache)
		goto nomem;

	mmu_page_cache = kmem_cache_create("kvm_mmu_page",
					   PAGE_SIZE,
					   PAGE_SIZE, 0, NULL);
	if (!mmu_page_cache)
		goto nomem;

	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
						  sizeof(struct kvm_mmu_page),
						  0, 0, NULL);
	if (!mmu_page_header_cache)
		goto nomem;

	return 0;

nomem:
	kvm_mmu_module_exit();
	return -ENOMEM;
}
#ifdef AUDIT

static const char *audit_msg;

static gva_t canonicalize(gva_t gva)
{
#ifdef CONFIG_X86_64
	gva = (long long)(gva << 16) >> 16;
#endif
	return gva;
}
static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
				gva_t va, int level)
{
	u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
	int i;
	gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
		u64 ent = pt[i];

		if (!(ent & PT_PRESENT_MASK))
			continue;

		va = canonicalize(va);
		if (level > 1)
			audit_mappings_page(vcpu, ent, va, level - 1);
		else {
			gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
			hpa_t hpa = gpa_to_hpa(vcpu, gpa);

			if ((ent & PT_PRESENT_MASK)
			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
				printk(KERN_ERR "audit error: (%s) levels %d"
				       " gva %lx gpa %llx hpa %llx ent %llx\n",
				       audit_msg, vcpu->mmu.root_level,
				       va, gpa, hpa, ent);
		}
	}
}
static void audit_mappings(struct kvm_vcpu *vcpu)
{
	int i;

	if (vcpu->mmu.root_level == 4)
		audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
	else
		for (i = 0; i < 4; ++i)
			if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
				audit_mappings_page(vcpu,
						    vcpu->mmu.pae_root[i],
						    i << 30,
						    2);
}

static int count_rmaps(struct kvm_vcpu *vcpu)
{
	int nmaps = 0;
	int i, j, k;

	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
		struct kvm_rmap_desc *d;

		for (j = 0; j < m->npages; ++j) {
			struct page *page = m->phys_mem[j];

			if (!page->private)
				continue;
			if (!(page->private & 1)) {
				++nmaps;
				continue;
			}
			d = (struct kvm_rmap_desc *)(page->private & ~1ul);
			while (d) {
				for (k = 0; k < RMAP_EXT; ++k)
					if (d->shadow_ptes[k])
						++nmaps;
				d = d->more;
			}
		}
	}
	return nmaps;
}

static int count_writable_mappings(struct kvm_vcpu *vcpu)
{
	int nmaps = 0;
	struct kvm_mmu_page *page;
	int i;

	list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
		u64 *pt = page->spt;

		if (page->role.level != PT_PAGE_TABLE_LEVEL)
			continue;

		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
			u64 ent = pt[i];

			if (!(ent & PT_PRESENT_MASK))
				continue;
			if (!(ent & PT_WRITABLE_MASK))
				continue;
			++nmaps;
		}
	}
	return nmaps;
}

static void audit_rmap(struct kvm_vcpu *vcpu)
{
	int n_rmap = count_rmaps(vcpu);
	int n_actual = count_writable_mappings(vcpu);

	if (n_rmap != n_actual)
		printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
		       __FUNCTION__, audit_msg, n_rmap, n_actual);
}

static void audit_write_protection(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *page;

	list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
		if (page->role.metaphysical)
			continue;

		hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
			>> PAGE_SHIFT;
		pg = pfn_to_page(hfn);
		if (pg->private)
			printk(KERN_ERR "%s: (%s) shadow page has writable"
			       " mappings: gfn %lx role %x\n",
			       __FUNCTION__, audit_msg, page->gfn,
			       page->role.word);
	}
}

static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
{
	audit_msg = msg;
	audit_rmap(vcpu);
	audit_write_protection(vcpu);
	audit_mappings(vcpu);
}

#endif