KVM: MMU: Move set_pte_common() to pte width dependent code
[linux-2.6/libata-dev.git] drivers/kvm/mmu.c
blob a7631502f22bd04cdcb157afc6d25ac8491ff219
1 /*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * MMU support
8 *
9 * Copyright (C) 2006 Qumranet, Inc.
10 *
11 * Authors:
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 */
19 #include <linux/types.h>
20 #include <linux/string.h>
21 #include <asm/page.h>
22 #include <linux/mm.h>
23 #include <linux/highmem.h>
24 #include <linux/module.h>
26 #include "vmx.h"
27 #include "kvm.h"
29 #undef MMU_DEBUG
31 #undef AUDIT
33 #ifdef AUDIT
34 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
35 #else
36 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
37 #endif
39 #ifdef MMU_DEBUG
41 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
42 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
44 #else
46 #define pgprintk(x...) do { } while (0)
47 #define rmap_printk(x...) do { } while (0)
49 #endif
51 #if defined(MMU_DEBUG) || defined(AUDIT)
52 static int dbg = 1;
53 #endif
55 #ifndef MMU_DEBUG
56 #define ASSERT(x) do { } while (0)
57 #else
58 #define ASSERT(x) \
59 if (!(x)) { \
60 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
61 __FILE__, __LINE__, #x); \
62 }
63 #endif
65 #define PT64_PT_BITS 9
66 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
67 #define PT32_PT_BITS 10
68 #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
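/*
 * Illustration of the geometry above: 2^9 = 512 eight-byte entries fill
 * one 4KB page of a 64-bit page table, while 2^10 = 1024 four-byte
 * entries fill one 4KB page of a 32-bit page table; this mismatch is why
 * one 32-bit guest table is later shadowed by more than one shadow page
 * (see the quadrant handling in kvm_mmu_get_page()).
 */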
70 #define PT_WRITABLE_SHIFT 1
72 #define PT_PRESENT_MASK (1ULL << 0)
73 #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
74 #define PT_USER_MASK (1ULL << 2)
75 #define PT_PWT_MASK (1ULL << 3)
76 #define PT_PCD_MASK (1ULL << 4)
77 #define PT_ACCESSED_MASK (1ULL << 5)
78 #define PT_DIRTY_MASK (1ULL << 6)
79 #define PT_PAGE_SIZE_MASK (1ULL << 7)
80 #define PT_PAT_MASK (1ULL << 7)
81 #define PT_GLOBAL_MASK (1ULL << 8)
82 #define PT64_NX_MASK (1ULL << 63)
84 #define PT_PAT_SHIFT 7
85 #define PT_DIR_PAT_SHIFT 12
86 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
88 #define PT32_DIR_PSE36_SIZE 4
89 #define PT32_DIR_PSE36_SHIFT 13
90 #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
93 #define PT32_PTE_COPY_MASK \
94 (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)
96 #define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)
98 #define PT_FIRST_AVAIL_BITS_SHIFT 9
99 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
101 #define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
102 #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
104 #define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
105 #define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)
107 #define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
108 #define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))
110 #define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)
112 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
114 #define PT64_LEVEL_BITS 9
116 #define PT64_LEVEL_SHIFT(level) \
117 ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
119 #define PT64_LEVEL_MASK(level) \
120 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
122 #define PT64_INDEX(address, level)\
123 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
126 #define PT32_LEVEL_BITS 10
128 #define PT32_LEVEL_SHIFT(level) \
129 ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
131 #define PT32_LEVEL_MASK(level) \
132 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
134 #define PT32_INDEX(address, level)\
135 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
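/*
 * Worked example for the index macros (address chosen for illustration):
 * for gva 0x00403000, PT64_INDEX(gva, 1) = (0x00403000 >> 12) & 511 = 3
 * and PT64_INDEX(gva, 2) = (0x00403000 >> 21) & 511 = 2; each level
 * consumes another PT64_LEVEL_BITS of the address, starting at PAGE_SHIFT.
 */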
138 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
139 #define PT64_DIR_BASE_ADDR_MASK \
140 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
142 #define PT32_BASE_ADDR_MASK PAGE_MASK
143 #define PT32_DIR_BASE_ADDR_MASK \
144 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
147 #define PFERR_PRESENT_MASK (1U << 0)
148 #define PFERR_WRITE_MASK (1U << 1)
149 #define PFERR_USER_MASK (1U << 2)
150 #define PFERR_FETCH_MASK (1U << 4)
152 #define PT64_ROOT_LEVEL 4
153 #define PT32_ROOT_LEVEL 2
154 #define PT32E_ROOT_LEVEL 3
156 #define PT_DIRECTORY_LEVEL 2
157 #define PT_PAGE_TABLE_LEVEL 1
159 #define RMAP_EXT 4
161 struct kvm_rmap_desc {
162 u64 *shadow_ptes[RMAP_EXT];
163 struct kvm_rmap_desc *more;
164 };
166 static struct kmem_cache *pte_chain_cache;
167 static struct kmem_cache *rmap_desc_cache;
168 static struct kmem_cache *mmu_page_cache;
169 static struct kmem_cache *mmu_page_header_cache;
171 static int is_write_protection(struct kvm_vcpu *vcpu)
173 return vcpu->cr0 & CR0_WP_MASK;
176 static int is_cpuid_PSE36(void)
178 return 1;
181 static int is_nx(struct kvm_vcpu *vcpu)
183 return vcpu->shadow_efer & EFER_NX;
186 static int is_present_pte(unsigned long pte)
188 return pte & PT_PRESENT_MASK;
191 static int is_writeble_pte(unsigned long pte)
193 return pte & PT_WRITABLE_MASK;
196 static int is_io_pte(unsigned long pte)
198 return pte & PT_SHADOW_IO_MARK;
201 static int is_rmap_pte(u64 pte)
203 return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
204 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
207 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
208 struct kmem_cache *base_cache, int min,
209 gfp_t gfp_flags)
211 void *obj;
213 if (cache->nobjs >= min)
214 return 0;
215 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
216 obj = kmem_cache_zalloc(base_cache, gfp_flags);
217 if (!obj)
218 return -ENOMEM;
219 cache->objects[cache->nobjs++] = obj;
221 return 0;
224 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
226 while (mc->nobjs)
227 kfree(mc->objects[--mc->nobjs]);
230 static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
232 int r;
234 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
235 pte_chain_cache, 4, gfp_flags);
236 if (r)
237 goto out;
238 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
239 rmap_desc_cache, 1, gfp_flags);
240 if (r)
241 goto out;
242 r = mmu_topup_memory_cache(&vcpu->mmu_page_cache,
243 mmu_page_cache, 4, gfp_flags);
244 if (r)
245 goto out;
246 r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
247 mmu_page_header_cache, 4, gfp_flags);
248 out:
249 return r;
252 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
254 int r;
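/*
 * Try an atomic refill first (GFP_NOWAIT), since the caller holds
 * kvm->lock; only if that fails are the lock and the vcpu dropped so
 * the GFP_KERNEL retry may sleep.
 */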
256 r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT);
257 if (r < 0) {
258 spin_unlock(&vcpu->kvm->lock);
259 kvm_arch_ops->vcpu_put(vcpu);
260 r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
261 kvm_arch_ops->vcpu_load(vcpu);
262 spin_lock(&vcpu->kvm->lock);
264 return r;
267 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
269 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
270 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
271 mmu_free_memory_cache(&vcpu->mmu_page_cache);
272 mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
275 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
276 size_t size)
278 void *p;
280 BUG_ON(!mc->nobjs);
281 p = mc->objects[--mc->nobjs];
282 memset(p, 0, size);
283 return p;
286 static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj)
288 if (mc->nobjs < KVM_NR_MEM_OBJS)
289 mc->objects[mc->nobjs++] = obj;
290 else
291 kfree(obj);
294 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
296 return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
297 sizeof(struct kvm_pte_chain));
300 static void mmu_free_pte_chain(struct kvm_vcpu *vcpu,
301 struct kvm_pte_chain *pc)
303 mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc);
306 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
308 return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
309 sizeof(struct kvm_rmap_desc));
312 static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu,
313 struct kvm_rmap_desc *rd)
315 mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd);
318 /*
319 * Reverse mapping data structures:
320 *
321 * If page->private bit zero is zero, then page->private points to the
322 * shadow page table entry that points to page_address(page).
323 *
324 * If page->private bit zero is one, (then page->private & ~1) points
325 * to a struct kvm_rmap_desc containing more mappings.
326 */
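/*
 * For illustration, the low-bit tagging described above:
 *
 *	page_private(page) == 0			-> no reverse mapping
 *	page_private(page) == (ulong)spte	-> exactly one spte maps the page
 *	page_private(page) == (ulong)desc | 1	-> chain of kvm_rmap_desc,
 *						   each holding up to RMAP_EXT sptes
 *
 * rmap_add() below walks these three states in order (0 -> 1 -> many).
 */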
327 static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
329 struct page *page;
330 struct kvm_rmap_desc *desc;
331 int i;
333 if (!is_rmap_pte(*spte))
334 return;
335 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
336 if (!page_private(page)) {
337 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
338 set_page_private(page,(unsigned long)spte);
339 } else if (!(page_private(page) & 1)) {
340 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
341 desc = mmu_alloc_rmap_desc(vcpu);
342 desc->shadow_ptes[0] = (u64 *)page_private(page);
343 desc->shadow_ptes[1] = spte;
344 set_page_private(page,(unsigned long)desc | 1);
345 } else {
346 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
347 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
348 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
349 desc = desc->more;
350 if (desc->shadow_ptes[RMAP_EXT-1]) {
351 desc->more = mmu_alloc_rmap_desc(vcpu);
352 desc = desc->more;
354 for (i = 0; desc->shadow_ptes[i]; ++i)
355 ;
356 desc->shadow_ptes[i] = spte;
360 static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu,
361 struct page *page,
362 struct kvm_rmap_desc *desc,
363 int i,
364 struct kvm_rmap_desc *prev_desc)
366 int j;
368 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
369 ;
370 desc->shadow_ptes[i] = desc->shadow_ptes[j];
371 desc->shadow_ptes[j] = NULL;
372 if (j != 0)
373 return;
374 if (!prev_desc && !desc->more)
375 set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
376 else
377 if (prev_desc)
378 prev_desc->more = desc->more;
379 else
380 set_page_private(page,(unsigned long)desc->more | 1);
381 mmu_free_rmap_desc(vcpu, desc);
384 static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte)
386 struct page *page;
387 struct kvm_rmap_desc *desc;
388 struct kvm_rmap_desc *prev_desc;
389 int i;
391 if (!is_rmap_pte(*spte))
392 return;
393 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
394 if (!page_private(page)) {
395 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
396 BUG();
397 } else if (!(page_private(page) & 1)) {
398 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
399 if ((u64 *)page_private(page) != spte) {
400 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
401 spte, *spte);
402 BUG();
404 set_page_private(page,0);
405 } else {
406 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
407 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
408 prev_desc = NULL;
409 while (desc) {
410 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
411 if (desc->shadow_ptes[i] == spte) {
412 rmap_desc_remove_entry(vcpu, page,
413 desc, i,
414 prev_desc);
415 return;
417 prev_desc = desc;
418 desc = desc->more;
420 BUG();
424 static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
426 struct kvm *kvm = vcpu->kvm;
427 struct page *page;
428 struct kvm_rmap_desc *desc;
429 u64 *spte;
431 page = gfn_to_page(kvm, gfn);
432 BUG_ON(!page);
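/*
 * Each pass pulls the first writable spte for this gfn out of the rmap
 * (rmap_remove() below) and clears its write bit, so the loop ends once
 * page_private(page) reads zero, i.e. no writable mapping of the gfn
 * is left.
 */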
434 while (page_private(page)) {
435 if (!(page_private(page) & 1))
436 spte = (u64 *)page_private(page);
437 else {
438 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
439 spte = desc->shadow_ptes[0];
441 BUG_ON(!spte);
442 BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
443 != page_to_pfn(page));
444 BUG_ON(!(*spte & PT_PRESENT_MASK));
445 BUG_ON(!(*spte & PT_WRITABLE_MASK));
446 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
447 rmap_remove(vcpu, spte);
448 kvm_arch_ops->tlb_flush(vcpu);
449 *spte &= ~(u64)PT_WRITABLE_MASK;
453 #ifdef MMU_DEBUG
454 static int is_empty_shadow_page(u64 *spt)
456 u64 *pos;
457 u64 *end;
459 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
460 if (*pos != 0) {
461 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
462 pos, *pos);
463 return 0;
465 return 1;
467 #endif
469 static void kvm_mmu_free_page(struct kvm_vcpu *vcpu,
470 struct kvm_mmu_page *page_head)
472 ASSERT(is_empty_shadow_page(page_head->spt));
473 list_del(&page_head->link);
474 mmu_memory_cache_free(&vcpu->mmu_page_cache, page_head->spt);
475 mmu_memory_cache_free(&vcpu->mmu_page_header_cache, page_head);
476 ++vcpu->kvm->n_free_mmu_pages;
479 static unsigned kvm_page_table_hashfn(gfn_t gfn)
481 return gfn;
484 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
485 u64 *parent_pte)
487 struct kvm_mmu_page *page;
489 if (!vcpu->kvm->n_free_mmu_pages)
490 return NULL;
492 page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
493 sizeof *page);
494 page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
495 set_page_private(virt_to_page(page->spt), (unsigned long)page);
496 list_add(&page->link, &vcpu->kvm->active_mmu_pages);
497 ASSERT(is_empty_shadow_page(page->spt));
498 page->slot_bitmap = 0;
499 page->multimapped = 0;
500 page->parent_pte = parent_pte;
501 --vcpu->kvm->n_free_mmu_pages;
502 return page;
505 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
506 struct kvm_mmu_page *page, u64 *parent_pte)
508 struct kvm_pte_chain *pte_chain;
509 struct hlist_node *node;
510 int i;
512 if (!parent_pte)
513 return;
514 if (!page->multimapped) {
515 u64 *old = page->parent_pte;
517 if (!old) {
518 page->parent_pte = parent_pte;
519 return;
521 page->multimapped = 1;
522 pte_chain = mmu_alloc_pte_chain(vcpu);
523 INIT_HLIST_HEAD(&page->parent_ptes);
524 hlist_add_head(&pte_chain->link, &page->parent_ptes);
525 pte_chain->parent_ptes[0] = old;
527 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
528 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
529 continue;
530 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
531 if (!pte_chain->parent_ptes[i]) {
532 pte_chain->parent_ptes[i] = parent_pte;
533 return;
536 pte_chain = mmu_alloc_pte_chain(vcpu);
537 BUG_ON(!pte_chain);
538 hlist_add_head(&pte_chain->link, &page->parent_ptes);
539 pte_chain->parent_ptes[0] = parent_pte;
542 static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu,
543 struct kvm_mmu_page *page,
544 u64 *parent_pte)
546 struct kvm_pte_chain *pte_chain;
547 struct hlist_node *node;
548 int i;
550 if (!page->multimapped) {
551 BUG_ON(page->parent_pte != parent_pte);
552 page->parent_pte = NULL;
553 return;
555 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
556 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
557 if (!pte_chain->parent_ptes[i])
558 break;
559 if (pte_chain->parent_ptes[i] != parent_pte)
560 continue;
561 while (i + 1 < NR_PTE_CHAIN_ENTRIES
562 && pte_chain->parent_ptes[i + 1]) {
563 pte_chain->parent_ptes[i]
564 = pte_chain->parent_ptes[i + 1];
565 ++i;
567 pte_chain->parent_ptes[i] = NULL;
568 if (i == 0) {
569 hlist_del(&pte_chain->link);
570 mmu_free_pte_chain(vcpu, pte_chain);
571 if (hlist_empty(&page->parent_ptes)) {
572 page->multimapped = 0;
573 page->parent_pte = NULL;
576 return;
578 BUG();
581 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
582 gfn_t gfn)
584 unsigned index;
585 struct hlist_head *bucket;
586 struct kvm_mmu_page *page;
587 struct hlist_node *node;
589 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
590 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
591 bucket = &vcpu->kvm->mmu_page_hash[index];
592 hlist_for_each_entry(page, node, bucket, hash_link)
593 if (page->gfn == gfn && !page->role.metaphysical) {
594 pgprintk("%s: found role %x\n",
595 __FUNCTION__, page->role.word);
596 return page;
598 return NULL;
601 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
602 gfn_t gfn,
603 gva_t gaddr,
604 unsigned level,
605 int metaphysical,
606 unsigned hugepage_access,
607 u64 *parent_pte)
609 union kvm_mmu_page_role role;
610 unsigned index;
611 unsigned quadrant;
612 struct hlist_head *bucket;
613 struct kvm_mmu_page *page;
614 struct hlist_node *node;
616 role.word = 0;
617 role.glevels = vcpu->mmu.root_level;
618 role.level = level;
619 role.metaphysical = metaphysical;
620 role.hugepage_access = hugepage_access;
621 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
622 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
623 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
624 role.quadrant = quadrant;
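/*
 * Example of the quadrant split: a 32-bit guest page table has 1024
 * entries but a shadow page holds only 512, so a level-1 guest table is
 * shadowed by two pages (quadrants 0 and 1) and the 32-bit root
 * directory by four; the quadrant records which slice of the guest
 * table this shadow page covers, taken from the gaddr bits the wider
 * shadow entries cannot represent.
 */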
626 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
627 gfn, role.word);
628 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
629 bucket = &vcpu->kvm->mmu_page_hash[index];
630 hlist_for_each_entry(page, node, bucket, hash_link)
631 if (page->gfn == gfn && page->role.word == role.word) {
632 mmu_page_add_parent_pte(vcpu, page, parent_pte);
633 pgprintk("%s: found\n", __FUNCTION__);
634 return page;
636 page = kvm_mmu_alloc_page(vcpu, parent_pte);
637 if (!page)
638 return page;
639 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
640 page->gfn = gfn;
641 page->role = role;
642 hlist_add_head(&page->hash_link, bucket);
643 if (!metaphysical)
644 rmap_write_protect(vcpu, gfn);
645 return page;
648 static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
649 struct kvm_mmu_page *page)
651 unsigned i;
652 u64 *pt;
653 u64 ent;
655 pt = page->spt;
657 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
658 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
659 if (pt[i] & PT_PRESENT_MASK)
660 rmap_remove(vcpu, &pt[i]);
661 pt[i] = 0;
663 kvm_arch_ops->tlb_flush(vcpu);
664 return;
667 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
668 ent = pt[i];
670 pt[i] = 0;
671 if (!(ent & PT_PRESENT_MASK))
672 continue;
673 ent &= PT64_BASE_ADDR_MASK;
674 mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
678 static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
679 struct kvm_mmu_page *page,
680 u64 *parent_pte)
682 mmu_page_remove_parent_pte(vcpu, page, parent_pte);
685 static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
686 struct kvm_mmu_page *page)
688 u64 *parent_pte;
690 while (page->multimapped || page->parent_pte) {
691 if (!page->multimapped)
692 parent_pte = page->parent_pte;
693 else {
694 struct kvm_pte_chain *chain;
696 chain = container_of(page->parent_ptes.first,
697 struct kvm_pte_chain, link);
698 parent_pte = chain->parent_ptes[0];
700 BUG_ON(!parent_pte);
701 kvm_mmu_put_page(vcpu, page, parent_pte);
702 *parent_pte = 0;
704 kvm_mmu_page_unlink_children(vcpu, page);
705 if (!page->root_count) {
706 hlist_del(&page->hash_link);
707 kvm_mmu_free_page(vcpu, page);
708 } else
709 list_move(&page->link, &vcpu->kvm->active_mmu_pages);
712 static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
714 unsigned index;
715 struct hlist_head *bucket;
716 struct kvm_mmu_page *page;
717 struct hlist_node *node, *n;
718 int r;
720 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
721 r = 0;
722 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
723 bucket = &vcpu->kvm->mmu_page_hash[index];
724 hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
725 if (page->gfn == gfn && !page->role.metaphysical) {
726 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
727 page->role.word);
728 kvm_mmu_zap_page(vcpu, page);
729 r = 1;
731 return r;
734 static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
736 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
737 struct kvm_mmu_page *page_head = page_header(__pa(pte));
739 __set_bit(slot, &page_head->slot_bitmap);
742 hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
744 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
746 return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
749 hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
751 struct page *page;
753 ASSERT((gpa & HPA_ERR_MASK) == 0);
754 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
755 if (!page)
756 return gpa | HPA_ERR_MASK;
757 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
758 | (gpa & (PAGE_SIZE-1));
761 hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
763 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
765 if (gpa == UNMAPPED_GVA)
766 return UNMAPPED_GVA;
767 return gpa_to_hpa(vcpu, gpa);
770 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
772 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
774 if (gpa == UNMAPPED_GVA)
775 return NULL;
776 return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
779 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
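/*
 * With guest paging disabled, gva == gpa (see nonpaging_gva_to_gpa()
 * below), so nonpaging_map() only needs to walk the shadow hierarchy
 * from root_hpa down to level 1, allocating intermediate "metaphysical"
 * shadow pages on demand, and install a present/writable/user mapping
 * to the host page backing the guest frame.
 */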
783 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
785 int level = PT32E_ROOT_LEVEL;
786 hpa_t table_addr = vcpu->mmu.root_hpa;
788 for (; ; level--) {
789 u32 index = PT64_INDEX(v, level);
790 u64 *table;
791 u64 pte;
793 ASSERT(VALID_PAGE(table_addr));
794 table = __va(table_addr);
796 if (level == 1) {
797 pte = table[index];
798 if (is_present_pte(pte) && is_writeble_pte(pte))
799 return 0;
800 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
801 page_header_update_slot(vcpu->kvm, table, v);
802 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
803 PT_USER_MASK;
804 rmap_add(vcpu, &table[index]);
805 return 0;
808 if (table[index] == 0) {
809 struct kvm_mmu_page *new_table;
810 gfn_t pseudo_gfn;
812 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
813 >> PAGE_SHIFT;
814 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
815 v, level - 1,
816 1, 0, &table[index]);
817 if (!new_table) {
818 pgprintk("nonpaging_map: ENOMEM\n");
819 return -ENOMEM;
822 table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
823 | PT_WRITABLE_MASK | PT_USER_MASK;
825 table_addr = table[index] & PT64_BASE_ADDR_MASK;
829 static void mmu_free_roots(struct kvm_vcpu *vcpu)
831 int i;
832 struct kvm_mmu_page *page;
834 #ifdef CONFIG_X86_64
835 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
836 hpa_t root = vcpu->mmu.root_hpa;
838 ASSERT(VALID_PAGE(root));
839 page = page_header(root);
840 --page->root_count;
841 vcpu->mmu.root_hpa = INVALID_PAGE;
842 return;
844 #endif
845 for (i = 0; i < 4; ++i) {
846 hpa_t root = vcpu->mmu.pae_root[i];
848 if (root) {
849 ASSERT(VALID_PAGE(root));
850 root &= PT64_BASE_ADDR_MASK;
851 page = page_header(root);
852 --page->root_count;
854 vcpu->mmu.pae_root[i] = INVALID_PAGE;
856 vcpu->mmu.root_hpa = INVALID_PAGE;
859 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
861 int i;
862 gfn_t root_gfn;
863 struct kvm_mmu_page *page;
865 root_gfn = vcpu->cr3 >> PAGE_SHIFT;
867 #ifdef CONFIG_X86_64
868 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
869 hpa_t root = vcpu->mmu.root_hpa;
871 ASSERT(!VALID_PAGE(root));
872 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
873 PT64_ROOT_LEVEL, 0, 0, NULL);
874 root = __pa(page->spt);
875 ++page->root_count;
876 vcpu->mmu.root_hpa = root;
877 return;
879 #endif
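/*
 * Without long mode the shadow uses four roots, one per 1GB slice of the
 * guest address space (hence the i << 30 passed to kvm_mmu_get_page()
 * below); pae_root[] is a software table whose entries point at those
 * roots, and root_hpa ends up pointing at pae_root itself.
 */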
880 for (i = 0; i < 4; ++i) {
881 hpa_t root = vcpu->mmu.pae_root[i];
883 ASSERT(!VALID_PAGE(root));
884 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
885 if (!is_present_pte(vcpu->pdptrs[i])) {
886 vcpu->mmu.pae_root[i] = 0;
887 continue;
889 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
890 } else if (vcpu->mmu.root_level == 0)
891 root_gfn = 0;
892 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
893 PT32_ROOT_LEVEL, !is_paging(vcpu),
894 0, NULL);
895 root = __pa(page->spt);
896 ++page->root_count;
897 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
899 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
902 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
904 return vaddr;
907 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
908 u32 error_code)
910 gpa_t addr = gva;
911 hpa_t paddr;
912 int r;
914 r = mmu_topup_memory_caches(vcpu);
915 if (r)
916 return r;
918 ASSERT(vcpu);
919 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
922 paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
924 if (is_error_hpa(paddr))
925 return 1;
927 return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
930 static void nonpaging_free(struct kvm_vcpu *vcpu)
932 mmu_free_roots(vcpu);
935 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
937 struct kvm_mmu *context = &vcpu->mmu;
939 context->new_cr3 = nonpaging_new_cr3;
940 context->page_fault = nonpaging_page_fault;
941 context->gva_to_gpa = nonpaging_gva_to_gpa;
942 context->free = nonpaging_free;
943 context->root_level = 0;
944 context->shadow_root_level = PT32E_ROOT_LEVEL;
945 mmu_alloc_roots(vcpu);
946 ASSERT(VALID_PAGE(context->root_hpa));
947 kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
948 return 0;
951 static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
953 ++vcpu->stat.tlb_flush;
954 kvm_arch_ops->tlb_flush(vcpu);
957 static void paging_new_cr3(struct kvm_vcpu *vcpu)
959 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
960 mmu_free_roots(vcpu);
961 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
962 kvm_mmu_free_some_pages(vcpu);
963 mmu_alloc_roots(vcpu);
964 kvm_mmu_flush_tlb(vcpu);
965 kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
968 static void inject_page_fault(struct kvm_vcpu *vcpu,
969 u64 addr,
970 u32 err_code)
972 kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
975 static inline int fix_read_pf(u64 *shadow_ent)
977 if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
978 !(*shadow_ent & PT_USER_MASK)) {
979 /*
980 * If supervisor write protect is disabled, we shadow kernel
981 * pages as user pages so we can trap the write access.
982 */
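/*
 * (What the fixup below does: the guest marked this page as a user page
 * -- PT_SHADOW_USER_MASK -- but the shadow entry is currently
 * supervisor-only, so the user bit is put back and write access dropped;
 * the next write will fault again and be re-examined by the write fault
 * path.)
 */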
983 *shadow_ent |= PT_USER_MASK;
984 *shadow_ent &= ~PT_WRITABLE_MASK;
986 return 1;
989 return 0;
992 static void paging_free(struct kvm_vcpu *vcpu)
994 nonpaging_free(vcpu);
997 #define PTTYPE 64
998 #include "paging_tmpl.h"
999 #undef PTTYPE
1001 #define PTTYPE 32
1002 #include "paging_tmpl.h"
1003 #undef PTTYPE
1005 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1007 struct kvm_mmu *context = &vcpu->mmu;
1009 ASSERT(is_pae(vcpu));
1010 context->new_cr3 = paging_new_cr3;
1011 context->page_fault = paging64_page_fault;
1012 context->gva_to_gpa = paging64_gva_to_gpa;
1013 context->free = paging_free;
1014 context->root_level = level;
1015 context->shadow_root_level = level;
1016 mmu_alloc_roots(vcpu);
1017 ASSERT(VALID_PAGE(context->root_hpa));
1018 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
1019 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
1020 return 0;
1023 static int paging64_init_context(struct kvm_vcpu *vcpu)
1025 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1028 static int paging32_init_context(struct kvm_vcpu *vcpu)
1030 struct kvm_mmu *context = &vcpu->mmu;
1032 context->new_cr3 = paging_new_cr3;
1033 context->page_fault = paging32_page_fault;
1034 context->gva_to_gpa = paging32_gva_to_gpa;
1035 context->free = paging_free;
1036 context->root_level = PT32_ROOT_LEVEL;
1037 context->shadow_root_level = PT32E_ROOT_LEVEL;
1038 mmu_alloc_roots(vcpu);
1039 ASSERT(VALID_PAGE(context->root_hpa));
1040 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
1041 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
1042 return 0;
1045 static int paging32E_init_context(struct kvm_vcpu *vcpu)
1047 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1050 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1052 ASSERT(vcpu);
1053 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1055 mmu_topup_memory_caches(vcpu);
1056 if (!is_paging(vcpu))
1057 return nonpaging_init_context(vcpu);
1058 else if (is_long_mode(vcpu))
1059 return paging64_init_context(vcpu);
1060 else if (is_pae(vcpu))
1061 return paging32E_init_context(vcpu);
1062 else
1063 return paging32_init_context(vcpu);
1066 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1068 ASSERT(vcpu);
1069 if (VALID_PAGE(vcpu->mmu.root_hpa)) {
1070 vcpu->mmu.free(vcpu);
1071 vcpu->mmu.root_hpa = INVALID_PAGE;
1075 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1077 int r;
1079 destroy_kvm_mmu(vcpu);
1080 r = init_kvm_mmu(vcpu);
1081 if (r < 0)
1082 goto out;
1083 r = mmu_topup_memory_caches(vcpu);
1084 out:
1085 return r;
1088 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1089 struct kvm_mmu_page *page,
1090 u64 *spte)
1092 u64 pte;
1093 struct kvm_mmu_page *child;
1095 pte = *spte;
1096 if (is_present_pte(pte)) {
1097 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1098 rmap_remove(vcpu, spte);
1099 else {
1100 child = page_header(pte & PT64_BASE_ADDR_MASK);
1101 mmu_page_remove_parent_pte(vcpu, child, spte);
1104 *spte = 0;
1107 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1108 struct kvm_mmu_page *page,
1109 u64 *spte,
1110 const void *new, int bytes)
1112 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1113 return;
1115 if (page->role.glevels == PT32_ROOT_LEVEL)
1116 paging32_update_pte(vcpu, page, spte, new, bytes);
1117 else
1118 paging64_update_pte(vcpu, page, spte, new, bytes);
1121 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1122 const u8 *old, const u8 *new, int bytes)
1124 gfn_t gfn = gpa >> PAGE_SHIFT;
1125 struct kvm_mmu_page *page;
1126 struct hlist_node *node, *n;
1127 struct hlist_head *bucket;
1128 unsigned index;
1129 u64 *spte;
1130 unsigned offset = offset_in_page(gpa);
1131 unsigned pte_size;
1132 unsigned page_offset;
1133 unsigned misaligned;
1134 unsigned quadrant;
1135 int level;
1136 int flooded = 0;
1137 int npte;
1139 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1140 if (gfn == vcpu->last_pt_write_gfn) {
1141 ++vcpu->last_pt_write_count;
1142 if (vcpu->last_pt_write_count >= 3)
1143 flooded = 1;
1144 } else {
1145 vcpu->last_pt_write_gfn = gfn;
1146 vcpu->last_pt_write_count = 1;
1148 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1149 bucket = &vcpu->kvm->mmu_page_hash[index];
1150 hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1151 if (page->gfn != gfn || page->role.metaphysical)
1152 continue;
1153 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1154 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1155 misaligned |= bytes < 4;
1156 if (misaligned || flooded) {
1157 /*
1158 * Misaligned accesses are too much trouble to fix
1159 * up; also, they usually indicate a page is not used
1160 * as a page table.
1161 *
1162 * If we're seeing too many writes to a page,
1163 * it may no longer be a page table, or we may be
1164 * forking, in which case it is better to unmap the
1165 * page.
1166 */
1167 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1168 gpa, bytes, page->role.word);
1169 kvm_mmu_zap_page(vcpu, page);
1170 continue;
1172 page_offset = offset;
1173 level = page->role.level;
1174 npte = 1;
1175 if (page->role.glevels == PT32_ROOT_LEVEL) {
1176 page_offset <<= 1; /* 32->64 */
1177 /*
1178 * A 32-bit pde maps 4MB while the shadow pdes map
1179 * only 2MB. So we need to double the offset again
1180 * and zap two pdes instead of one.
1181 */
1182 if (level == PT32_ROOT_LEVEL) {
1183 page_offset &= ~7; /* kill rounding error */
1184 page_offset <<= 1;
1185 npte = 2;
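/*
 * E.g. (numbers for illustration): a 32-bit guest pde at byte offset
 * 0x10 (index 4, mapping gva 16MB-20MB) becomes, after the two
 * doublings above, shadow byte offset 0x40, so shadow pdes 8 and 9 --
 * 2MB each, together covering 16MB-20MB -- are the two entries zapped.
 */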
1187 quadrant = page_offset >> PAGE_SHIFT;
1188 page_offset &= ~PAGE_MASK;
1189 if (quadrant != page->role.quadrant)
1190 continue;
1192 spte = &page->spt[page_offset / sizeof(*spte)];
1193 while (npte--) {
1194 mmu_pte_write_zap_pte(vcpu, page, spte);
1195 mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
1196 ++spte;
1201 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1203 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1205 return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
1208 void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1210 while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
1211 struct kvm_mmu_page *page;
1213 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1214 struct kvm_mmu_page, link);
1215 kvm_mmu_zap_page(vcpu, page);
1218 EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
1220 static void free_mmu_pages(struct kvm_vcpu *vcpu)
1222 struct kvm_mmu_page *page;
1224 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1225 page = container_of(vcpu->kvm->active_mmu_pages.next,
1226 struct kvm_mmu_page, link);
1227 kvm_mmu_zap_page(vcpu, page);
1229 free_page((unsigned long)vcpu->mmu.pae_root);
1232 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1234 struct page *page;
1235 int i;
1237 ASSERT(vcpu);
1239 vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
1241 /*
1242 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1243 * Therefore we need to allocate shadow page tables in the first
1244 * 4GB of memory, which happens to fit the DMA32 zone.
1245 */
1246 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1247 if (!page)
1248 goto error_1;
1249 vcpu->mmu.pae_root = page_address(page);
1250 for (i = 0; i < 4; ++i)
1251 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1253 return 0;
1255 error_1:
1256 free_mmu_pages(vcpu);
1257 return -ENOMEM;
1260 int kvm_mmu_create(struct kvm_vcpu *vcpu)
1262 ASSERT(vcpu);
1263 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1265 return alloc_mmu_pages(vcpu);
1268 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1270 ASSERT(vcpu);
1271 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1273 return init_kvm_mmu(vcpu);
1276 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1278 ASSERT(vcpu);
1280 destroy_kvm_mmu(vcpu);
1281 free_mmu_pages(vcpu);
1282 mmu_free_memory_caches(vcpu);
1285 void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
1287 struct kvm *kvm = vcpu->kvm;
1288 struct kvm_mmu_page *page;
1290 list_for_each_entry(page, &kvm->active_mmu_pages, link) {
1291 int i;
1292 u64 *pt;
1294 if (!test_bit(slot, &page->slot_bitmap))
1295 continue;
1297 pt = page->spt;
1298 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1299 /* avoid RMW */
1300 if (pt[i] & PT_WRITABLE_MASK) {
1301 rmap_remove(vcpu, &pt[i]);
1302 pt[i] &= ~PT_WRITABLE_MASK;
1307 void kvm_mmu_zap_all(struct kvm_vcpu *vcpu)
1309 destroy_kvm_mmu(vcpu);
1311 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1312 struct kvm_mmu_page *page;
1314 page = container_of(vcpu->kvm->active_mmu_pages.next,
1315 struct kvm_mmu_page, link);
1316 kvm_mmu_zap_page(vcpu, page);
1319 mmu_free_memory_caches(vcpu);
1320 kvm_arch_ops->tlb_flush(vcpu);
1321 init_kvm_mmu(vcpu);
1324 void kvm_mmu_module_exit(void)
1326 if (pte_chain_cache)
1327 kmem_cache_destroy(pte_chain_cache);
1328 if (rmap_desc_cache)
1329 kmem_cache_destroy(rmap_desc_cache);
1330 if (mmu_page_cache)
1331 kmem_cache_destroy(mmu_page_cache);
1332 if (mmu_page_header_cache)
1333 kmem_cache_destroy(mmu_page_header_cache);
1336 int kvm_mmu_module_init(void)
1338 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1339 sizeof(struct kvm_pte_chain),
1340 0, 0, NULL, NULL);
1341 if (!pte_chain_cache)
1342 goto nomem;
1343 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1344 sizeof(struct kvm_rmap_desc),
1345 0, 0, NULL, NULL);
1346 if (!rmap_desc_cache)
1347 goto nomem;
1349 mmu_page_cache = kmem_cache_create("kvm_mmu_page",
1350 PAGE_SIZE,
1351 PAGE_SIZE, 0, NULL, NULL);
1352 if (!mmu_page_cache)
1353 goto nomem;
1355 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1356 sizeof(struct kvm_mmu_page),
1357 0, 0, NULL, NULL);
1358 if (!mmu_page_header_cache)
1359 goto nomem;
1361 return 0;
1363 nomem:
1364 kvm_mmu_module_exit();
1365 return -ENOMEM;
1368 #ifdef AUDIT
1370 static const char *audit_msg;
1372 static gva_t canonicalize(gva_t gva)
1374 #ifdef CONFIG_X86_64
1375 gva = (long long)(gva << 16) >> 16;
1376 #endif
1377 return gva;
1380 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1381 gva_t va, int level)
1383 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1384 int i;
1385 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1387 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1388 u64 ent = pt[i];
1390 if (!(ent & PT_PRESENT_MASK))
1391 continue;
1393 va = canonicalize(va);
1394 if (level > 1)
1395 audit_mappings_page(vcpu, ent, va, level - 1);
1396 else {
1397 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1398 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1400 if ((ent & PT_PRESENT_MASK)
1401 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1402 printk(KERN_ERR "audit error: (%s) levels %d"
1403 " gva %lx gpa %llx hpa %llx ent %llx\n",
1404 audit_msg, vcpu->mmu.root_level,
1405 va, gpa, hpa, ent);
1410 static void audit_mappings(struct kvm_vcpu *vcpu)
1412 unsigned i;
1414 if (vcpu->mmu.root_level == 4)
1415 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1416 else
1417 for (i = 0; i < 4; ++i)
1418 if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1419 audit_mappings_page(vcpu,
1420 vcpu->mmu.pae_root[i],
1421 i << 30,
1422 2);
1425 static int count_rmaps(struct kvm_vcpu *vcpu)
1427 int nmaps = 0;
1428 int i, j, k;
1430 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1431 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1432 struct kvm_rmap_desc *d;
1434 for (j = 0; j < m->npages; ++j) {
1435 struct page *page = m->phys_mem[j];
1437 if (!page->private)
1438 continue;
1439 if (!(page->private & 1)) {
1440 ++nmaps;
1441 continue;
1443 d = (struct kvm_rmap_desc *)(page->private & ~1ul);
1444 while (d) {
1445 for (k = 0; k < RMAP_EXT; ++k)
1446 if (d->shadow_ptes[k])
1447 ++nmaps;
1448 else
1449 break;
1450 d = d->more;
1454 return nmaps;
1457 static int count_writable_mappings(struct kvm_vcpu *vcpu)
1459 int nmaps = 0;
1460 struct kvm_mmu_page *page;
1461 int i;
1463 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1464 u64 *pt = page->spt;
1466 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1467 continue;
1469 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1470 u64 ent = pt[i];
1472 if (!(ent & PT_PRESENT_MASK))
1473 continue;
1474 if (!(ent & PT_WRITABLE_MASK))
1475 continue;
1476 ++nmaps;
1479 return nmaps;
1482 static void audit_rmap(struct kvm_vcpu *vcpu)
1484 int n_rmap = count_rmaps(vcpu);
1485 int n_actual = count_writable_mappings(vcpu);
1487 if (n_rmap != n_actual)
1488 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1489 __FUNCTION__, audit_msg, n_rmap, n_actual);
1492 static void audit_write_protection(struct kvm_vcpu *vcpu)
1494 struct kvm_mmu_page *page;
1496 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1497 hfn_t hfn;
1498 struct page *pg;
1500 if (page->role.metaphysical)
1501 continue;
1503 hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
1504 >> PAGE_SHIFT;
1505 pg = pfn_to_page(hfn);
1506 if (pg->private)
1507 printk(KERN_ERR "%s: (%s) shadow page has writable"
1508 " mappings: gfn %lx role %x\n",
1509 __FUNCTION__, audit_msg, page->gfn,
1510 page->role.word);
1514 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1516 int olddbg = dbg;
1518 dbg = 0;
1519 audit_msg = msg;
1520 audit_rmap(vcpu);
1521 audit_write_protection(vcpu);
1522 audit_mappings(vcpu);
1523 dbg = olddbg;
1526 #endif