/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */
#include "kvm.h"

#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <asm/processor.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <linux/reboot.h>
#include <asm/io.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <asm/desc.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>

#include "x86_emulate.h"
#include "segment_descriptor.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static spinlock_t kvm_lock = SPIN_LOCK_UNLOCKED;
static struct list_head vm_list = LIST_HEAD_INIT(vm_list);

struct kvm_arch_ops *kvm_arch_ops;
struct kvm_stat kvm_stat;
EXPORT_SYMBOL_GPL(kvm_stat);
static struct kvm_stats_debugfs_item {
	const char *name;
	u32 *data;
	struct dentry *dentry;
} debugfs_entries[] = {
	{ "pf_fixed", &kvm_stat.pf_fixed },
	{ "pf_guest", &kvm_stat.pf_guest },
	{ "tlb_flush", &kvm_stat.tlb_flush },
	{ "invlpg", &kvm_stat.invlpg },
	{ "exits", &kvm_stat.exits },
	{ "io_exits", &kvm_stat.io_exits },
	{ "mmio_exits", &kvm_stat.mmio_exits },
	{ "signal_exits", &kvm_stat.signal_exits },
	{ "irq_window", &kvm_stat.irq_window_exits },
	{ "halt_exits", &kvm_stat.halt_exits },
	{ "request_irq", &kvm_stat.request_irq_exits },
	{ "irq_exits", &kvm_stat.irq_exits },
	{ 0, 0 }
};

static struct dentry *debugfs_dir;
#define MAX_IO_MSRS 256

#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
#define LMSW_GUEST_MASK 0x0eULL
#define CR4_RESEVED_BITS (~((1ULL << 11) - 1))
#define CR8_RESEVED_BITS (~0x0fULL)
#define EFER_RESERVED_BITS 0xfffffffffffff2fe
/* LDT or TSS descriptor in the GDT. 16 bytes. */
struct segment_descriptor_64 {
	struct segment_descriptor s;
	u32 base_higher;
	u32 pad_zero;
};
unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct segment_descriptor *d;
	unsigned long table_base;
	typedef unsigned long ul;
	unsigned long v;

	if (selector == 0)
		return 0;

	asm ("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {           /* from ldt */
		u16 ldt_selector;

		asm ("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	d = (struct segment_descriptor *)(table_base + (selector & ~7));
	v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
#ifdef CONFIG_X86_64
	if (d->system == 0
	    && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);
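/*
 * Worked example (editorial addition, not in the original source): a
 * selector such as 0x0033 has RPL = 3 (bits 0-1) and TI = 0 (bit 2), so it
 * indexes the GDT; selector & ~7 = 0x30 is the byte offset of its 8-byte
 * descriptor within the table. With TI set (selector & 4) the descriptor
 * is looked up in the LDT instead, as in the recursion above.
 */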
static inline int valid_vcpu(int n)
{
	return likely(n >= 0 && n < KVM_MAX_VCPUS);
}
int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
		   void *dest)
{
	unsigned char *host_buf = dest;
	unsigned long req_size = size;

	while (size) {
		hpa_t paddr;
		unsigned now;
		unsigned offset;
		hva_t guest_buf;

		paddr = gva_to_hpa(vcpu, addr);

		if (is_error_hpa(paddr))
			break;

		guest_buf = (hva_t)kmap_atomic(
					pfn_to_page(paddr >> PAGE_SHIFT),
					KM_USER0);
		offset = addr & ~PAGE_MASK;
		guest_buf |= offset;
		now = min(size, PAGE_SIZE - offset);
		memcpy(host_buf, (void*)guest_buf, now);
		host_buf += now;
		addr += now;
		size -= now;
		kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
	}
	return req_size - size;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);
int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size,
		    void *data)
{
	unsigned char *host_buf = data;
	unsigned long req_size = size;

	while (size) {
		hpa_t paddr;
		unsigned now;
		unsigned offset;
		hva_t guest_buf;

		paddr = gva_to_hpa(vcpu, addr);

		if (is_error_hpa(paddr))
			break;

		guest_buf = (hva_t)kmap_atomic(
				pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0);
		offset = addr & ~PAGE_MASK;
		guest_buf |= offset;
		now = min(size, PAGE_SIZE - offset);
		memcpy((void*)guest_buf, host_buf, now);
		host_buf += now;
		addr += now;
		size -= now;
		kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0);
	}
	return req_size - size;
}
EXPORT_SYMBOL_GPL(kvm_write_guest);
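/*
 * Illustrative sketch (editorial addition, not in the original source):
 * how a caller might use kvm_read_guest(). The helper below is
 * hypothetical; the key point is that the return value is the number of
 * bytes actually copied, so a short read means translation failed part-way
 * through the range.
 */
#if 0
static int read_guest_u32_example(struct kvm_vcpu *vcpu, gva_t addr, u32 *val)
{
	/* A partial copy means a page in [addr, addr + 4) was unmapped. */
	if (kvm_read_guest(vcpu, addr, sizeof(*val), val) != sizeof(*val))
		return -EFAULT;
	return 0;
}
#endif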
static int vcpu_slot(struct kvm_vcpu *vcpu)
{
	return vcpu - vcpu->kvm->vcpus;
}
/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
static struct kvm_vcpu *vcpu_load(struct kvm *kvm, int vcpu_slot)
{
	struct kvm_vcpu *vcpu = &kvm->vcpus[vcpu_slot];

	mutex_lock(&vcpu->mutex);
	if (unlikely(!vcpu->vmcs)) {
		mutex_unlock(&vcpu->mutex);
		return NULL;
	}
	kvm_arch_ops->vcpu_load(vcpu);
	return vcpu;
}

static void vcpu_put(struct kvm_vcpu *vcpu)
{
	kvm_arch_ops->vcpu_put(vcpu);
	mutex_unlock(&vcpu->mutex);
}
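/*
 * Illustrative sketch (editorial addition, not in the original source):
 * the vcpu_load()/vcpu_put() bracket that the ioctl handlers below all
 * follow. vcpu_load() returns NULL for a slot whose vmcs was never set up,
 * so callers must check before touching the vcpu.
 */
#if 0
static int with_vcpu_example(struct kvm *kvm, int slot)
{
	struct kvm_vcpu *vcpu = vcpu_load(kvm, slot);

	if (!vcpu)
		return -ENOENT;
	/* ... operate on the loaded vcpu here ... */
	vcpu_put(vcpu);
	return 0;
}
#endif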
static int kvm_dev_open(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
	int i;

	if (!kvm)
		return -ENOMEM;

	spin_lock_init(&kvm->lock);
	INIT_LIST_HEAD(&kvm->active_mmu_pages);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		struct kvm_vcpu *vcpu = &kvm->vcpus[i];

		mutex_init(&vcpu->mutex);
		vcpu->cpu = -1;
		vcpu->kvm = kvm;
		vcpu->mmu.root_hpa = INVALID_PAGE;
		INIT_LIST_HEAD(&vcpu->free_pages);
	}
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);

	filp->private_data = kvm;
	return 0;
}
/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	int i;

	if (!dont || free->phys_mem != dont->phys_mem)
		if (free->phys_mem) {
			for (i = 0; i < free->npages; ++i)
				if (free->phys_mem[i])
					__free_page(free->phys_mem[i]);
			vfree(free->phys_mem);
		}

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

	free->phys_mem = 0;
	free->npages = 0;
	free->dirty_bitmap = 0;
}
static void kvm_free_physmem(struct kvm *kvm)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i)
		kvm_free_physmem_slot(&kvm->memslots[i], 0);
}
static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu_load(vcpu->kvm, vcpu_slot(vcpu)))
		return;

	kvm_mmu_destroy(vcpu);
	vcpu_put(vcpu);
	kvm_arch_ops->vcpu_free(vcpu);
}
static void kvm_free_vcpus(struct kvm *kvm)
{
	unsigned int i;

	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		kvm_free_vcpu(&kvm->vcpus[i]);
}
static int kvm_dev_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	kfree(kvm);
	return 0;
}
static void inject_gp(struct kvm_vcpu *vcpu)
{
	kvm_arch_ops->inject_gp(vcpu, 0);
}
/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	u64 pdpte;
	u64 *pdpt;
	int ret;
	struct kvm_memory_slot *memslot;

	spin_lock(&vcpu->kvm->lock);
	memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn);
	/* FIXME: !memslot - emulate? 0xff? */
	pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);

	ret = 1;
	for (i = 0; i < 4; ++i) {
		pdpte = pdpt[offset + i];
		if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
			ret = 0;
			goto out;
		}
	}

	for (i = 0; i < 4; ++i)
		vcpu->pdptrs[i] = pdpt[offset + i];

out:
	kunmap_atomic(pdpt, KM_USER0);
	spin_unlock(&vcpu->kvm->lock);

	return ret;
}
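/*
 * Worked example (editorial addition, not in the original source): in PAE
 * mode, cr3 bits 5-11 locate the 32-byte-aligned pdpt within its page.
 * For cr3 = 0x123456e0, offset = ((0x6e0 >> 5) << 2) = 0xdc, so with pdpt
 * declared as a u64 pointer, entry i sits at pdpt[0xdc + i], i.e. at byte
 * 0x6e0 of the mapped page.
 */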
void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESEVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->cr0);
		inject_gp(vcpu);
		return;
	}

	if ((cr0 & CR0_NW_MASK) && !(cr0 & CR0_CD_MASK)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		inject_gp(vcpu);
		return;
	}

	if ((cr0 & CR0_PG_MASK) && !(cr0 & CR0_PE_MASK)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		inject_gp(vcpu);
		return;
	}

	if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK)) {
#ifdef CONFIG_X86_64
		if ((vcpu->shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				inject_gp(vcpu);
				return;
			}
			kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				inject_gp(vcpu);
				return;
			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			inject_gp(vcpu);
			return;
		}
	}

	kvm_arch_ops->set_cr0(vcpu, cr0);
	vcpu->cr0 = cr0;

	spin_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	spin_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr0);
void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
	set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(lmsw);
void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & CR4_RESEVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		inject_gp(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (!(cr4 & CR4_PAE_MASK)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			inject_gp(vcpu);
			return;
		}
	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
		   && !load_pdptrs(vcpu, vcpu->cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		inject_gp(vcpu);
		return;
	}

	if (cr4 & CR4_VMXE_MASK) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		inject_gp(vcpu);
		return;
	}
	kvm_arch_ops->set_cr4(vcpu, cr4);
	spin_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	spin_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr4);
void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESEVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			inject_gp(vcpu);
			return;
		}
	} else {
		if (cr3 & CR3_RESEVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			inject_gp(vcpu);
			return;
		}
		if (is_paging(vcpu) && is_pae(vcpu) &&
		    !load_pdptrs(vcpu, cr3)) {
			printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
			       "reserved bits\n");
			inject_gp(vcpu);
			return;
		}
	}

	vcpu->cr3 = cr3;
	spin_lock(&vcpu->kvm->lock);
	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		inject_gp(vcpu);
	else
		vcpu->mmu.new_cr3(vcpu);
	spin_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr3);
void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESEVED_BITS) {
		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
		inject_gp(vcpu);
		return;
	}
	vcpu->cr8 = cr8;
}
EXPORT_SYMBOL_GPL(set_cr8);
void fx_init(struct kvm_vcpu *vcpu)
{
	struct __attribute__ ((__packed__)) fx_image_s {
		u16 control;	/* fcw */
		u16 status;	/* fsw */
		u16 tag;	/* ftw */
		u16 opcode;	/* fop */
		u64 ip;		/* fpu ip */
		u64 operand;	/* fpu dp */
		u32 mxcsr;
		u32 mxcsr_mask;
	} *fx_image;

	fx_save(vcpu->host_fx_image);
	fpu_init();
	fx_save(vcpu->guest_fx_image);
	fx_restore(vcpu->host_fx_image);

	fx_image = (struct fx_image_s *)vcpu->guest_fx_image;
	fx_image->mxcsr = 0x1f80;
	memset(vcpu->guest_fx_image + sizeof(struct fx_image_s),
	       0, FX_IMAGE_SIZE - sizeof(struct fx_image_s));
}
EXPORT_SYMBOL_GPL(fx_init);
/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
{
	int r;
	struct kvm_vcpu *vcpu;

	r = -EINVAL;
	if (!valid_vcpu(n))
		goto out;

	vcpu = &kvm->vcpus[n];

	mutex_lock(&vcpu->mutex);

	if (vcpu->vmcs) {
		mutex_unlock(&vcpu->mutex);
		return -EEXIST;
	}

	vcpu->host_fx_image = (char*)ALIGN((hva_t)vcpu->fx_buf,
					   FX_IMAGE_ALIGN);
	vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;

	r = kvm_arch_ops->vcpu_create(vcpu);
	if (r < 0)
		goto out_free_vcpus;

	r = kvm_mmu_create(vcpu);
	if (r < 0)
		goto out_free_vcpus;

	kvm_arch_ops->vcpu_load(vcpu);
	r = kvm_mmu_setup(vcpu);
	if (r >= 0)
		r = kvm_arch_ops->vcpu_setup(vcpu);
	vcpu_put(vcpu);

	if (r < 0)
		goto out_free_vcpus;

	return 0;

out_free_vcpus:
	kvm_free_vcpu(vcpu);
	mutex_unlock(&vcpu->mutex);
out:
	return r;
}
/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 */
static int kvm_dev_ioctl_set_memory_region(struct kvm *kvm,
					   struct kvm_memory_region *mem)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;
	int memory_config_version;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

raced:
	spin_lock(&kvm->lock);

	memory_config_version = kvm->memory_config_version;
	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_unlock;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_unlock;
	}
	/*
	 * Do memory allocations outside lock.  memory_config_version will
	 * detect any races.
	 */
	spin_unlock(&kvm->lock);

	/* Deallocate if slot is being removed */
	if (!npages)
		new.phys_mem = 0;

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = 0;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
	if (npages && !new.phys_mem) {
		new.phys_mem = vmalloc(npages * sizeof(struct page *));

		if (!new.phys_mem)
			goto out_free;

		memset(new.phys_mem, 0, npages * sizeof(struct page *));
		for (i = 0; i < npages; ++i) {
			new.phys_mem[i] = alloc_page(GFP_HIGHUSER
						     | __GFP_ZERO);
			if (!new.phys_mem[i])
				goto out_free;
			new.phys_mem[i]->private = 0;
		}
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
			goto out_free;
		memset(new.dirty_bitmap, 0, dirty_bytes);
	}

	spin_lock(&kvm->lock);

	if (memory_config_version != kvm->memory_config_version) {
		spin_unlock(&kvm->lock);
		kvm_free_physmem_slot(&new, &old);
		goto raced;
	}

	r = -EAGAIN;
	if (kvm->busy)
		goto out_unlock;

	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;

	*memslot = new;
	++kvm->memory_config_version;

	spin_unlock(&kvm->lock);

	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		struct kvm_vcpu *vcpu;

		vcpu = vcpu_load(kvm, i);
		if (!vcpu)
			continue;
		kvm_mmu_reset_context(vcpu);
		vcpu_put(vcpu);
	}

	kvm_free_physmem_slot(&old, &new);
	return 0;

out_unlock:
	spin_unlock(&kvm->lock);
out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;
}
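/*
 * Illustrative sketch (editorial addition, not in the original source):
 * how userspace might drive KVM_SET_MEMORY_REGION on this early /dev/kvm
 * interface. Field names match struct kvm_memory_region as used above;
 * the fd is assumed to be an open /dev/kvm descriptor and error handling
 * is elided.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_slot0_example(int kvm_fd, unsigned long long size)
{
	struct kvm_memory_region mem = {
		.slot = 0,
		.flags = 0,		/* or KVM_MEM_LOG_DIRTY_PAGES */
		.guest_phys_addr = 0,
		.memory_size = size,	/* must be page aligned */
	};

	return ioctl(kvm_fd, KVM_SET_MEMORY_REGION, &mem);
}
#endif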
static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
{
	spin_lock(&vcpu->kvm->lock);
	kvm_mmu_slot_remove_write_access(vcpu, slot);
	spin_unlock(&vcpu->kvm->lock);
}
/*
 * Get (and clear) the dirty memory log for a memory slot.
 */
static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm,
				       struct kvm_dirty_log *log)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	int n;
	int cleared;
	unsigned long any = 0;

	spin_lock(&kvm->lock);

	/*
	 * Prevent changes to guest memory configuration even while the lock
	 * is not taken.
	 */
	++kvm->busy;
	spin_unlock(&kvm->lock);
	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = ALIGN(memslot->npages, 8) / 8;

	for (i = 0; !any && i < n; ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any) {
		cleared = 0;
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			struct kvm_vcpu *vcpu = vcpu_load(kvm, i);

			if (!vcpu)
				continue;
			if (!cleared) {
				do_remove_write_access(vcpu, log->slot);
				memset(memslot->dirty_bitmap, 0, n);
				cleared = 1;
			}
			kvm_arch_ops->tlb_flush(vcpu);
			vcpu_put(vcpu);
		}
	}

	r = 0;

out:
	spin_lock(&kvm->lock);
	--kvm->busy;
	spin_unlock(&kvm->lock);
	return r;
}
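/*
 * Illustrative sketch (editorial addition, not in the original source):
 * fetching the dirty bitmap from userspace. The caller owns the bitmap
 * buffer, which must cover memslot->npages bits rounded up as computed
 * above.
 */
#if 0
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int fetch_dirty_example(int kvm_fd, __u32 slot, void *bitmap)
{
	struct kvm_dirty_log log = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};

	return ioctl(kvm_fd, KVM_GET_DIRTY_LOG, &log);
}
#endif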
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_memory_slot *memslot = 0;
	unsigned long rel_gfn;

	for (i = 0; i < kvm->nmemslots; ++i) {
		memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages) {

			if (!memslot || !memslot->dirty_bitmap)
				return;

			rel_gfn = gfn - memslot->base_gfn;

			/* avoid RMW */
			if (!test_bit(rel_gfn, memslot->dirty_bitmap))
				set_bit(rel_gfn, memslot->dirty_bitmap);
			return;
		}
	}
}
static int emulator_read_std(unsigned long addr,
			     unsigned long *val,
			     unsigned int bytes,
			     struct x86_emulate_ctxt *ctxt)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;
	void *data = val;

	while (bytes) {
		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
		unsigned offset = addr & (PAGE_SIZE-1);
		unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
		unsigned long pfn;
		struct kvm_memory_slot *memslot;
		void *page;

		if (gpa == UNMAPPED_GVA)
			return X86EMUL_PROPAGATE_FAULT;
		pfn = gpa >> PAGE_SHIFT;
		memslot = gfn_to_memslot(vcpu->kvm, pfn);
		if (!memslot)
			return X86EMUL_UNHANDLEABLE;
		page = kmap_atomic(gfn_to_page(memslot, pfn), KM_USER0);

		memcpy(data, page + offset, tocopy);

		kunmap_atomic(page, KM_USER0);

		bytes -= tocopy;
		data += tocopy;
		addr += tocopy;
	}

	return X86EMUL_CONTINUE;
}
static int emulator_write_std(unsigned long addr,
			      unsigned long val,
			      unsigned int bytes,
			      struct x86_emulate_ctxt *ctxt)
{
	printk(KERN_ERR "emulator_write_std: addr %lx n %d\n",
	       addr, bytes);
	return X86EMUL_UNHANDLEABLE;
}
static int emulator_read_emulated(unsigned long addr,
				  unsigned long *val,
				  unsigned int bytes,
				  struct x86_emulate_ctxt *ctxt)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;

	if (vcpu->mmio_read_completed) {
		memcpy(val, vcpu->mmio_data, bytes);
		vcpu->mmio_read_completed = 0;
		return X86EMUL_CONTINUE;
	} else if (emulator_read_std(addr, val, bytes, ctxt)
		   == X86EMUL_CONTINUE)
		return X86EMUL_CONTINUE;
	else {
		gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);

		if (gpa == UNMAPPED_GVA)
			return X86EMUL_PROPAGATE_FAULT;
		vcpu->mmio_needed = 1;
		vcpu->mmio_phys_addr = gpa;
		vcpu->mmio_size = bytes;
		vcpu->mmio_is_write = 0;

		return X86EMUL_UNHANDLEABLE;
	}
}
static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
			       unsigned long val, int bytes)
{
	struct kvm_memory_slot *m;
	struct page *page;
	void *virt;

	if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
		return 0;
	m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
	if (!m)
		return 0;
	page = gfn_to_page(m, gpa >> PAGE_SHIFT);
	kvm_mmu_pre_write(vcpu, gpa, bytes);
	virt = kmap_atomic(page, KM_USER0);
	memcpy(virt + offset_in_page(gpa), &val, bytes);
	kunmap_atomic(virt, KM_USER0);
	kvm_mmu_post_write(vcpu, gpa, bytes);
	return 1;
}
,
941 struct x86_emulate_ctxt
*ctxt
)
943 struct kvm_vcpu
*vcpu
= ctxt
->vcpu
;
944 gpa_t gpa
= vcpu
->mmu
.gva_to_gpa(vcpu
, addr
);
946 if (gpa
== UNMAPPED_GVA
)
947 return X86EMUL_PROPAGATE_FAULT
;
949 if (emulator_write_phys(vcpu
, gpa
, val
, bytes
))
950 return X86EMUL_CONTINUE
;
952 vcpu
->mmio_needed
= 1;
953 vcpu
->mmio_phys_addr
= gpa
;
954 vcpu
->mmio_size
= bytes
;
955 vcpu
->mmio_is_write
= 1;
956 memcpy(vcpu
->mmio_data
, &val
, bytes
);
958 return X86EMUL_CONTINUE
;
static int emulator_cmpxchg_emulated(unsigned long addr,
				     unsigned long old,
				     unsigned long new,
				     unsigned int bytes,
				     struct x86_emulate_ctxt *ctxt)
{
	static int reported;

	if (!reported) {
		reported = 1;
		printk(KERN_WARNING "kvm: emulating exchange as write\n");
	}
	return emulator_write_emulated(addr, new, bytes, ctxt);
}
#ifdef CONFIG_X86_32

static int emulator_cmpxchg8b_emulated(unsigned long addr,
				       unsigned long old_lo,
				       unsigned long old_hi,
				       unsigned long new_lo,
				       unsigned long new_hi,
				       struct x86_emulate_ctxt *ctxt)
{
	static int reported;
	int r;

	if (!reported) {
		reported = 1;
		printk(KERN_WARNING "kvm: emulating exchange8b as write\n");
	}
	r = emulator_write_emulated(addr, new_lo, 4, ctxt);
	if (r != X86EMUL_CONTINUE)
		return r;
	return emulator_write_emulated(addr+4, new_hi, 4, ctxt);
}

#endif
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	return kvm_arch_ops->get_segment_base(vcpu, seg);
}
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
	return X86EMUL_CONTINUE;
}
int emulate_clts(struct kvm_vcpu *vcpu)
{
	unsigned long cr0;

	kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
	cr0 = vcpu->cr0 & ~CR0_TS_MASK;
	kvm_arch_ops->set_cr0(vcpu, cr0);
	return X86EMUL_CONTINUE;
}
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
	struct kvm_vcpu *vcpu = ctxt->vcpu;

	switch (dr) {
	case 0 ... 3:
		*dest = kvm_arch_ops->get_dr(vcpu, dr);
		return X86EMUL_CONTINUE;
	default:
		printk(KERN_DEBUG "%s: unexpected dr %u\n",
		       __FUNCTION__, dr);
		return X86EMUL_UNHANDLEABLE;
	}
}
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
{
	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
	int exception;

	kvm_arch_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
	if (exception) {
		/* FIXME: better handling */
		return X86EMUL_UNHANDLEABLE;
	}
	return X86EMUL_CONTINUE;
}
static void report_emulation_failure(struct x86_emulate_ctxt *ctxt)
{
	static int reported;
	u8 opcodes[4];
	unsigned long rip = ctxt->vcpu->rip;
	unsigned long rip_linear;

	rip_linear = rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS);

	if (reported)
		return;

	emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt);

	printk(KERN_ERR "emulation failed but !mmio_needed?"
	       " rip %lx %02x %02x %02x %02x\n",
	       rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
	reported = 1;
}
struct x86_emulate_ops emulate_ops = {
	.read_std            = emulator_read_std,
	.write_std           = emulator_write_std,
	.read_emulated       = emulator_read_emulated,
	.write_emulated      = emulator_write_emulated,
	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
#ifdef CONFIG_X86_32
	.cmpxchg8b_emulated  = emulator_cmpxchg8b_emulated,
#endif
};
int emulate_instruction(struct kvm_vcpu *vcpu,
			struct kvm_run *run,
			unsigned long cr2,
			u16 error_code)
{
	struct x86_emulate_ctxt emulate_ctxt;
	int r;
	int cs_db, cs_l;

	kvm_arch_ops->cache_regs(vcpu);

	kvm_arch_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);

	emulate_ctxt.vcpu = vcpu;
	emulate_ctxt.eflags = kvm_arch_ops->get_rflags(vcpu);
	emulate_ctxt.cr2 = cr2;
	emulate_ctxt.mode = (emulate_ctxt.eflags & X86_EFLAGS_VM)
		? X86EMUL_MODE_REAL : cs_l
		? X86EMUL_MODE_PROT64 : cs_db
		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;

	if (emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
		emulate_ctxt.cs_base = 0;
		emulate_ctxt.ds_base = 0;
		emulate_ctxt.es_base = 0;
		emulate_ctxt.ss_base = 0;
	} else {
		emulate_ctxt.cs_base = get_segment_base(vcpu, VCPU_SREG_CS);
		emulate_ctxt.ds_base = get_segment_base(vcpu, VCPU_SREG_DS);
		emulate_ctxt.es_base = get_segment_base(vcpu, VCPU_SREG_ES);
		emulate_ctxt.ss_base = get_segment_base(vcpu, VCPU_SREG_SS);
	}

	emulate_ctxt.gs_base = get_segment_base(vcpu, VCPU_SREG_GS);
	emulate_ctxt.fs_base = get_segment_base(vcpu, VCPU_SREG_FS);

	vcpu->mmio_is_write = 0;
	r = x86_emulate_memop(&emulate_ctxt, &emulate_ops);

	if ((r || vcpu->mmio_is_write) && run) {
		run->mmio.phys_addr = vcpu->mmio_phys_addr;
		memcpy(run->mmio.data, vcpu->mmio_data, 8);
		run->mmio.len = vcpu->mmio_size;
		run->mmio.is_write = vcpu->mmio_is_write;
	}

	if (r) {
		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
			return EMULATE_DONE;
		if (!vcpu->mmio_needed) {
			report_emulation_failure(&emulate_ctxt);
			return EMULATE_FAIL;
		}
		return EMULATE_DO_MMIO;
	}

	kvm_arch_ops->decache_regs(vcpu);
	kvm_arch_ops->set_rflags(vcpu, emulate_ctxt.eflags);

	if (vcpu->mmio_is_write)
		return EMULATE_DO_MMIO;

	return EMULATE_DONE;
}
EXPORT_SYMBOL_GPL(emulate_instruction);
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
}
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
	struct descriptor_table dt = { limit, base };

	kvm_arch_ops->set_gdt(vcpu, &dt);
}

void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
{
	struct descriptor_table dt = { limit, base };

	kvm_arch_ops->set_idt(vcpu, &dt);
}
void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
		   unsigned long *rflags)
{
	lmsw(vcpu, msw);
	*rflags = kvm_arch_ops->get_rflags(vcpu);
}
unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
{
	kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
	switch (cr) {
	case 0:
		return vcpu->cr0;
	case 2:
		return vcpu->cr2;
	case 3:
		return vcpu->cr3;
	case 4:
		return vcpu->cr4;
	default:
		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
		return 0;
	}
}
void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
		     unsigned long *rflags)
{
	switch (cr) {
	case 0:
		set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
		*rflags = kvm_arch_ops->get_rflags(vcpu);
		break;
	case 2:
		vcpu->cr2 = val;
		break;
	case 3:
		set_cr3(vcpu, val);
		break;
	case 4:
		set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
		break;
	default:
		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
	}
}
1214 static int vcpu_register_para(struct kvm_vcpu
*vcpu
, gpa_t para_state_gpa
)
1216 struct kvm_vcpu_para_state
*para_state
;
1217 hpa_t para_state_hpa
, hypercall_hpa
;
1218 struct page
*para_state_page
;
1219 unsigned char *hypercall
;
1220 gpa_t hypercall_gpa
;
1222 printk(KERN_DEBUG
"kvm: guest trying to enter paravirtual mode\n");
1223 printk(KERN_DEBUG
".... para_state_gpa: %08Lx\n", para_state_gpa
);
1226 * Needs to be page aligned:
1228 if (para_state_gpa
!= PAGE_ALIGN(para_state_gpa
))
1231 para_state_hpa
= gpa_to_hpa(vcpu
, para_state_gpa
);
1232 printk(KERN_DEBUG
".... para_state_hpa: %08Lx\n", para_state_hpa
);
1233 if (is_error_hpa(para_state_hpa
))
1236 para_state_page
= pfn_to_page(para_state_hpa
>> PAGE_SHIFT
);
1237 para_state
= kmap_atomic(para_state_page
, KM_USER0
);
1239 printk(KERN_DEBUG
".... guest version: %d\n", para_state
->guest_version
);
1240 printk(KERN_DEBUG
".... size: %d\n", para_state
->size
);
1242 para_state
->host_version
= KVM_PARA_API_VERSION
;
1244 * We cannot support guests that try to register themselves
1245 * with a newer API version than the host supports:
1247 if (para_state
->guest_version
> KVM_PARA_API_VERSION
) {
1248 para_state
->ret
= -KVM_EINVAL
;
1249 goto err_kunmap_skip
;
1252 hypercall_gpa
= para_state
->hypercall_gpa
;
1253 hypercall_hpa
= gpa_to_hpa(vcpu
, hypercall_gpa
);
1254 printk(KERN_DEBUG
".... hypercall_hpa: %08Lx\n", hypercall_hpa
);
1255 if (is_error_hpa(hypercall_hpa
)) {
1256 para_state
->ret
= -KVM_EINVAL
;
1257 goto err_kunmap_skip
;
1260 printk(KERN_DEBUG
"kvm: para guest successfully registered.\n");
1261 vcpu
->para_state_page
= para_state_page
;
1262 vcpu
->para_state_gpa
= para_state_gpa
;
1263 vcpu
->hypercall_gpa
= hypercall_gpa
;
1265 hypercall
= kmap_atomic(pfn_to_page(hypercall_hpa
>> PAGE_SHIFT
),
1266 KM_USER1
) + (hypercall_hpa
& ~PAGE_MASK
);
1267 kvm_arch_ops
->patch_hypercall(vcpu
, hypercall
);
1268 kunmap_atomic(hypercall
, KM_USER1
);
1270 para_state
->ret
= 0;
1272 kunmap_atomic(para_state
, KM_USER0
);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
	u64 data;

	switch (msr) {
	case 0xc0010010: /* SYSCFG */
	case 0xc0010015: /* HWCR */
	case MSR_IA32_PLATFORM_ID:
	case MSR_IA32_P5_MC_ADDR:
	case MSR_IA32_P5_MC_TYPE:
	case MSR_IA32_MC0_CTL:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MC0_MISC:
	case MSR_IA32_MC0_MISC+4:
	case MSR_IA32_MC0_MISC+8:
	case MSR_IA32_MC0_MISC+12:
	case MSR_IA32_MC0_MISC+16:
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_PERF_STATUS:
		/* MTRR registers */
	case 0xfe:
	case 0x200 ... 0x2ff:
		data = 0;
		break;
	case 0xcd: /* fsb frequency */
		data = 3;
		break;
	case MSR_IA32_APICBASE:
		data = vcpu->apic_base;
		break;
	case MSR_IA32_MISC_ENABLE:
		data = vcpu->ia32_misc_enable_msr;
		break;
#ifdef CONFIG_X86_64
	case MSR_EFER:
		data = vcpu->shadow_efer;
		break;
#endif
	default:
		printk(KERN_ERR "kvm: unhandled rdmsr: 0x%x\n", msr);
		return 1;
	}
	*pdata = data;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);
/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	return kvm_arch_ops->get_msr(vcpu, msr_index, pdata);
}
#ifdef CONFIG_X86_64

static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	if (efer & EFER_RESERVED_BITS) {
		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
		       efer);
		inject_gp(vcpu);
		return;
	}

	if (is_paging(vcpu)
	    && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
		inject_gp(vcpu);
		return;
	}

	kvm_arch_ops->set_efer(vcpu, efer);

	efer &= ~EFER_LMA;
	efer |= vcpu->shadow_efer & EFER_LMA;

	vcpu->shadow_efer = efer;
}

#endif
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
	switch (msr) {
#ifdef CONFIG_X86_64
	case MSR_EFER:
		set_efer(vcpu, data);
		break;
#endif
	case MSR_IA32_MC0_STATUS:
		printk(KERN_WARNING "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
		       __FUNCTION__, data);
		break;
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_UCODE_WRITE:
	case 0x200 ... 0x2ff: /* MTRRs */
		break;
	case MSR_IA32_APICBASE:
		vcpu->apic_base = data;
		break;
	case MSR_IA32_MISC_ENABLE:
		vcpu->ia32_misc_enable_msr = data;
		break;
	/*
	 * This is the 'probe whether the host is KVM' logic:
	 */
	case MSR_KVM_API_MAGIC:
		return vcpu_register_para(vcpu, data);

	default:
		printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);
/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	return kvm_arch_ops->set_msr(vcpu, msr_index, data);
}
void kvm_resched(struct kvm_vcpu *vcpu)
{
	vcpu_put(vcpu);
	cond_resched();
	/* Cannot fail - no vcpu unplug yet. */
	vcpu_load(vcpu->kvm, vcpu_slot(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_resched);
void load_msrs(struct vmx_msr_entry *e, int n)
{
	int i;

	for (i = 0; i < n; ++i)
		wrmsrl(e[i].index, e[i].data);
}
EXPORT_SYMBOL_GPL(load_msrs);

void save_msrs(struct vmx_msr_entry *e, int n)
{
	int i;

	for (i = 0; i < n; ++i)
		rdmsrl(e[i].index, e[i].data);
}
EXPORT_SYMBOL_GPL(save_msrs);
static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run)
{
	struct kvm_vcpu *vcpu;
	int r;

	if (!valid_vcpu(kvm_run->vcpu))
		return -EINVAL;

	vcpu = vcpu_load(kvm, kvm_run->vcpu);
	if (!vcpu)
		return -ENOENT;

	/* re-sync apic's tpr */
	vcpu->cr8 = kvm_run->cr8;

	if (kvm_run->emulated) {
		kvm_arch_ops->skip_emulated_instruction(vcpu);
		kvm_run->emulated = 0;
	}

	if (kvm_run->mmio_completed) {
		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
		vcpu->mmio_read_completed = 1;
	}

	vcpu->mmio_needed = 0;

	r = kvm_arch_ops->run(vcpu, kvm_run);

	vcpu_put(vcpu);
	return r;
}
*kvm
, struct kvm_regs
*regs
)
1472 struct kvm_vcpu
*vcpu
;
1474 if (!valid_vcpu(regs
->vcpu
))
1477 vcpu
= vcpu_load(kvm
, regs
->vcpu
);
1481 kvm_arch_ops
->cache_regs(vcpu
);
1483 regs
->rax
= vcpu
->regs
[VCPU_REGS_RAX
];
1484 regs
->rbx
= vcpu
->regs
[VCPU_REGS_RBX
];
1485 regs
->rcx
= vcpu
->regs
[VCPU_REGS_RCX
];
1486 regs
->rdx
= vcpu
->regs
[VCPU_REGS_RDX
];
1487 regs
->rsi
= vcpu
->regs
[VCPU_REGS_RSI
];
1488 regs
->rdi
= vcpu
->regs
[VCPU_REGS_RDI
];
1489 regs
->rsp
= vcpu
->regs
[VCPU_REGS_RSP
];
1490 regs
->rbp
= vcpu
->regs
[VCPU_REGS_RBP
];
1491 #ifdef CONFIG_X86_64
1492 regs
->r8
= vcpu
->regs
[VCPU_REGS_R8
];
1493 regs
->r9
= vcpu
->regs
[VCPU_REGS_R9
];
1494 regs
->r10
= vcpu
->regs
[VCPU_REGS_R10
];
1495 regs
->r11
= vcpu
->regs
[VCPU_REGS_R11
];
1496 regs
->r12
= vcpu
->regs
[VCPU_REGS_R12
];
1497 regs
->r13
= vcpu
->regs
[VCPU_REGS_R13
];
1498 regs
->r14
= vcpu
->regs
[VCPU_REGS_R14
];
1499 regs
->r15
= vcpu
->regs
[VCPU_REGS_R15
];
1502 regs
->rip
= vcpu
->rip
;
1503 regs
->rflags
= kvm_arch_ops
->get_rflags(vcpu
);
1506 * Don't leak debug flags in case they were set for guest debugging
1508 if (vcpu
->guest_debug
.enabled
&& vcpu
->guest_debug
.singlestep
)
1509 regs
->rflags
&= ~(X86_EFLAGS_TF
| X86_EFLAGS_RF
);
1516 static int kvm_dev_ioctl_set_regs(struct kvm
*kvm
, struct kvm_regs
*regs
)
1518 struct kvm_vcpu
*vcpu
;
1520 if (!valid_vcpu(regs
->vcpu
))
1523 vcpu
= vcpu_load(kvm
, regs
->vcpu
);
1527 vcpu
->regs
[VCPU_REGS_RAX
] = regs
->rax
;
1528 vcpu
->regs
[VCPU_REGS_RBX
] = regs
->rbx
;
1529 vcpu
->regs
[VCPU_REGS_RCX
] = regs
->rcx
;
1530 vcpu
->regs
[VCPU_REGS_RDX
] = regs
->rdx
;
1531 vcpu
->regs
[VCPU_REGS_RSI
] = regs
->rsi
;
1532 vcpu
->regs
[VCPU_REGS_RDI
] = regs
->rdi
;
1533 vcpu
->regs
[VCPU_REGS_RSP
] = regs
->rsp
;
1534 vcpu
->regs
[VCPU_REGS_RBP
] = regs
->rbp
;
1535 #ifdef CONFIG_X86_64
1536 vcpu
->regs
[VCPU_REGS_R8
] = regs
->r8
;
1537 vcpu
->regs
[VCPU_REGS_R9
] = regs
->r9
;
1538 vcpu
->regs
[VCPU_REGS_R10
] = regs
->r10
;
1539 vcpu
->regs
[VCPU_REGS_R11
] = regs
->r11
;
1540 vcpu
->regs
[VCPU_REGS_R12
] = regs
->r12
;
1541 vcpu
->regs
[VCPU_REGS_R13
] = regs
->r13
;
1542 vcpu
->regs
[VCPU_REGS_R14
] = regs
->r14
;
1543 vcpu
->regs
[VCPU_REGS_R15
] = regs
->r15
;
1546 vcpu
->rip
= regs
->rip
;
1547 kvm_arch_ops
->set_rflags(vcpu
, regs
->rflags
);
1549 kvm_arch_ops
->decache_regs(vcpu
);
static void get_segment(struct kvm_vcpu *vcpu,
			struct kvm_segment *var, int seg)
{
	return kvm_arch_ops->get_segment(vcpu, var, seg);
}
static int kvm_dev_ioctl_get_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
{
	struct kvm_vcpu *vcpu;
	struct descriptor_table dt;

	if (!valid_vcpu(sregs->vcpu))
		return -EINVAL;
	vcpu = vcpu_load(kvm, sregs->vcpu);
	if (!vcpu)
		return -ENOENT;

	get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

	kvm_arch_ops->get_idt(vcpu, &dt);
	sregs->idt.limit = dt.limit;
	sregs->idt.base = dt.base;
	kvm_arch_ops->get_gdt(vcpu, &dt);
	sregs->gdt.limit = dt.limit;
	sregs->gdt.base = dt.base;

	kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);
	sregs->cr0 = vcpu->cr0;
	sregs->cr2 = vcpu->cr2;
	sregs->cr3 = vcpu->cr3;
	sregs->cr4 = vcpu->cr4;
	sregs->cr8 = vcpu->cr8;
	sregs->efer = vcpu->shadow_efer;
	sregs->apic_base = vcpu->apic_base;

	memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
	       sizeof sregs->interrupt_bitmap);

	vcpu_put(vcpu);

	return 0;
}
static void set_segment(struct kvm_vcpu *vcpu,
			struct kvm_segment *var, int seg)
{
	return kvm_arch_ops->set_segment(vcpu, var, seg);
}
static int kvm_dev_ioctl_set_sregs(struct kvm *kvm, struct kvm_sregs *sregs)
{
	struct kvm_vcpu *vcpu;
	int mmu_reset_needed = 0;
	int i;
	struct descriptor_table dt;

	if (!valid_vcpu(sregs->vcpu))
		return -EINVAL;
	vcpu = vcpu_load(kvm, sregs->vcpu);
	if (!vcpu)
		return -ENOENT;

	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);

	dt.limit = sregs->idt.limit;
	dt.base = sregs->idt.base;
	kvm_arch_ops->set_idt(vcpu, &dt);
	dt.limit = sregs->gdt.limit;
	dt.base = sregs->gdt.base;
	kvm_arch_ops->set_gdt(vcpu, &dt);

	vcpu->cr2 = sregs->cr2;
	mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
	vcpu->cr3 = sregs->cr3;

	vcpu->cr8 = sregs->cr8;

	mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
#ifdef CONFIG_X86_64
	kvm_arch_ops->set_efer(vcpu, sregs->efer);
#endif
	vcpu->apic_base = sregs->apic_base;

	kvm_arch_ops->decache_cr0_cr4_guest_bits(vcpu);

	mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
	kvm_arch_ops->set_cr0_no_modeswitch(vcpu, sregs->cr0);

	mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
	kvm_arch_ops->set_cr4(vcpu, sregs->cr4);

	if (!is_long_mode(vcpu) && is_pae(vcpu))
		load_pdptrs(vcpu, vcpu->cr3);

	if (mmu_reset_needed)
		kvm_mmu_reset_context(vcpu);

	memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
	       sizeof vcpu->irq_pending);
	vcpu->irq_summary = 0;
	for (i = 0; i < NR_IRQ_WORDS; ++i)
		if (vcpu->irq_pending[i])
			__set_bit(i, &vcpu->irq_summary);

	vcpu_put(vcpu);

	return 0;
}
/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu.
 */
static u32 msrs_to_save[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_K6_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TIME_STAMP_COUNTER,
};

static unsigned num_msrs_to_save;

static u32 emulated_msrs[] = {
	MSR_IA32_MISC_ENABLE,
};
static __init void kvm_init_msr_list(void)
{
	u32 dummy[2];
	unsigned i, j;

	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
			continue;
		if (j < i)
			msrs_to_save[j] = msrs_to_save[i];
		j++;
	}
	num_msrs_to_save = j;
}
/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	return set_msr(vcpu, index, *data);
}
/*
 * Read or write a bunch of msrs. All parameters are kernel addresses.
 *
 * @return number of msrs set successfully.
 */
static int __msr_io(struct kvm *kvm, struct kvm_msrs *msrs,
		    struct kvm_msr_entry *entries,
		    int (*do_msr)(struct kvm_vcpu *vcpu,
				  unsigned index, u64 *data))
{
	struct kvm_vcpu *vcpu;
	int i;

	if (!valid_vcpu(msrs->vcpu))
		return -EINVAL;

	vcpu = vcpu_load(kvm, msrs->vcpu);
	if (!vcpu)
		return -ENOENT;

	for (i = 0; i < msrs->nmsrs; ++i)
		if (do_msr(vcpu, entries[i].index, &entries[i].data))
			break;

	vcpu_put(vcpu);

	return i;
}
/*
 * Read or write a bunch of msrs. Parameters are user addresses.
 *
 * @return number of msrs set successfully.
 */
static int msr_io(struct kvm *kvm, struct kvm_msrs __user *user_msrs,
		  int (*do_msr)(struct kvm_vcpu *vcpu,
				unsigned index, u64 *data),
		  int writeback)
{
	struct kvm_msrs msrs;
	struct kvm_msr_entry *entries;
	int r, n;
	unsigned size;

	r = -EFAULT;
	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
		goto out;

	r = -E2BIG;
	if (msrs.nmsrs >= MAX_IO_MSRS)
		goto out;

	r = -ENOMEM;
	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
	entries = vmalloc(size);
	if (!entries)
		goto out;

	r = -EFAULT;
	if (copy_from_user(entries, user_msrs->entries, size))
		goto out_free;

	r = n = __msr_io(kvm, &msrs, entries, do_msr);
	if (r < 0)
		goto out_free;

	r = -EFAULT;
	if (writeback && copy_to_user(user_msrs->entries, entries, size))
		goto out_free;

	r = n;

out_free:
	vfree(entries);
out:
	return r;
}
*kvm
, int vcpu_slot
)
1806 struct kvm_vcpu
*vcpu
;
1808 if (vcpu_slot
< 0 || vcpu_slot
>= KVM_MAX_VCPUS
)
1810 vcpu
= vcpu_load(kvm
, vcpu_slot
);
/*
 * Translate a guest virtual address to a guest physical address.
 */
static int kvm_dev_ioctl_translate(struct kvm *kvm, struct kvm_translation *tr)
{
	unsigned long vaddr = tr->linear_address;
	struct kvm_vcpu *vcpu;
	gpa_t gpa;

	vcpu = vcpu_load(kvm, tr->vcpu);
	if (!vcpu)
		return -ENOENT;
	spin_lock(&kvm->lock);
	gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
	tr->physical_address = gpa;
	tr->valid = gpa != UNMAPPED_GVA;
	tr->writeable = 1;
	tr->usermode = 0;
	spin_unlock(&kvm->lock);
	vcpu_put(vcpu);

	return 0;
}
*kvm
, struct kvm_interrupt
*irq
)
1845 struct kvm_vcpu
*vcpu
;
1847 if (!valid_vcpu(irq
->vcpu
))
1849 if (irq
->irq
< 0 || irq
->irq
>= 256)
1851 vcpu
= vcpu_load(kvm
, irq
->vcpu
);
1855 set_bit(irq
->irq
, vcpu
->irq_pending
);
1856 set_bit(irq
->irq
/ BITS_PER_LONG
, &vcpu
->irq_summary
);
static int kvm_dev_ioctl_debug_guest(struct kvm *kvm,
				     struct kvm_debug_guest *dbg)
{
	struct kvm_vcpu *vcpu;
	int r;

	if (!valid_vcpu(dbg->vcpu))
		return -EINVAL;
	vcpu = vcpu_load(kvm, dbg->vcpu);
	if (!vcpu)
		return -ENOENT;

	r = kvm_arch_ops->set_guest_debug(vcpu, dbg);

	vcpu_put(vcpu);

	return r;
}
static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	int r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VCPU:
		r = kvm_dev_ioctl_create_vcpu(kvm, arg);
		if (r)
			goto out;
		break;
	case KVM_RUN: {
		struct kvm_run kvm_run;

		r = -EFAULT;
		if (copy_from_user(&kvm_run, (void *)arg, sizeof kvm_run))
			goto out;
		r = kvm_dev_ioctl_run(kvm, &kvm_run);
		if (r < 0 && r != -EINTR)
			goto out;
		if (copy_to_user((void *)arg, &kvm_run, sizeof kvm_run)) {
			r = -EFAULT;
			goto out;
		}
		break;
	}
	case KVM_GET_REGS: {
		struct kvm_regs kvm_regs;

		r = -EFAULT;
		if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs))
			goto out;
		r = kvm_dev_ioctl_get_regs(kvm, &kvm_regs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user((void *)arg, &kvm_regs, sizeof kvm_regs))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs kvm_regs;

		r = -EFAULT;
		if (copy_from_user(&kvm_regs, (void *)arg, sizeof kvm_regs))
			goto out;
		r = kvm_dev_ioctl_set_regs(kvm, &kvm_regs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_SREGS: {
		struct kvm_sregs kvm_sregs;

		r = -EFAULT;
		if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs))
			goto out;
		r = kvm_dev_ioctl_get_sregs(kvm, &kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user((void *)arg, &kvm_sregs, sizeof kvm_sregs))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		struct kvm_sregs kvm_sregs;

		r = -EFAULT;
		if (copy_from_user(&kvm_sregs, (void *)arg, sizeof kvm_sregs))
			goto out;
		r = kvm_dev_ioctl_set_sregs(kvm, &kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, (void *)arg, sizeof tr))
			goto out;
		r = kvm_dev_ioctl_translate(kvm, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user((void *)arg, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_INTERRUPT: {
		struct kvm_interrupt irq;

		r = -EFAULT;
		if (copy_from_user(&irq, (void *)arg, sizeof irq))
			goto out;
		r = kvm_dev_ioctl_interrupt(kvm, &irq);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_DEBUG_GUEST: {
		struct kvm_debug_guest dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, (void *)arg, sizeof dbg))
			goto out;
		r = kvm_dev_ioctl_debug_guest(kvm, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MEMORY_REGION: {
		struct kvm_memory_region kvm_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_mem, (void *)arg, sizeof kvm_mem))
			goto out;
		r = kvm_dev_ioctl_set_memory_region(kvm, &kvm_mem);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, (void *)arg, sizeof log))
			goto out;
		r = kvm_dev_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_MSRS:
		r = msr_io(kvm, (void __user *)arg, get_msr, 1);
		break;
	case KVM_SET_MSRS:
		r = msr_io(kvm, (void __user *)arg, do_set_msr, 0);
		break;
	case KVM_GET_MSR_INDEX_LIST: {
		struct kvm_msr_list __user *user_msr_list = (void __user *)arg;
		struct kvm_msr_list msr_list;
		unsigned n;

		r = -EFAULT;
		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
			goto out;
		n = msr_list.nmsrs;
		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
			goto out;
		r = -E2BIG;
		if (n < num_msrs_to_save)
			goto out;
		r = -EFAULT;
		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
				 num_msrs_to_save * sizeof(u32)))
			goto out;
		if (copy_to_user(user_msr_list->indices
				 + num_msrs_to_save * sizeof(u32),
				 &emulated_msrs,
				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
			goto out;
		r = 0;
		break;
	}
	case KVM_DUMP_VCPU:
		r = kvm_dev_ioctl_dump_vcpu(kvm, arg);
		if (r)
			goto out;
		break;
	default:
		;
	}
out:
	return r;
}
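/*
 * Illustrative sketch (editorial addition, not in the original source):
 * the minimal userspace handshake against this dispatcher - open the
 * character device, verify the API version, then create vcpu 0. Error
 * handling is elided.
 */
#if 0
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int kvm_open_and_probe_example(void)
{
	int fd = open("/dev/kvm", O_RDWR);

	if (fd < 0)
		return -1;
	if (ioctl(fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		return -1;
	if (ioctl(fd, KVM_CREATE_VCPU, 0) < 0)
		return -1;
	return fd;
}
#endif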
static struct page *kvm_dev_nopage(struct vm_area_struct *vma,
				   unsigned long address,
				   int *type)
{
	struct kvm *kvm = vma->vm_file->private_data;
	unsigned long pgoff;
	struct kvm_memory_slot *slot;
	struct page *page;

	*type = VM_FAULT_MINOR;
	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	slot = gfn_to_memslot(kvm, pgoff);
	if (!slot)
		return NOPAGE_SIGBUS;
	page = gfn_to_page(slot, pgoff);
	if (!page)
		return NOPAGE_SIGBUS;
	get_page(page);
	return page;
}

static struct vm_operations_struct kvm_dev_vm_ops = {
	.nopage = kvm_dev_nopage,
};
static int kvm_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_dev_vm_ops;
	return 0;
}

static struct file_operations kvm_chardev_ops = {
	.open		= kvm_dev_open,
	.release	= kvm_dev_release,
	.unlocked_ioctl	= kvm_dev_ioctl,
	.compat_ioctl	= kvm_dev_ioctl,
	.mmap		= kvm_dev_mmap,
};

static struct miscdevice kvm_dev = {
	MISC_DYNAMIC_MINOR,
	"kvm",
	&kvm_chardev_ops,
};
*notifier
, unsigned long val
,
2122 if (val
== SYS_RESTART
) {
2124 * Some (well, at least mine) BIOSes hang on reboot if
2127 printk(KERN_INFO
"kvm: exiting hardware virtualization\n");
2128 on_each_cpu(kvm_arch_ops
->hardware_disable
, 0, 0, 1);
2133 static struct notifier_block kvm_reboot_notifier
= {
2134 .notifier_call
= kvm_reboot
,
/*
 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
 * cached on it.
 */
static void decache_vcpus_on_cpu(int cpu)
{
	struct kvm *vm;
	struct kvm_vcpu *vcpu;
	int i;

	spin_lock(&kvm_lock);
	list_for_each_entry(vm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = &vm->vcpus[i];
			/*
			 * If the vcpu is locked, then it is running on some
			 * other cpu and therefore it is not cached on the
			 * cpu in question.
			 *
			 * If it's not locked, check the last cpu it executed
			 * on.
			 */
			if (mutex_trylock(&vcpu->mutex)) {
				if (vcpu->cpu == cpu) {
					kvm_arch_ops->vcpu_decache(vcpu);
					vcpu->cpu = -1;
				}
				mutex_unlock(&vcpu->mutex);
			}
		}
	spin_unlock(&kvm_lock);
}
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	switch (val) {
	case CPU_DOWN_PREPARE:
	case CPU_UP_CANCELED:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		decache_vcpus_on_cpu(cpu);
		smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
					 0, 0, 1);
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
					 0, 0, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};
static __init void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	debugfs_dir = debugfs_create_dir("kvm", 0);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_u32(p->name, 0444, debugfs_dir,
					       p->data);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(debugfs_dir);
}
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	decache_vcpus_on_cpu(raw_smp_processor_id());
	on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	set_kset_name("kvm"),
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};
hpa_t bad_page_address;

int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
{
	int r;

	if (kvm_arch_ops) {
		printk(KERN_ERR "kvm: already loaded the other module\n");
		return -EEXIST;
	}

	if (!ops->cpu_has_kvm_support()) {
		printk(KERN_ERR "kvm: no hardware support\n");
		return -EOPNOTSUPP;
	}
	if (ops->disabled_by_bios()) {
		printk(KERN_ERR "kvm: disabled by bios\n");
		return -EOPNOTSUPP;
	}

	kvm_arch_ops = ops;

	r = kvm_arch_ops->hardware_setup();
	if (r < 0)
		return r;

	on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_1;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_2;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_3;

	kvm_chardev_ops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	return r;

out_free:
	sysdev_unregister(&kvm_sysdev);
out_free_3:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_2:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_1:
	on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
	kvm_arch_ops->hardware_unsetup();
	return r;
}
2308 misc_deregister(&kvm_dev
);
2309 sysdev_unregister(&kvm_sysdev
);
2310 sysdev_class_unregister(&kvm_sysdev_class
);
2311 unregister_reboot_notifier(&kvm_reboot_notifier
);
2312 unregister_cpu_notifier(&kvm_cpu_notifier
);
2313 on_each_cpu(kvm_arch_ops
->hardware_disable
, 0, 0, 1);
2314 kvm_arch_ops
->hardware_unsetup();
2315 kvm_arch_ops
= NULL
;
static __init int kvm_init(void)
{
	static struct page *bad_page;
	int r = 0;

	kvm_init_debug();

	kvm_init_msr_list();

	if ((bad_page = alloc_page(GFP_KERNEL)) == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
	memset(__va(bad_page_address), 0, PAGE_SIZE);

	return r;

out:
	kvm_exit_debug();
	return r;
}
static __exit void kvm_exit(void)
{
	kvm_exit_debug();
	__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
}

module_init(kvm_init)
module_exit(kvm_exit)

EXPORT_SYMBOL_GPL(kvm_init_arch);
EXPORT_SYMBOL_GPL(kvm_exit_arch);