2 * qemu/kvm integration, x86 specific code
4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
10 #include "config-host.h"
20 #include <sys/utsname.h>
21 #include <linux/kvm_para.h>
22 #include <sys/ioctl.h>
27 #define MSR_IA32_TSC 0x10
29 static struct kvm_msr_list
*kvm_msr_list
;
30 extern unsigned int kvm_shadow_memory
;
32 int kvm_set_tss_addr(kvm_context_t kvm
, unsigned long addr
)
36 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_TSS_ADDR
, addr
);
38 fprintf(stderr
, "kvm_set_tss_addr: %m\n");
44 static int kvm_init_tss(kvm_context_t kvm
)
48 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_SET_TSS_ADDR
);
51 * this address is 3 pages before the bios, and the bios should present
54 r
= kvm_set_tss_addr(kvm
, 0xfeffd000);
56 fprintf(stderr
, "kvm_init_tss: unable to set tss addr\n");
60 fprintf(stderr
, "kvm does not support KVM_CAP_SET_TSS_ADDR\n");
65 static int kvm_set_identity_map_addr(kvm_context_t kvm
, uint64_t addr
)
67 #ifdef KVM_CAP_SET_IDENTITY_MAP_ADDR
70 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_SET_IDENTITY_MAP_ADDR
);
72 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_IDENTITY_MAP_ADDR
, &addr
);
74 fprintf(stderr
, "kvm_set_identity_map_addr: %m\n");
83 static int kvm_init_identity_map_page(kvm_context_t kvm
)
85 #ifdef KVM_CAP_SET_IDENTITY_MAP_ADDR
88 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_SET_IDENTITY_MAP_ADDR
);
91 * this address is 4 pages before the bios, and the bios should present
94 r
= kvm_set_identity_map_addr(kvm
, 0xfeffc000);
96 fprintf(stderr
, "kvm_init_identity_map_page: "
97 "unable to set identity mapping addr\n");
105 static int kvm_create_pit(kvm_context_t kvm
)
110 kvm_state
->pit_in_kernel
= 0;
111 if (!kvm
->no_pit_creation
) {
112 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_PIT
);
114 r
= kvm_vm_ioctl(kvm_state
, KVM_CREATE_PIT
);
116 kvm_state
->pit_in_kernel
= 1;
118 fprintf(stderr
, "Create kernel PIC irqchip failed\n");
127 int kvm_arch_create(kvm_context_t kvm
, unsigned long phys_mem_bytes
,
132 r
= kvm_init_tss(kvm
);
137 r
= kvm_init_identity_map_page(kvm
);
143 * Tell fw_cfg to notify the BIOS to reserve the range.
145 if (e820_add_entry(0xfeffc000, 0x4000, E820_RESERVED
) < 0) {
146 perror("e820_add_entry() table is full");
150 r
= kvm_create_pit(kvm
);
155 r
= kvm_init_coalesced_mmio(kvm
);
163 #ifdef KVM_EXIT_TPR_ACCESS
165 static int kvm_handle_tpr_access(CPUState
*env
)
167 struct kvm_run
*run
= env
->kvm_run
;
168 kvm_tpr_access_report(env
,
170 run
->tpr_access
.is_write
);
175 int kvm_enable_vapic(CPUState
*env
, uint64_t vapic
)
177 struct kvm_vapic_addr va
= {
181 return kvm_vcpu_ioctl(env
, KVM_SET_VAPIC_ADDR
, &va
);
186 int kvm_arch_run(CPUState
*env
)
189 struct kvm_run
*run
= env
->kvm_run
;
191 switch (run
->exit_reason
) {
192 #ifdef KVM_EXIT_SET_TPR
193 case KVM_EXIT_SET_TPR
:
196 #ifdef KVM_EXIT_TPR_ACCESS
197 case KVM_EXIT_TPR_ACCESS
:
198 r
= kvm_handle_tpr_access(env
);
209 #ifdef KVM_CAP_IRQCHIP
211 int kvm_get_lapic(CPUState
*env
, struct kvm_lapic_state
*s
)
215 if (!kvm_irqchip_in_kernel()) {
219 r
= kvm_vcpu_ioctl(env
, KVM_GET_LAPIC
, s
);
221 fprintf(stderr
, "KVM_GET_LAPIC failed\n");
226 int kvm_set_lapic(CPUState
*env
, struct kvm_lapic_state
*s
)
230 if (!kvm_irqchip_in_kernel()) {
234 r
= kvm_vcpu_ioctl(env
, KVM_SET_LAPIC
, s
);
237 fprintf(stderr
, "KVM_SET_LAPIC failed\n");
246 int kvm_get_pit(kvm_context_t kvm
, struct kvm_pit_state
*s
)
248 if (!kvm_pit_in_kernel()) {
251 return kvm_vm_ioctl(kvm_state
, KVM_GET_PIT
, s
);
254 int kvm_set_pit(kvm_context_t kvm
, struct kvm_pit_state
*s
)
256 if (!kvm_pit_in_kernel()) {
259 return kvm_vm_ioctl(kvm_state
, KVM_SET_PIT
, s
);
262 #ifdef KVM_CAP_PIT_STATE2
263 int kvm_get_pit2(kvm_context_t kvm
, struct kvm_pit_state2
*ps2
)
265 if (!kvm_pit_in_kernel()) {
268 return kvm_vm_ioctl(kvm_state
, KVM_GET_PIT2
, ps2
);
271 int kvm_set_pit2(kvm_context_t kvm
, struct kvm_pit_state2
*ps2
)
273 if (!kvm_pit_in_kernel()) {
276 return kvm_vm_ioctl(kvm_state
, KVM_SET_PIT2
, ps2
);
282 int kvm_has_pit_state2(kvm_context_t kvm
)
286 #ifdef KVM_CAP_PIT_STATE2
287 r
= kvm_check_extension(kvm_state
, KVM_CAP_PIT_STATE2
);
292 void kvm_show_code(CPUState
*env
)
294 #define SHOW_CODE_LEN 50
295 struct kvm_regs regs
;
296 struct kvm_sregs sregs
;
300 char code_str
[SHOW_CODE_LEN
* 3 + 1];
303 r
= kvm_vcpu_ioctl(env
, KVM_GET_SREGS
, &sregs
);
305 perror("KVM_GET_SREGS");
308 r
= kvm_vcpu_ioctl(env
, KVM_GET_REGS
, ®s
);
310 perror("KVM_GET_REGS");
313 rip
= sregs
.cs
.base
+ regs
.rip
;
314 back_offset
= regs
.rip
;
315 if (back_offset
> 20) {
319 for (n
= -back_offset
; n
< SHOW_CODE_LEN
-back_offset
; ++n
) {
321 strcat(code_str
, " -->");
323 cpu_physical_memory_rw(rip
+ n
, &code
, 1, 1);
324 sprintf(code_str
+ strlen(code_str
), " %02x", code
);
326 fprintf(stderr
, "code:%s\n", code_str
);
331 * Returns available msr list. User must free.
333 static struct kvm_msr_list
*kvm_get_msr_list(void)
335 struct kvm_msr_list sizer
, *msrs
;
339 r
= kvm_ioctl(kvm_state
, KVM_GET_MSR_INDEX_LIST
, &sizer
);
340 if (r
< 0 && r
!= -E2BIG
) {
343 /* Old kernel modules had a bug and could write beyond the provided
344 memory. Allocate at least a safe amount of 1K. */
345 msrs
= qemu_malloc(MAX(1024, sizeof(*msrs
) +
346 sizer
.nmsrs
* sizeof(*msrs
->indices
)));
348 msrs
->nmsrs
= sizer
.nmsrs
;
349 r
= kvm_ioctl(kvm_state
, KVM_GET_MSR_INDEX_LIST
, msrs
);
358 static void print_seg(FILE *file
, const char *name
, struct kvm_segment
*seg
)
361 "%s %04x (%08llx/%08x p %d dpl %d db %d s %d type %x l %d"
363 name
, seg
->selector
, seg
->base
, seg
->limit
, seg
->present
,
364 seg
->dpl
, seg
->db
, seg
->s
, seg
->type
, seg
->l
, seg
->g
,
368 static void print_dt(FILE *file
, const char *name
, struct kvm_dtable
*dt
)
370 fprintf(stderr
, "%s %llx/%x\n", name
, dt
->base
, dt
->limit
);
373 void kvm_show_regs(CPUState
*env
)
375 struct kvm_regs regs
;
376 struct kvm_sregs sregs
;
379 r
= kvm_vcpu_ioctl(env
, KVM_GET_REGS
, ®s
);
381 perror("KVM_GET_REGS");
385 "rax %016llx rbx %016llx rcx %016llx rdx %016llx\n"
386 "rsi %016llx rdi %016llx rsp %016llx rbp %016llx\n"
387 "r8 %016llx r9 %016llx r10 %016llx r11 %016llx\n"
388 "r12 %016llx r13 %016llx r14 %016llx r15 %016llx\n"
389 "rip %016llx rflags %08llx\n",
390 regs
.rax
, regs
.rbx
, regs
.rcx
, regs
.rdx
,
391 regs
.rsi
, regs
.rdi
, regs
.rsp
, regs
.rbp
,
392 regs
.r8
, regs
.r9
, regs
.r10
, regs
.r11
,
393 regs
.r12
, regs
.r13
, regs
.r14
, regs
.r15
,
394 regs
.rip
, regs
.rflags
);
395 r
= kvm_vcpu_ioctl(env
, KVM_GET_SREGS
, &sregs
);
397 perror("KVM_GET_SREGS");
400 print_seg(stderr
, "cs", &sregs
.cs
);
401 print_seg(stderr
, "ds", &sregs
.ds
);
402 print_seg(stderr
, "es", &sregs
.es
);
403 print_seg(stderr
, "ss", &sregs
.ss
);
404 print_seg(stderr
, "fs", &sregs
.fs
);
405 print_seg(stderr
, "gs", &sregs
.gs
);
406 print_seg(stderr
, "tr", &sregs
.tr
);
407 print_seg(stderr
, "ldt", &sregs
.ldt
);
408 print_dt(stderr
, "gdt", &sregs
.gdt
);
409 print_dt(stderr
, "idt", &sregs
.idt
);
410 fprintf(stderr
, "cr0 %llx cr2 %llx cr3 %llx cr4 %llx cr8 %llx"
412 sregs
.cr0
, sregs
.cr2
, sregs
.cr3
, sregs
.cr4
, sregs
.cr8
,
416 static void kvm_set_cr8(CPUState
*env
, uint64_t cr8
)
418 env
->kvm_run
->cr8
= cr8
;
421 int kvm_set_shadow_pages(kvm_context_t kvm
, unsigned int nrshadow_pages
)
423 #ifdef KVM_CAP_MMU_SHADOW_CACHE_CONTROL
426 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
,
427 KVM_CAP_MMU_SHADOW_CACHE_CONTROL
);
429 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_NR_MMU_PAGES
, nrshadow_pages
);
431 fprintf(stderr
, "kvm_set_shadow_pages: %m\n");
440 int kvm_get_shadow_pages(kvm_context_t kvm
, unsigned int *nrshadow_pages
)
442 #ifdef KVM_CAP_MMU_SHADOW_CACHE_CONTROL
445 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
,
446 KVM_CAP_MMU_SHADOW_CACHE_CONTROL
);
448 *nrshadow_pages
= kvm_vm_ioctl(kvm_state
, KVM_GET_NR_MMU_PAGES
);
456 static int kvm_enable_tpr_access_reporting(CPUState
*env
)
459 struct kvm_tpr_access_ctl tac
= { .enabled
= 1 };
461 r
= kvm_ioctl(env
->kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_VAPIC
);
465 return kvm_vcpu_ioctl(env
, KVM_TPR_ACCESS_REPORTING
, &tac
);
#ifdef KVM_CAP_ADJUST_CLOCK
/* Saved/restored kvmclock value for migration (see vmstate below). */
static struct kvm_clock_data kvmclock_data;

/*
 * Snapshot the current kvmclock from the kernel just before the vmstate
 * section is written out.
 */
static void kvmclock_pre_save(void *opaque)
{
    struct kvm_clock_data *cl = opaque;

    kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, cl);
}

/*
 * Push the migrated kvmclock value back into the kernel after load.
 * Returns the ioctl result (0 on success).
 */
static int kvmclock_post_load(void *opaque, int version_id)
{
    struct kvm_clock_data *cl = opaque;

    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, cl);
}

/*
 * Migration descriptor for the kvmclock offset.
 * NOTE(review): .name and .version_id reconstructed from upstream
 * qemu-kvm — the extraction is missing those lines; confirm.
 */
static const VMStateDescription vmstate_kvmclock = {
    .name = "kvmclock",
    .version_id = 1,
    .minimum_version_id = 1,
    .minimum_version_id_old = 1,
    .pre_save = kvmclock_pre_save,
    .post_load = kvmclock_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_U64(clock, struct kvm_clock_data),
        VMSTATE_END_OF_LIST()
    }
};
#endif /* KVM_CAP_ADJUST_CLOCK */
500 int kvm_arch_qemu_create_context(void)
503 struct utsname utsname
;
506 lm_capable_kernel
= strcmp(utsname
.machine
, "x86_64") == 0;
508 if (kvm_shadow_memory
) {
509 kvm_set_shadow_pages(kvm_context
, kvm_shadow_memory
);
512 kvm_msr_list
= kvm_get_msr_list();
517 #ifdef KVM_CAP_ADJUST_CLOCK
518 if (kvm_check_extension(kvm_state
, KVM_CAP_ADJUST_CLOCK
)) {
519 vmstate_register(NULL
, 0, &vmstate_kvmclock
, &kvmclock_data
);
523 r
= kvm_set_boot_cpu_id(0);
524 if (r
< 0 && r
!= -ENOSYS
) {
531 static void kvm_arch_save_mpstate(CPUState
*env
)
533 #ifdef KVM_CAP_MP_STATE
535 struct kvm_mp_state mp_state
;
537 r
= kvm_get_mpstate(env
, &mp_state
);
541 env
->mp_state
= mp_state
.mp_state
;
542 if (kvm_irqchip_in_kernel()) {
543 env
->halted
= (env
->mp_state
== KVM_MP_STATE_HALTED
);
551 static void kvm_arch_load_mpstate(CPUState
*env
)
553 #ifdef KVM_CAP_MP_STATE
554 struct kvm_mp_state mp_state
;
557 * -1 indicates that the host did not support GET_MP_STATE ioctl,
560 if (env
->mp_state
!= -1) {
561 mp_state
.mp_state
= env
->mp_state
;
562 kvm_set_mpstate(env
, &mp_state
);
567 static void kvm_reset_mpstate(CPUState
*env
)
569 #ifdef KVM_CAP_MP_STATE
570 if (kvm_check_extension(kvm_state
, KVM_CAP_MP_STATE
)) {
571 if (kvm_irqchip_in_kernel()) {
572 env
->mp_state
= cpu_is_bsp(env
) ? KVM_MP_STATE_RUNNABLE
:
573 KVM_MP_STATE_UNINITIALIZED
;
575 env
->mp_state
= KVM_MP_STATE_RUNNABLE
;
581 #define XSAVE_CWD_RIP 2
582 #define XSAVE_CWD_RDP 4
583 #define XSAVE_MXCSR 6
584 #define XSAVE_ST_SPACE 8
585 #define XSAVE_XMM_SPACE 40
586 #define XSAVE_XSTATE_BV 128
587 #define XSAVE_YMMH_SPACE 144
/*
 * kvm_arch_load_regs: push QEMU's cached x86 vcpu state (GPRs, MSRs,
 * mp_state, vcpu events, debug registers) into the kernel; 'level'
 * selects how much state to write (KVM_PUT_RESET_STATE /
 * KVM_PUT_FULL_STATE also sync mp_state resp. the VAPIC).
 * NOTE(review): this extraction is missing interior source lines (e.g.
 * the declarations and several put-calls between the visible fragments);
 * the stray numeric prefixes are extraction artifacts. Left byte-
 * identical — restore from upstream qemu-kvm before compiling.
 */
589 void kvm_arch_load_regs(CPUState
*env
, int level
)
/* must only run on the vcpu's own thread (or with the vcpu stopped) */
593 assert(kvm_cpu_is_stopped(env
) || env
->thread_id
== kvm_get_thread_id());
595 kvm_getput_regs(env
, 1);
602 rc
= kvm_put_msrs(env
, level
);
604 perror("kvm__msrs FAILED");
607 if (level
>= KVM_PUT_RESET_STATE
) {
608 kvm_arch_load_mpstate(env
);
611 if (level
== KVM_PUT_FULL_STATE
) {
612 if (env
->kvm_vcpu_update_vapic
) {
613 kvm_tpr_enable_vapic(env
);
617 kvm_put_vcpu_events(env
, level
);
618 kvm_put_debugregs(env
);
/* presumably must be last — TODO confirm against upstream comment */
621 kvm_guest_debug_workarounds(env
);
/*
 * kvm_arch_save_regs: pull the kernel vcpu's x86 state (GPRs, MSRs,
 * mp_state, vcpu events, debug registers) into QEMU's CPUState cache.
 * NOTE(review): this extraction is missing interior source lines
 * (declaration of 'rc' and likely further get-calls); the stray numeric
 * prefixes are extraction artifacts. Left byte-identical — restore from
 * upstream qemu-kvm before compiling.
 */
624 void kvm_arch_save_regs(CPUState
*env
)
/* must only run on the vcpu's own thread (or with the vcpu stopped) */
628 assert(kvm_cpu_is_stopped(env
) || env
->thread_id
== kvm_get_thread_id());
630 kvm_getput_regs(env
, 0);
637 rc
= kvm_get_msrs(env
);
639 perror("kvm_get_msrs FAILED");
642 kvm_arch_save_mpstate(env
);
644 kvm_get_vcpu_events(env
);
645 kvm_get_debugregs(env
);
648 static int _kvm_arch_init_vcpu(CPUState
*env
)
650 kvm_arch_reset_vcpu(env
);
652 #ifdef KVM_EXIT_TPR_ACCESS
653 kvm_enable_tpr_access_reporting(env
);
655 kvm_reset_mpstate(env
);
659 int kvm_arch_halt(CPUState
*env
)
662 if (!((env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
663 (env
->eflags
& IF_MASK
)) &&
664 !(env
->interrupt_request
& CPU_INTERRUPT_NMI
)) {
670 int kvm_arch_pre_run(CPUState
*env
, struct kvm_run
*run
)
672 if (!kvm_irqchip_in_kernel()) {
673 kvm_set_cr8(env
, cpu_get_apic_tpr(env
->apic_state
));
678 int kvm_arch_has_work(CPUState
*env
)
680 if (((env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
681 (env
->eflags
& IF_MASK
)) ||
682 (env
->interrupt_request
& CPU_INTERRUPT_NMI
)) {
688 int kvm_arch_try_push_interrupts(void *opaque
)
690 CPUState
*env
= cpu_single_env
;
693 if (kvm_is_ready_for_interrupt_injection(env
) &&
694 (env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
695 (env
->eflags
& IF_MASK
)) {
696 env
->interrupt_request
&= ~CPU_INTERRUPT_HARD
;
697 irq
= cpu_get_pic_interrupt(env
);
699 r
= kvm_inject_irq(env
, irq
);
701 printf("cpu %d fail inject %x\n", env
->cpu_index
, irq
);
706 return (env
->interrupt_request
& CPU_INTERRUPT_HARD
) != 0;
#ifdef KVM_CAP_USER_NMI
/*
 * Inject a pending NMI into the current vcpu (userspace-irqchip path).
 * Clears CPU_INTERRUPT_NMI; logs on injection failure.
 * Reconstructed from a mangled extraction — verify against upstream qemu-kvm.
 */
void kvm_arch_push_nmi(void *opaque)
{
    CPUState *env = cpu_single_env;
    int r;

    if (likely(!(env->interrupt_request & CPU_INTERRUPT_NMI))) {
        return;
    }

    env->interrupt_request &= ~CPU_INTERRUPT_NMI;
    r = kvm_inject_nmi(env);
    if (r < 0) {
        printf("cpu %d fail inject NMI\n", env->cpu_index);
    }
}
#endif /* KVM_CAP_USER_NMI */
/*
 * kvm_reset_msrs: walk the cached kvm_msr_list and write reset-default
 * values for each supported MSR into the vcpu via KVM_SET_MSRS.
 * The 0x0007040600070406ULL constant is the architectural reset value
 * of the fixed-range MTRRs — presumably applied to MSR_MTRRfix* indices;
 * TODO confirm, the selecting switch is among the missing lines.
 * NOTE(review): this extraction is missing many interior lines (the
 * msr_data struct definition, declarations of index/data, the per-MSR
 * switch, error handling); the stray numeric prefixes are extraction
 * artifacts. Left byte-identical — restore from upstream qemu-kvm
 * before compiling.
 */
727 static int kvm_reset_msrs(CPUState
*env
)
730 struct kvm_msrs info
;
731 struct kvm_msr_entry entries
[100];
734 struct kvm_msr_entry
*msrs
= msr_data
.entries
;
742 for (n
= 0; n
< kvm_msr_list
->nmsrs
; n
++) {
743 index
= kvm_msr_list
->indices
[n
];
/* fixed-range MTRR reset value (all ranges write-back) */
746 data
= 0x0007040600070406ULL
;
751 kvm_msr_entry_set(&msrs
[n
], kvm_msr_list
->indices
[n
], data
);
754 msr_data
.info
.nmsrs
= n
;
756 return kvm_vcpu_ioctl(env
, KVM_SET_MSRS
, &msr_data
);
760 void kvm_arch_cpu_reset(CPUState
*env
)
763 kvm_arch_reset_vcpu(env
);
764 kvm_reset_mpstate(env
);
767 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
768 void kvm_arch_do_ioperm(void *_data
)
770 struct ioperm_data
*data
= _data
;
771 ioperm(data
->start_port
, data
->num
, data
->turn_on
);
776 * Setup x86 specific IRQ routing
778 int kvm_arch_init_irq_routing(void)
782 if (kvm_irqchip
&& kvm_has_gsi_routing()) {
783 kvm_clear_gsi_routes();
784 for (i
= 0; i
< 8; ++i
) {
788 r
= kvm_add_irq_route(i
, KVM_IRQCHIP_PIC_MASTER
, i
);
793 for (i
= 8; i
< 16; ++i
) {
794 r
= kvm_add_irq_route(i
, KVM_IRQCHIP_PIC_SLAVE
, i
- 8);
799 for (i
= 0; i
< 24; ++i
) {
800 if (i
== 0 && irq0override
) {
801 r
= kvm_add_irq_route(i
, KVM_IRQCHIP_IOAPIC
, 2);
802 } else if (i
!= 2 || !irq0override
) {
803 r
= kvm_add_irq_route(i
, KVM_IRQCHIP_IOAPIC
, i
);
809 kvm_commit_irq_routes();
814 void kvm_arch_process_irqchip_events(CPUState
*env
)
816 if (env
->interrupt_request
& CPU_INTERRUPT_INIT
) {
817 kvm_cpu_synchronize_state(env
);
820 if (env
->interrupt_request
& CPU_INTERRUPT_SIPI
) {
821 kvm_cpu_synchronize_state(env
);