4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
9 #include "config-host.h"
15 #include "qemu-common.h"
26 #include <sys/utsname.h>
27 #include <sys/syscall.h>
29 #include <sys/ioctl.h>
31 #include <sys/prctl.h>
37 #define PR_MCE_KILL 33
41 #define BUS_MCEERR_AR 4
44 #define BUS_MCEERR_AO 5
47 #define EXPECTED_KVM_API_VERSION 12
49 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
50 #error libkvm: userspace and kernel version mismatch
55 int kvm_pit_reinject
= 1;
60 kvm_context_t kvm_context
;
62 pthread_mutex_t qemu_mutex
= PTHREAD_MUTEX_INITIALIZER
;
63 pthread_cond_t qemu_vcpu_cond
= PTHREAD_COND_INITIALIZER
;
64 pthread_cond_t qemu_system_cond
= PTHREAD_COND_INITIALIZER
;
65 pthread_cond_t qemu_pause_cond
= PTHREAD_COND_INITIALIZER
;
66 pthread_cond_t qemu_work_cond
= PTHREAD_COND_INITIALIZER
;
67 __thread CPUState
*current_env
;
69 static int qemu_system_ready
;
71 #define SIG_IPI (SIGRTMIN+4)
74 static int io_thread_fd
= -1;
75 static int io_thread_sigfd
= -1;
77 static CPUState
*kvm_debug_cpu_requested
;
79 static uint64_t phys_ram_size
;
81 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
82 /* The list of ioperm_data */
83 static QLIST_HEAD(, ioperm_data
) ioperm_head
;
86 //#define DEBUG_MEMREG
88 #define DPRINTF(fmt, args...) \
89 do { fprintf(stderr, "%s:%d " fmt , __func__, __LINE__, ##args); } while (0)
91 #define DPRINTF(fmt, args...) do {} while (0)
94 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
96 int kvm_abi
= EXPECTED_KVM_API_VERSION
;
99 #ifdef KVM_CAP_SET_GUEST_DEBUG
100 static int kvm_debug(CPUState
*env
,
101 struct kvm_debug_exit_arch
*arch_info
)
103 int handle
= kvm_arch_debug(arch_info
);
106 kvm_debug_cpu_requested
= env
;
/*
 * Report a VM exit reason that no case in the exit dispatcher recognizes.
 * Returns an error so the caller can abort the vcpu run loop.
 */
static int handle_unhandled(uint64_t reason)
{
    fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);
    /* NOTE(review): return statement lost in extraction; -EINVAL
     * reconstructed from upstream qemu-kvm — confirm. */
    return -EINVAL;
}
120 static inline void set_gsi(kvm_context_t kvm
, unsigned int gsi
)
122 uint32_t *bitmap
= kvm
->used_gsi_bitmap
;
124 if (gsi
< kvm
->max_gsi
)
125 bitmap
[gsi
/ 32] |= 1U << (gsi
% 32);
127 DPRINTF("Invalid GSI %u\n", gsi
);
130 static inline void clear_gsi(kvm_context_t kvm
, unsigned int gsi
)
132 uint32_t *bitmap
= kvm
->used_gsi_bitmap
;
134 if (gsi
< kvm
->max_gsi
)
135 bitmap
[gsi
/ 32] &= ~(1U << (gsi
% 32));
137 DPRINTF("Invalid GSI %u\n", gsi
);
141 unsigned long phys_addr
;
143 unsigned long userspace_addr
;
148 struct slot_info slots
[KVM_MAX_NUM_MEM_REGIONS
];
150 static void init_slots(void)
154 for (i
= 0; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
)
158 static int get_free_slot(kvm_context_t kvm
)
163 #if defined(KVM_CAP_SET_TSS_ADDR) && !defined(__s390__)
164 tss_ext
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_SET_TSS_ADDR
);
170 * on older kernels where the set tss ioctl is not supprted we must save
171 * slot 0 to hold the extended memory, as the vmx will use the last 3
172 * pages of this slot.
179 for (; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
)
185 static void register_slot(int slot
, unsigned long phys_addr
,
186 unsigned long len
, unsigned long userspace_addr
,
189 slots
[slot
].phys_addr
= phys_addr
;
190 slots
[slot
].len
= len
;
191 slots
[slot
].userspace_addr
= userspace_addr
;
192 slots
[slot
].flags
= flags
;
195 static void free_slot(int slot
)
198 slots
[slot
].logging_count
= 0;
201 static int get_slot(unsigned long phys_addr
)
205 for (i
= 0; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
) {
206 if (slots
[i
].len
&& slots
[i
].phys_addr
<= phys_addr
&&
207 (slots
[i
].phys_addr
+ slots
[i
].len
- 1) >= phys_addr
)
213 /* Returns -1 if this slot is not totally contained on any other,
214 * and the number of the slot otherwise */
215 static int get_container_slot(uint64_t phys_addr
, unsigned long size
)
219 for (i
= 0; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
)
220 if (slots
[i
].len
&& slots
[i
].phys_addr
<= phys_addr
&&
221 (slots
[i
].phys_addr
+ slots
[i
].len
) >= phys_addr
+ size
)
226 int kvm_is_containing_region(kvm_context_t kvm
, unsigned long phys_addr
,
229 int slot
= get_container_slot(phys_addr
, size
);
236 * dirty pages logging control
238 static int kvm_dirty_pages_log_change(kvm_context_t kvm
,
239 unsigned long phys_addr
, unsigned flags
,
243 int slot
= get_slot(phys_addr
);
246 fprintf(stderr
, "BUG: %s: invalid parameters\n", __FUNCTION__
);
250 flags
= (slots
[slot
].flags
& ~mask
) | flags
;
251 if (flags
== slots
[slot
].flags
)
253 slots
[slot
].flags
= flags
;
256 struct kvm_userspace_memory_region mem
= {
258 .memory_size
= slots
[slot
].len
,
259 .guest_phys_addr
= slots
[slot
].phys_addr
,
260 .userspace_addr
= slots
[slot
].userspace_addr
,
261 .flags
= slots
[slot
].flags
,
265 DPRINTF("slot %d start %llx len %llx flags %x\n",
266 mem
.slot
, mem
.guest_phys_addr
, mem
.memory_size
, mem
.flags
);
267 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_USER_MEMORY_REGION
, &mem
);
269 fprintf(stderr
, "%s: %m\n", __FUNCTION__
);
274 static int kvm_dirty_pages_log_change_all(kvm_context_t kvm
,
275 int (*change
)(kvm_context_t kvm
,
281 for (i
= r
= 0; i
< KVM_MAX_NUM_MEM_REGIONS
&& r
== 0; i
++) {
283 r
= change(kvm
, slots
[i
].phys_addr
, slots
[i
].len
);
288 int kvm_dirty_pages_log_enable_slot(kvm_context_t kvm
, uint64_t phys_addr
,
291 int slot
= get_slot(phys_addr
);
293 DPRINTF("start %" PRIx64
" len %" PRIx64
"\n", phys_addr
, len
);
295 fprintf(stderr
, "BUG: %s: invalid parameters\n", __func__
);
299 if (slots
[slot
].logging_count
++)
302 return kvm_dirty_pages_log_change(kvm
, slots
[slot
].phys_addr
,
303 KVM_MEM_LOG_DIRTY_PAGES
,
304 KVM_MEM_LOG_DIRTY_PAGES
);
307 int kvm_dirty_pages_log_disable_slot(kvm_context_t kvm
, uint64_t phys_addr
,
310 int slot
= get_slot(phys_addr
);
313 fprintf(stderr
, "BUG: %s: invalid parameters\n", __func__
);
317 if (--slots
[slot
].logging_count
)
320 return kvm_dirty_pages_log_change(kvm
, slots
[slot
].phys_addr
, 0,
321 KVM_MEM_LOG_DIRTY_PAGES
);
325 * Enable dirty page logging for all memory regions
327 int kvm_dirty_pages_log_enable_all(kvm_context_t kvm
)
329 if (kvm
->dirty_pages_log_all
)
331 kvm
->dirty_pages_log_all
= 1;
332 return kvm_dirty_pages_log_change_all(kvm
, kvm_dirty_pages_log_enable_slot
);
336 * Enable dirty page logging only for memory regions that were created with
337 * dirty logging enabled (disable for all other memory regions).
339 int kvm_dirty_pages_log_reset(kvm_context_t kvm
)
341 if (!kvm
->dirty_pages_log_all
)
343 kvm
->dirty_pages_log_all
= 0;
344 return kvm_dirty_pages_log_change_all(kvm
,
345 kvm_dirty_pages_log_disable_slot
);
349 static int kvm_create_context(void);
351 int kvm_init(int smp_cpus
)
357 fd
= open("/dev/kvm", O_RDWR
);
359 perror("open /dev/kvm");
362 r
= ioctl(fd
, KVM_GET_API_VERSION
, 0);
365 "kvm kernel version too old: "
366 "KVM_GET_API_VERSION ioctl not supported\n");
369 if (r
< EXPECTED_KVM_API_VERSION
) {
370 fprintf(stderr
, "kvm kernel version too old: "
371 "We expect API version %d or newer, but got "
372 "version %d\n", EXPECTED_KVM_API_VERSION
, r
);
375 if (r
> EXPECTED_KVM_API_VERSION
) {
376 fprintf(stderr
, "kvm userspace version too old\n");
380 kvm_page_size
= getpagesize();
381 kvm_state
= qemu_mallocz(sizeof(*kvm_state
));
382 kvm_context
= &kvm_state
->kvm_context
;
385 kvm_state
->vmfd
= -1;
386 kvm_context
->opaque
= cpu_single_env
;
387 kvm_context
->dirty_pages_log_all
= 0;
388 kvm_context
->no_irqchip_creation
= 0;
389 kvm_context
->no_pit_creation
= 0;
391 #ifdef KVM_CAP_SET_GUEST_DEBUG
392 QTAILQ_INIT(&kvm_state
->kvm_sw_breakpoints
);
395 gsi_count
= kvm_get_gsi_count(kvm_context
);
399 /* Round up so we can search ints using ffs */
400 gsi_bits
= ALIGN(gsi_count
, 32);
401 kvm_context
->used_gsi_bitmap
= qemu_mallocz(gsi_bits
/ 8);
402 kvm_context
->max_gsi
= gsi_bits
;
404 /* Mark any over-allocated bits as already in use */
405 for (i
= gsi_count
; i
< gsi_bits
; i
++)
406 set_gsi(kvm_context
, i
);
409 kvm_cpu_register_phys_memory_client();
411 pthread_mutex_lock(&qemu_mutex
);
412 return kvm_create_context();
419 static void kvm_finalize(KVMState
*s
)
422 if (kvm->vcpu_fd[0] != -1)
423 close(kvm->vcpu_fd[0]);
424 if (kvm->vm_fd != -1)
431 void kvm_disable_irqchip_creation(kvm_context_t kvm
)
433 kvm
->no_irqchip_creation
= 1;
436 void kvm_disable_pit_creation(kvm_context_t kvm
)
438 kvm
->no_pit_creation
= 1;
441 static void kvm_reset_vcpu(void *opaque
)
443 CPUState
*env
= opaque
;
445 kvm_arch_cpu_reset(env
);
448 static void kvm_create_vcpu(CPUState
*env
, int id
)
452 KVMState
*s
= kvm_state
;
454 r
= kvm_vm_ioctl(kvm_state
, KVM_CREATE_VCPU
, id
);
456 fprintf(stderr
, "kvm_create_vcpu: %m\n");
457 fprintf(stderr
, "Failed to create vCPU. Check the -smp parameter.\n");
462 env
->kvm_state
= kvm_state
;
464 mmap_size
= kvm_ioctl(kvm_state
, KVM_GET_VCPU_MMAP_SIZE
, 0);
466 fprintf(stderr
, "get vcpu mmap size: %m\n");
470 mmap(NULL
, mmap_size
, PROT_READ
| PROT_WRITE
, MAP_SHARED
, env
->kvm_fd
,
472 if (env
->kvm_run
== MAP_FAILED
) {
473 fprintf(stderr
, "mmap vcpu area: %m\n");
477 #ifdef KVM_CAP_COALESCED_MMIO
478 if (s
->coalesced_mmio
&& !s
->coalesced_mmio_ring
)
479 s
->coalesced_mmio_ring
= (void *) env
->kvm_run
+
480 s
->coalesced_mmio
* PAGE_SIZE
;
483 r
= kvm_arch_init_vcpu(env
);
485 qemu_register_reset(kvm_reset_vcpu
, env
);
492 /* We're no good with semi-broken states. */
496 static int kvm_set_boot_vcpu_id(kvm_context_t kvm
, uint32_t id
)
498 #ifdef KVM_CAP_SET_BOOT_CPU_ID
499 int r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_SET_BOOT_CPU_ID
);
501 return kvm_vm_ioctl(kvm_state
, KVM_SET_BOOT_CPU_ID
, id
);
508 int kvm_create_vm(kvm_context_t kvm
)
511 #ifdef KVM_CAP_IRQ_ROUTING
512 kvm
->irq_routes
= qemu_mallocz(sizeof(*kvm
->irq_routes
));
513 kvm
->nr_allocated_irq_routes
= 0;
516 fd
= kvm_ioctl(kvm_state
, KVM_CREATE_VM
, 0);
518 fprintf(stderr
, "kvm_create_vm: %m\n");
521 kvm_state
->vmfd
= fd
;
525 static int kvm_create_default_phys_mem(kvm_context_t kvm
,
526 unsigned long phys_mem_bytes
,
529 #ifdef KVM_CAP_USER_MEMORY
530 int r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_USER_MEMORY
);
534 "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
536 #error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
541 void kvm_create_irqchip(kvm_context_t kvm
)
545 kvm
->irqchip_in_kernel
= 0;
546 #ifdef KVM_CAP_IRQCHIP
547 if (!kvm
->no_irqchip_creation
) {
548 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_IRQCHIP
);
549 if (r
> 0) { /* kernel irqchip supported */
550 r
= kvm_vm_ioctl(kvm_state
, KVM_CREATE_IRQCHIP
);
552 kvm
->irqchip_inject_ioctl
= KVM_IRQ_LINE
;
553 #if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
554 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
,
555 KVM_CAP_IRQ_INJECT_STATUS
);
557 kvm
->irqchip_inject_ioctl
= KVM_IRQ_LINE_STATUS
;
559 kvm
->irqchip_in_kernel
= 1;
561 fprintf(stderr
, "Create kernel PIC irqchip failed\n");
565 kvm_state
->irqchip_in_kernel
= kvm
->irqchip_in_kernel
;
568 int kvm_create(kvm_context_t kvm
, unsigned long phys_mem_bytes
, void **vm_mem
)
572 r
= kvm_create_vm(kvm
);
575 r
= kvm_arch_create(kvm
, phys_mem_bytes
, vm_mem
);
579 r
= kvm_create_default_phys_mem(kvm
, phys_mem_bytes
, vm_mem
);
582 kvm_create_irqchip(kvm
);
588 int kvm_register_phys_mem(kvm_context_t kvm
,
589 unsigned long phys_start
, void *userspace_addr
,
590 unsigned long len
, int log
)
593 struct kvm_userspace_memory_region memory
= {
595 .guest_phys_addr
= phys_start
,
596 .userspace_addr
= (unsigned long) (uintptr_t) userspace_addr
,
597 .flags
= log
? KVM_MEM_LOG_DIRTY_PAGES
: 0,
601 memory
.slot
= get_free_slot(kvm
);
603 ("memory: gpa: %llx, size: %llx, uaddr: %llx, slot: %x, flags: %x\n",
604 memory
.guest_phys_addr
, memory
.memory_size
, memory
.userspace_addr
,
605 memory
.slot
, memory
.flags
);
606 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_USER_MEMORY_REGION
, &memory
);
608 fprintf(stderr
, "create_userspace_phys_mem: %s\n", strerror(-r
));
611 register_slot(memory
.slot
, memory
.guest_phys_addr
, memory
.memory_size
,
612 memory
.userspace_addr
, memory
.flags
);
617 /* destroy/free a whole slot.
618 * phys_start, len and slot are the params passed to kvm_create_phys_mem()
620 void kvm_destroy_phys_mem(kvm_context_t kvm
, unsigned long phys_start
,
625 struct kvm_userspace_memory_region memory
= {
627 .guest_phys_addr
= phys_start
,
632 slot
= get_slot(phys_start
);
634 if ((slot
>= KVM_MAX_NUM_MEM_REGIONS
) || (slot
== -1)) {
635 fprintf(stderr
, "BUG: %s: invalid parameters (slot=%d)\n", __FUNCTION__
,
639 if (phys_start
!= slots
[slot
].phys_addr
) {
641 "WARNING: %s: phys_start is 0x%lx expecting 0x%lx\n",
642 __FUNCTION__
, phys_start
, slots
[slot
].phys_addr
);
643 phys_start
= slots
[slot
].phys_addr
;
647 DPRINTF("slot %d start %llx len %llx flags %x\n",
648 memory
.slot
, memory
.guest_phys_addr
, memory
.memory_size
,
650 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_USER_MEMORY_REGION
, &memory
);
652 fprintf(stderr
, "destroy_userspace_phys_mem: %s", strerror(-r
));
656 free_slot(memory
.slot
);
659 void kvm_unregister_memory_area(kvm_context_t kvm
, uint64_t phys_addr
,
663 int slot
= get_container_slot(phys_addr
, size
);
666 DPRINTF("Unregistering memory region %" PRIx64
" (%lx)\n", phys_addr
, size
);
667 kvm_destroy_phys_mem(kvm
, phys_addr
, size
);
672 static int kvm_get_map(kvm_context_t kvm
, int ioctl_num
, int slot
, void *buf
)
675 struct kvm_dirty_log log
= {
679 log
.dirty_bitmap
= buf
;
681 r
= kvm_vm_ioctl(kvm_state
, ioctl_num
, &log
);
687 int kvm_get_dirty_pages(kvm_context_t kvm
, unsigned long phys_addr
, void *buf
)
691 slot
= get_slot(phys_addr
);
692 return kvm_get_map(kvm
, KVM_GET_DIRTY_LOG
, slot
, buf
);
695 int kvm_get_dirty_pages_range(kvm_context_t kvm
, unsigned long phys_addr
,
696 unsigned long len
, void *opaque
,
697 int (*cb
)(unsigned long start
,
698 unsigned long len
, void *bitmap
,
703 unsigned long end_addr
= phys_addr
+ len
;
706 for (i
= 0; i
< KVM_MAX_NUM_MEM_REGIONS
; ++i
) {
707 if ((slots
[i
].len
&& (uint64_t) slots
[i
].phys_addr
>= phys_addr
)
708 && ((uint64_t) slots
[i
].phys_addr
+ slots
[i
].len
<= end_addr
)) {
709 buf
= qemu_malloc(BITMAP_SIZE(slots
[i
].len
));
710 r
= kvm_get_map(kvm
, KVM_GET_DIRTY_LOG
, i
, buf
);
715 r
= cb(slots
[i
].phys_addr
, slots
[i
].len
, buf
, opaque
);
724 #ifdef KVM_CAP_IRQCHIP
726 int kvm_set_irq_level(kvm_context_t kvm
, int irq
, int level
, int *status
)
728 struct kvm_irq_level event
;
731 if (!kvm
->irqchip_in_kernel
)
735 r
= kvm_vm_ioctl(kvm_state
, kvm
->irqchip_inject_ioctl
, &event
);
737 perror("kvm_set_irq_level");
740 #ifdef KVM_CAP_IRQ_INJECT_STATUS
742 (kvm
->irqchip_inject_ioctl
== KVM_IRQ_LINE
) ? 1 : event
.status
;
751 int kvm_get_irqchip(kvm_context_t kvm
, struct kvm_irqchip
*chip
)
755 if (!kvm
->irqchip_in_kernel
)
757 r
= kvm_vm_ioctl(kvm_state
, KVM_GET_IRQCHIP
, chip
);
759 perror("kvm_get_irqchip\n");
764 int kvm_set_irqchip(kvm_context_t kvm
, struct kvm_irqchip
*chip
)
768 if (!kvm
->irqchip_in_kernel
)
770 r
= kvm_vm_ioctl(kvm_state
, KVM_SET_IRQCHIP
, chip
);
772 perror("kvm_set_irqchip\n");
779 static int handle_debug(CPUState
*env
)
781 #ifdef KVM_CAP_SET_GUEST_DEBUG
782 struct kvm_run
*run
= env
->kvm_run
;
784 return kvm_debug(env
, &run
->debug
.arch
);
790 int kvm_get_regs(CPUState
*env
, struct kvm_regs
*regs
)
792 return kvm_vcpu_ioctl(env
, KVM_GET_REGS
, regs
);
795 int kvm_set_regs(CPUState
*env
, struct kvm_regs
*regs
)
797 return kvm_vcpu_ioctl(env
, KVM_SET_REGS
, regs
);
800 int kvm_get_fpu(CPUState
*env
, struct kvm_fpu
*fpu
)
802 return kvm_vcpu_ioctl(env
, KVM_GET_FPU
, fpu
);
805 int kvm_set_fpu(CPUState
*env
, struct kvm_fpu
*fpu
)
807 return kvm_vcpu_ioctl(env
, KVM_SET_FPU
, fpu
);
810 int kvm_get_sregs(CPUState
*env
, struct kvm_sregs
*sregs
)
812 return kvm_vcpu_ioctl(env
, KVM_GET_SREGS
, sregs
);
815 int kvm_set_sregs(CPUState
*env
, struct kvm_sregs
*sregs
)
817 return kvm_vcpu_ioctl(env
, KVM_SET_SREGS
, sregs
);
820 #ifdef KVM_CAP_MP_STATE
821 int kvm_get_mpstate(CPUState
*env
, struct kvm_mp_state
*mp_state
)
825 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_MP_STATE
);
827 return kvm_vcpu_ioctl(env
, KVM_GET_MP_STATE
, mp_state
);
831 int kvm_set_mpstate(CPUState
*env
, struct kvm_mp_state
*mp_state
)
835 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_MP_STATE
);
837 return kvm_vcpu_ioctl(env
, KVM_SET_MP_STATE
, mp_state
);
842 static int handle_mmio(CPUState
*env
)
844 unsigned long addr
= env
->kvm_run
->mmio
.phys_addr
;
845 struct kvm_run
*kvm_run
= env
->kvm_run
;
846 void *data
= kvm_run
->mmio
.data
;
848 /* hack: Red Hat 7.1 generates these weird accesses. */
849 if ((addr
> 0xa0000 - 4 && addr
<= 0xa0000) && kvm_run
->mmio
.len
== 3)
852 cpu_physical_memory_rw(addr
, data
, kvm_run
->mmio
.len
, kvm_run
->mmio
.is_write
);
856 int handle_io_window(kvm_context_t kvm
)
861 int handle_shutdown(kvm_context_t kvm
, CPUState
*env
)
863 /* stop the current vcpu from going back to guest mode */
866 qemu_system_reset_request();
870 static inline void push_nmi(kvm_context_t kvm
)
872 #ifdef KVM_CAP_USER_NMI
873 kvm_arch_push_nmi(kvm
->opaque
);
874 #endif /* KVM_CAP_USER_NMI */
877 void post_kvm_run(kvm_context_t kvm
, CPUState
*env
)
879 pthread_mutex_lock(&qemu_mutex
);
880 kvm_arch_post_run(env
, env
->kvm_run
);
881 cpu_single_env
= env
;
884 int pre_kvm_run(kvm_context_t kvm
, CPUState
*env
)
886 kvm_arch_pre_run(env
, env
->kvm_run
);
888 if (env
->kvm_vcpu_dirty
) {
889 kvm_arch_load_regs(env
, KVM_PUT_RUNTIME_STATE
);
890 env
->kvm_vcpu_dirty
= 0;
893 pthread_mutex_unlock(&qemu_mutex
);
897 int kvm_is_ready_for_interrupt_injection(CPUState
*env
)
899 return env
->kvm_run
->ready_for_interrupt_injection
;
902 int kvm_run(CPUState
*env
)
905 kvm_context_t kvm
= &env
->kvm_state
->kvm_context
;
906 struct kvm_run
*run
= env
->kvm_run
;
907 int fd
= env
->kvm_fd
;
911 #if !defined(__s390__)
912 if (!kvm
->irqchip_in_kernel
)
913 run
->request_interrupt_window
= kvm_arch_try_push_interrupts(env
);
916 r
= pre_kvm_run(kvm
, env
);
919 r
= ioctl(fd
, KVM_RUN
, 0);
921 if (r
== -1 && errno
!= EINTR
&& errno
!= EAGAIN
) {
923 post_kvm_run(kvm
, env
);
924 fprintf(stderr
, "kvm_run: %s\n", strerror(-r
));
928 post_kvm_run(kvm
, env
);
930 kvm_flush_coalesced_mmio_buffer();
932 #if !defined(__s390__)
934 r
= handle_io_window(kvm
);
939 switch (run
->exit_reason
) {
940 case KVM_EXIT_UNKNOWN
:
941 r
= handle_unhandled(run
->hw
.hardware_exit_reason
);
943 case KVM_EXIT_FAIL_ENTRY
:
944 r
= handle_unhandled(run
->fail_entry
.hardware_entry_failure_reason
);
946 case KVM_EXIT_EXCEPTION
:
947 fprintf(stderr
, "exception %d (%x)\n", run
->ex
.exception
,
954 r
= kvm_handle_io(run
->io
.port
,
955 (uint8_t *)run
+ run
->io
.data_offset
,
962 r
= handle_debug(env
);
965 r
= handle_mmio(env
);
968 r
= kvm_arch_halt(env
);
970 case KVM_EXIT_IRQ_WINDOW_OPEN
:
972 case KVM_EXIT_SHUTDOWN
:
973 r
= handle_shutdown(kvm
, env
);
975 #if defined(__s390__)
976 case KVM_EXIT_S390_SIEIC
:
977 r
= kvm_s390_handle_intercept(kvm
, env
, run
);
979 case KVM_EXIT_S390_RESET
:
980 r
= kvm_s390_handle_reset(kvm
, env
, run
);
983 case KVM_EXIT_INTERNAL_ERROR
:
984 kvm_handle_internal_error(env
, run
);
988 if (kvm_arch_run(env
)) {
989 fprintf(stderr
, "unhandled vm exit: 0x%x\n", run
->exit_reason
);
1002 int kvm_inject_irq(CPUState
*env
, unsigned irq
)
1004 struct kvm_interrupt intr
;
1007 return kvm_vcpu_ioctl(env
, KVM_INTERRUPT
, &intr
);
1010 int kvm_inject_nmi(CPUState
*env
)
1012 #ifdef KVM_CAP_USER_NMI
1013 return kvm_vcpu_ioctl(env
, KVM_NMI
);
1019 int kvm_init_coalesced_mmio(kvm_context_t kvm
)
1022 kvm_state
->coalesced_mmio
= 0;
1023 #ifdef KVM_CAP_COALESCED_MMIO
1024 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_COALESCED_MMIO
);
1026 kvm_state
->coalesced_mmio
= r
;
1033 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
1034 int kvm_assign_pci_device(kvm_context_t kvm
,
1035 struct kvm_assigned_pci_dev
*assigned_dev
)
1037 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_PCI_DEVICE
, assigned_dev
);
1040 static int kvm_old_assign_irq(kvm_context_t kvm
,
1041 struct kvm_assigned_irq
*assigned_irq
)
1043 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_IRQ
, assigned_irq
);
1046 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
1047 int kvm_assign_irq(kvm_context_t kvm
, struct kvm_assigned_irq
*assigned_irq
)
1051 ret
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_ASSIGN_DEV_IRQ
);
1053 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_DEV_IRQ
, assigned_irq
);
1056 return kvm_old_assign_irq(kvm
, assigned_irq
);
1059 int kvm_deassign_irq(kvm_context_t kvm
, struct kvm_assigned_irq
*assigned_irq
)
1061 return kvm_vm_ioctl(kvm_state
, KVM_DEASSIGN_DEV_IRQ
, assigned_irq
);
1064 int kvm_assign_irq(kvm_context_t kvm
, struct kvm_assigned_irq
*assigned_irq
)
1066 return kvm_old_assign_irq(kvm
, assigned_irq
);
1071 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
1072 int kvm_deassign_pci_device(kvm_context_t kvm
,
1073 struct kvm_assigned_pci_dev
*assigned_dev
)
1075 return kvm_vm_ioctl(kvm_state
, KVM_DEASSIGN_PCI_DEVICE
, assigned_dev
);
1079 int kvm_destroy_memory_region_works(kvm_context_t kvm
)
1083 #ifdef KVM_CAP_DESTROY_MEMORY_REGION_WORKS
1085 kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
,
1086 KVM_CAP_DESTROY_MEMORY_REGION_WORKS
);
1093 int kvm_reinject_control(kvm_context_t kvm
, int pit_reinject
)
1095 #ifdef KVM_CAP_REINJECT_CONTROL
1097 struct kvm_reinject_control control
;
1099 control
.pit_reinject
= pit_reinject
;
1101 r
= kvm_ioctl(kvm_state
, KVM_CHECK_EXTENSION
, KVM_CAP_REINJECT_CONTROL
);
1103 return kvm_vm_ioctl(kvm_state
, KVM_REINJECT_CONTROL
, &control
);
1109 int kvm_has_gsi_routing(kvm_context_t kvm
)
1113 #ifdef KVM_CAP_IRQ_ROUTING
1114 r
= kvm_check_extension(kvm_state
, KVM_CAP_IRQ_ROUTING
);
1119 int kvm_get_gsi_count(kvm_context_t kvm
)
1121 #ifdef KVM_CAP_IRQ_ROUTING
1122 return kvm_check_extension(kvm_state
, KVM_CAP_IRQ_ROUTING
);
1128 int kvm_clear_gsi_routes(kvm_context_t kvm
)
1130 #ifdef KVM_CAP_IRQ_ROUTING
1131 kvm
->irq_routes
->nr
= 0;
1138 int kvm_add_routing_entry(kvm_context_t kvm
,
1139 struct kvm_irq_routing_entry
*entry
)
1141 #ifdef KVM_CAP_IRQ_ROUTING
1142 struct kvm_irq_routing
*z
;
1143 struct kvm_irq_routing_entry
*new;
1146 if (kvm
->irq_routes
->nr
== kvm
->nr_allocated_irq_routes
) {
1147 n
= kvm
->nr_allocated_irq_routes
* 2;
1150 size
= sizeof(struct kvm_irq_routing
);
1151 size
+= n
* sizeof(*new);
1152 z
= realloc(kvm
->irq_routes
, size
);
1155 kvm
->nr_allocated_irq_routes
= n
;
1156 kvm
->irq_routes
= z
;
1158 n
= kvm
->irq_routes
->nr
++;
1159 new = &kvm
->irq_routes
->entries
[n
];
1160 memset(new, 0, sizeof(*new));
1161 new->gsi
= entry
->gsi
;
1162 new->type
= entry
->type
;
1163 new->flags
= entry
->flags
;
1166 set_gsi(kvm
, entry
->gsi
);
1174 int kvm_add_irq_route(kvm_context_t kvm
, int gsi
, int irqchip
, int pin
)
1176 #ifdef KVM_CAP_IRQ_ROUTING
1177 struct kvm_irq_routing_entry e
;
1180 e
.type
= KVM_IRQ_ROUTING_IRQCHIP
;
1182 e
.u
.irqchip
.irqchip
= irqchip
;
1183 e
.u
.irqchip
.pin
= pin
;
1184 return kvm_add_routing_entry(kvm
, &e
);
1190 int kvm_del_routing_entry(kvm_context_t kvm
,
1191 struct kvm_irq_routing_entry
*entry
)
1193 #ifdef KVM_CAP_IRQ_ROUTING
1194 struct kvm_irq_routing_entry
*e
, *p
;
1195 int i
, gsi
, found
= 0;
1199 for (i
= 0; i
< kvm
->irq_routes
->nr
; ++i
) {
1200 e
= &kvm
->irq_routes
->entries
[i
];
1201 if (e
->type
== entry
->type
&& e
->gsi
== gsi
) {
1203 case KVM_IRQ_ROUTING_IRQCHIP
:{
1204 if (e
->u
.irqchip
.irqchip
==
1205 entry
->u
.irqchip
.irqchip
1206 && e
->u
.irqchip
.pin
== entry
->u
.irqchip
.pin
) {
1207 p
= &kvm
->irq_routes
->entries
[--kvm
->irq_routes
->nr
];
1213 case KVM_IRQ_ROUTING_MSI
:{
1214 if (e
->u
.msi
.address_lo
==
1215 entry
->u
.msi
.address_lo
1216 && e
->u
.msi
.address_hi
==
1217 entry
->u
.msi
.address_hi
1218 && e
->u
.msi
.data
== entry
->u
.msi
.data
) {
1219 p
= &kvm
->irq_routes
->entries
[--kvm
->irq_routes
->nr
];
1229 /* If there are no other users of this GSI
1230 * mark it available in the bitmap */
1231 for (i
= 0; i
< kvm
->irq_routes
->nr
; i
++) {
1232 e
= &kvm
->irq_routes
->entries
[i
];
1236 if (i
== kvm
->irq_routes
->nr
)
1237 clear_gsi(kvm
, gsi
);
1249 int kvm_update_routing_entry(kvm_context_t kvm
,
1250 struct kvm_irq_routing_entry
*entry
,
1251 struct kvm_irq_routing_entry
*newentry
)
1253 #ifdef KVM_CAP_IRQ_ROUTING
1254 struct kvm_irq_routing_entry
*e
;
1257 if (entry
->gsi
!= newentry
->gsi
|| entry
->type
!= newentry
->type
) {
1261 for (i
= 0; i
< kvm
->irq_routes
->nr
; ++i
) {
1262 e
= &kvm
->irq_routes
->entries
[i
];
1263 if (e
->type
!= entry
->type
|| e
->gsi
!= entry
->gsi
) {
1267 case KVM_IRQ_ROUTING_IRQCHIP
:
1268 if (e
->u
.irqchip
.irqchip
== entry
->u
.irqchip
.irqchip
&&
1269 e
->u
.irqchip
.pin
== entry
->u
.irqchip
.pin
) {
1270 memcpy(&e
->u
.irqchip
, &newentry
->u
.irqchip
,
1271 sizeof e
->u
.irqchip
);
1275 case KVM_IRQ_ROUTING_MSI
:
1276 if (e
->u
.msi
.address_lo
== entry
->u
.msi
.address_lo
&&
1277 e
->u
.msi
.address_hi
== entry
->u
.msi
.address_hi
&&
1278 e
->u
.msi
.data
== entry
->u
.msi
.data
) {
1279 memcpy(&e
->u
.msi
, &newentry
->u
.msi
, sizeof e
->u
.msi
);
1293 int kvm_del_irq_route(kvm_context_t kvm
, int gsi
, int irqchip
, int pin
)
1295 #ifdef KVM_CAP_IRQ_ROUTING
1296 struct kvm_irq_routing_entry e
;
1299 e
.type
= KVM_IRQ_ROUTING_IRQCHIP
;
1301 e
.u
.irqchip
.irqchip
= irqchip
;
1302 e
.u
.irqchip
.pin
= pin
;
1303 return kvm_del_routing_entry(kvm
, &e
);
1309 int kvm_commit_irq_routes(kvm_context_t kvm
)
1311 #ifdef KVM_CAP_IRQ_ROUTING
1312 kvm
->irq_routes
->flags
= 0;
1313 return kvm_vm_ioctl(kvm_state
, KVM_SET_GSI_ROUTING
, kvm
->irq_routes
);
1319 int kvm_get_irq_route_gsi(kvm_context_t kvm
)
1322 uint32_t *buf
= kvm
->used_gsi_bitmap
;
1324 /* Return the lowest unused GSI in the bitmap */
1325 for (i
= 0; i
< kvm
->max_gsi
/ 32; i
++) {
1330 return bit
- 1 + i
* 32;
1336 #ifdef KVM_CAP_DEVICE_MSIX
1337 int kvm_assign_set_msix_nr(kvm_context_t kvm
,
1338 struct kvm_assigned_msix_nr
*msix_nr
)
1340 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_SET_MSIX_NR
, msix_nr
);
1343 int kvm_assign_set_msix_entry(kvm_context_t kvm
,
1344 struct kvm_assigned_msix_entry
*entry
)
1346 return kvm_vm_ioctl(kvm_state
, KVM_ASSIGN_SET_MSIX_ENTRY
, entry
);
1350 #if defined(KVM_CAP_IRQFD) && defined(CONFIG_EVENTFD)
1352 #include <sys/eventfd.h>
1354 static int _kvm_irqfd(kvm_context_t kvm
, int fd
, int gsi
, int flags
)
1356 struct kvm_irqfd data
= {
1362 return kvm_vm_ioctl(kvm_state
, KVM_IRQFD
, &data
);
1365 int kvm_irqfd(kvm_context_t kvm
, int gsi
, int flags
)
1370 if (!kvm_check_extension(kvm_state
, KVM_CAP_IRQFD
))
1377 r
= _kvm_irqfd(kvm
, fd
, gsi
, 0);
1386 #else /* KVM_CAP_IRQFD */
1388 int kvm_irqfd(kvm_context_t kvm
, int gsi
, int flags
)
1393 #endif /* KVM_CAP_IRQFD */
/*
 * Return the calling thread's kernel TID. gettid() had no glibc wrapper
 * when this was written, hence the raw syscall.
 */
unsigned long kvm_get_thread_id(void)
{
    return syscall(SYS_gettid);
}
1399 static void qemu_cond_wait(pthread_cond_t
*cond
)
1401 CPUState
*env
= cpu_single_env
;
1403 pthread_cond_wait(cond
, &qemu_mutex
);
1404 cpu_single_env
= env
;
1407 static void sig_ipi_handler(int n
)
1411 static void hardware_memory_error(void)
1413 fprintf(stderr
, "Hardware memory error!\n");
1417 static void sigbus_reraise(void)
1420 struct sigaction action
;
1422 memset(&action
, 0, sizeof(action
));
1423 action
.sa_handler
= SIG_DFL
;
1424 if (!sigaction(SIGBUS
, &action
, NULL
)) {
1427 sigaddset(&set
, SIGBUS
);
1428 sigprocmask(SIG_UNBLOCK
, &set
, NULL
);
1430 perror("Failed to re-raise SIGBUS!\n");
1434 static void sigbus_handler(int n
, struct qemu_signalfd_siginfo
*siginfo
,
1437 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1438 if (first_cpu
->mcg_cap
&& siginfo
->ssi_addr
1439 && siginfo
->ssi_code
== BUS_MCEERR_AO
) {
1441 unsigned long paddr
;
1444 /* Hope we are lucky for AO MCE */
1445 if (do_qemu_ram_addr_from_host((void *)(intptr_t)siginfo
->ssi_addr
,
1447 fprintf(stderr
, "Hardware memory error for memory used by "
1448 "QEMU itself instead of guest system!: %llx\n",
1449 (unsigned long long)siginfo
->ssi_addr
);
1452 status
= MCI_STATUS_VAL
| MCI_STATUS_UC
| MCI_STATUS_EN
1453 | MCI_STATUS_MISCV
| MCI_STATUS_ADDRV
| MCI_STATUS_S
1455 kvm_inject_x86_mce(first_cpu
, 9, status
,
1456 MCG_STATUS_MCIP
| MCG_STATUS_RIPV
, paddr
,
1457 (MCM_ADDR_PHYS
<< 6) | 0xc, 1);
1458 for (cenv
= first_cpu
->next_cpu
; cenv
!= NULL
; cenv
= cenv
->next_cpu
)
1459 kvm_inject_x86_mce(cenv
, 1, MCI_STATUS_VAL
| MCI_STATUS_UC
,
1460 MCG_STATUS_MCIP
| MCG_STATUS_RIPV
, 0, 0, 1);
1464 if (siginfo
->ssi_code
== BUS_MCEERR_AO
)
1466 else if (siginfo
->ssi_code
== BUS_MCEERR_AR
)
1467 hardware_memory_error();
1473 static void on_vcpu(CPUState
*env
, void (*func
)(void *data
), void *data
)
1475 struct qemu_work_item wi
;
1477 if (env
== current_env
) {
1484 if (!env
->kvm_cpu_state
.queued_work_first
)
1485 env
->kvm_cpu_state
.queued_work_first
= &wi
;
1487 env
->kvm_cpu_state
.queued_work_last
->next
= &wi
;
1488 env
->kvm_cpu_state
.queued_work_last
= &wi
;
1492 pthread_kill(env
->kvm_cpu_state
.thread
, SIG_IPI
);
1494 qemu_cond_wait(&qemu_work_cond
);
1497 static void do_kvm_cpu_synchronize_state(void *_env
)
1499 CPUState
*env
= _env
;
1501 if (!env
->kvm_vcpu_dirty
) {
1502 kvm_arch_save_regs(env
);
1503 env
->kvm_vcpu_dirty
= 1;
1507 void kvm_cpu_synchronize_state(CPUState
*env
)
1509 if (!env
->kvm_vcpu_dirty
)
1510 on_vcpu(env
, do_kvm_cpu_synchronize_state
, env
);
1513 void kvm_cpu_synchronize_post_reset(CPUState
*env
)
1515 kvm_arch_load_regs(env
, KVM_PUT_RESET_STATE
);
1516 env
->kvm_vcpu_dirty
= 0;
1519 void kvm_cpu_synchronize_post_init(CPUState
*env
)
1521 kvm_arch_load_regs(env
, KVM_PUT_FULL_STATE
);
1522 env
->kvm_vcpu_dirty
= 0;
1525 static void inject_interrupt(void *data
)
1527 cpu_interrupt(current_env
, (long) data
);
1530 void kvm_inject_interrupt(CPUState
*env
, int mask
)
1532 on_vcpu(env
, inject_interrupt
, (void *) (long) mask
);
1535 void kvm_update_interrupt_request(CPUState
*env
)
1540 if (!current_env
|| !current_env
->created
)
1543 * Testing for created here is really redundant
1545 if (current_env
&& current_env
->created
&&
1546 env
!= current_env
&& !env
->kvm_cpu_state
.signalled
)
1550 env
->kvm_cpu_state
.signalled
= 1;
1551 if (env
->kvm_cpu_state
.thread
)
1552 pthread_kill(env
->kvm_cpu_state
.thread
, SIG_IPI
);
1557 int kvm_cpu_exec(CPUState
*env
)
1563 printf("kvm_run returned %d\n", r
);
1570 int kvm_cpu_is_stopped(CPUState
*env
)
1572 return !vm_running
|| env
->stopped
;
1575 static void flush_queued_work(CPUState
*env
)
1577 struct qemu_work_item
*wi
;
1579 if (!env
->kvm_cpu_state
.queued_work_first
)
1582 while ((wi
= env
->kvm_cpu_state
.queued_work_first
)) {
1583 env
->kvm_cpu_state
.queued_work_first
= wi
->next
;
1587 env
->kvm_cpu_state
.queued_work_last
= NULL
;
1588 pthread_cond_broadcast(&qemu_work_cond
);
1591 static int kvm_mce_in_exception(CPUState
*env
)
1593 struct kvm_msr_entry msr_mcg_status
= {
1594 .index
= MSR_MCG_STATUS
,
1598 r
= kvm_get_msrs(env
, &msr_mcg_status
, 1);
1599 if (r
== -1 || r
== 0)
1601 return !!(msr_mcg_status
.data
& MCG_STATUS_MCIP
);
/* SIGBUS handler for a vCPU thread: translate a host hardware memory error
 * (BUS_MCEERR_AR = action-required, BUS_MCEERR_AO = action-optional) into a
 * guest-visible machine-check exception when the guest has MCE support
 * (env->mcg_cap), or abort via hardware_memory_error() when the fault is in
 * QEMU's own memory.
 * NOTE(review): mangled fragment — declarations (e.g. `int r;`), several
 * closing braces, returns, and parts of the control flow are missing from
 * this view; fused numbers are original line numbers. */
1604 static void kvm_on_sigbus(CPUState
*env
, siginfo_t
*siginfo
)
1606 #if defined(KVM_CAP_MCE) && defined(TARGET_I386)
1607 struct kvm_x86_mce mce
= {
1610 unsigned long paddr
;
/* Only synthesize an MCE when the guest advertises MCE capability and the
 * siginfo carries a faulting address with a memory-error code. */
1613 if (env
->mcg_cap
&& siginfo
->si_addr
1614 && (siginfo
->si_code
== BUS_MCEERR_AR
1615 || siginfo
->si_code
== BUS_MCEERR_AO
)) {
1616 if (siginfo
->si_code
== BUS_MCEERR_AR
) {
1617 /* Fake an Intel architectural Data Load SRAR UCR */
1618 mce
.status
= MCI_STATUS_VAL
| MCI_STATUS_UC
| MCI_STATUS_EN
1619 | MCI_STATUS_MISCV
| MCI_STATUS_ADDRV
| MCI_STATUS_S
1620 | MCI_STATUS_AR
| 0x134;
1621 mce
.misc
= (MCM_ADDR_PHYS
<< 6) | 0xc;
1622 mce
.mcg_status
= MCG_STATUS_MCIP
| MCG_STATUS_EIPV
;
1625 * If there is an MCE exception being processed, ignore
1628 r
= kvm_mce_in_exception(env
);
1630 fprintf(stderr
, "Failed to get MCE status\n");
1633 /* Fake an Intel architectural Memory scrubbing UCR */
1634 mce
.status
= MCI_STATUS_VAL
| MCI_STATUS_UC
| MCI_STATUS_EN
1635 | MCI_STATUS_MISCV
| MCI_STATUS_ADDRV
| MCI_STATUS_S
1637 mce
.misc
= (MCM_ADDR_PHYS
<< 6) | 0xc;
1638 mce
.mcg_status
= MCG_STATUS_MCIP
| MCG_STATUS_RIPV
;
/* Map the host virtual fault address back to a guest RAM address; failure
 * means the error hit QEMU's own memory, not guest RAM.
 * NOTE(review): "instaed" below is a typo inside a runtime string literal;
 * left untouched here (a doc-only edit must not alter emitted text). */
1640 if (do_qemu_ram_addr_from_host((void *)siginfo
->si_addr
, &paddr
)) {
1641 fprintf(stderr
, "Hardware memory error for memory used by "
1642 "QEMU itself instaed of guest system!\n");
1643 /* Hope we are lucky for AO MCE */
1644 if (siginfo
->si_code
== BUS_MCEERR_AO
)
1647 hardware_memory_error();
/* Inject the synthesized MCE into the vCPU. */
1650 r
= kvm_set_mce(env
, &mce
);
1652 fprintf(stderr
, "kvm_set_mce: %s\n", strerror(errno
));
/* Fallback path (no guest MCE support): AO errors are survivable, AR are not. */
1658 if (siginfo
->si_code
== BUS_MCEERR_AO
)
1660 else if (siginfo
->si_code
== BUS_MCEERR_AR
)
1661 hardware_memory_error();
/* Per-vCPU idle wait: drop qemu_mutex, sigtimedwait() for SIG_IPI/SIGBUS up
 * to @timeout milliseconds, re-take the mutex, handle any SIGBUS via
 * kvm_on_sigbus(), then flush queued work and clear pause/signal state.
 * NOTE(review): mangled fragment — local declarations (ts, waitset, siginfo,
 * r, e, chkset), the do/while skeleton, and the pause handling around line
 * 1715 are missing from this view. */
1667 static void kvm_main_loop_wait(CPUState
*env
, int timeout
)
/* Convert the millisecond timeout into a timespec for sigtimedwait(). */
1675 ts
.tv_sec
= timeout
/ 1000;
1676 ts
.tv_nsec
= (timeout
% 1000) * 1000000;
1677 sigemptyset(&waitset
);
1678 sigaddset(&waitset
, SIG_IPI
);
1679 sigaddset(&waitset
, SIGBUS
);
/* Must not hold the global mutex while sleeping in sigtimedwait(). */
1682 pthread_mutex_unlock(&qemu_mutex
);
1684 r
= sigtimedwait(&waitset
, &siginfo
, &ts
);
1687 pthread_mutex_lock(&qemu_mutex
);
/* EAGAIN (timeout) and EINTR are expected; anything else is reported. */
1689 if (r
== -1 && !(e
== EAGAIN
|| e
== EINTR
)) {
1690 printf("sigtimedwait: %s\n", strerror(e
));
1696 kvm_on_sigbus(env
, &siginfo
);
/* Loop until no SIG_IPI/SIGBUS remains pending, so none are lost. */
1702 r
= sigpending(&chkset
);
1704 printf("sigpending: %s\n", strerror(e
));
1707 } while (sigismember(&chkset
, SIG_IPI
) || sigismember(&chkset
, SIGBUS
));
1709 cpu_single_env
= env
;
1710 flush_queued_work(env
);
/* Wake pause_all_threads() waiters once this vCPU has parked itself. */
1715 pthread_cond_signal(&qemu_pause_cond
);
1718 env
->kvm_cpu_state
.signalled
= 0;
/* Return nonzero when every vCPU has acknowledged a pause request.
 * NOTE(review): mangled fragment — the loop skeleton and the per-CPU test
 * plus both return statements are missing from this view. */
1721 static int all_threads_paused(void)
1723 CPUState
*penv
= first_cpu
;
/* Walk the global CPU list via the next_cpu chain. */
1728 penv
= (CPUState
*) penv
->next_cpu
;
/* Ask every other vCPU thread to pause (kick it with SIG_IPI), then block
 * on qemu_pause_cond until all_threads_paused() reports completion.
 * NOTE(review): mangled fragment — the loop skeleton and the lines that set
 * the per-CPU stop flags are missing from this view. */
1734 static void pause_all_threads(void)
1736 CPUState
*penv
= first_cpu
;
/* The calling thread (cpu_single_env) must not signal itself. */
1739 if (penv
!= cpu_single_env
) {
1741 pthread_kill(penv
->kvm_cpu_state
.thread
, SIG_IPI
);
1747 penv
= (CPUState
*) penv
->next_cpu
;
/* qemu_cond_wait releases qemu_mutex while sleeping, letting vCPUs park. */
1750 while (!all_threads_paused())
1751 qemu_cond_wait(&qemu_pause_cond
);
/* Wake every vCPU thread after a pause; must be called from the I/O thread
 * (asserted: no current vCPU context).
 * NOTE(review): mangled fragment — the loop skeleton and the lines clearing
 * the per-CPU stop/stopped flags are missing from this view. */
1754 static void resume_all_threads(void)
1756 CPUState
*penv
= first_cpu
;
1758 assert(!cpu_single_env
);
/* SIG_IPI kicks the thread out of sigtimedwait()/KVM_RUN. */
1763 pthread_kill(penv
->kvm_cpu_state
.thread
, SIG_IPI
);
1764 penv
= (CPUState
*) penv
->next_cpu
;
/*
 * VM run-state change hook (registered via qemu_add_vm_change_state_handler
 * in kvm_init_ap): when the machine transitions to "running", wake all vCPU
 * threads; on any stop transition, park them.
 *
 * @context and @reason are unused here.
 *
 * NOTE(review): the if/else skeleton was lost in extraction and has been
 * restored from the evident semantics (resume on running, pause otherwise).
 */
static void kvm_vm_state_change_handler(void *context, int running, int reason)
{
    if (running) {
        resume_all_threads();
    } else {
        pause_all_threads();
    }
}
1776 static void setup_kernel_sigmask(CPUState
*env
)
1781 sigaddset(&set
, SIGUSR2
);
1782 sigaddset(&set
, SIGIO
);
1783 sigaddset(&set
, SIGALRM
);
1784 sigprocmask(SIG_BLOCK
, &set
, NULL
);
1786 sigprocmask(SIG_BLOCK
, NULL
, &set
);
1787 sigdelset(&set
, SIG_IPI
);
1788 sigdelset(&set
, SIGBUS
);
1790 kvm_set_signal_mask(env
, &set
);
/*
 * qemu_kvm_system_reset: perform a full machine reset with all vCPU threads
 * quiesced.  Pausing first guarantees no vCPU is inside KVM_RUN while device
 * and CPU state are being reset.
 */
static void qemu_kvm_system_reset(void)
{
    pause_all_threads();

    qemu_system_reset();

    resume_all_threads();
}
/* Userspace-irqchip path: let the arch code process pending irqchip events,
 * then, if the vCPU now has work, un-halt it.
 * NOTE(review): mangled fragment — the body of the if (presumably clearing
 * env->halted) is missing from this view; verify against the full source. */
1802 static void process_irqchip_events(CPUState
*env
)
1804 kvm_arch_process_irqchip_events(env
);
1805 if (kvm_arch_has_work(env
))
/* Per-vCPU main loop: repeatedly decide whether the CPU may run, execute it,
 * and otherwise idle in kvm_main_loop_wait().
 * NOTE(review): mangled fragment — the enclosing while loop, the call into
 * kvm_cpu_exec(), and the return are missing from this view. */
1809 static int kvm_main_loop_cpu(CPUState
*env
)
1812 int run_cpu
= !kvm_cpu_is_stopped(env
);
/* With a userspace irqchip, interrupt events are processed here and a
 * halted CPU stays idle. */
1813 if (run_cpu
&& !kvm_irqchip_in_kernel()) {
1814 process_irqchip_events(env
);
1815 run_cpu
= !env
->halted
;
/* Runnable: poll (timeout 0); halted/stopped: sleep up to 1s. */
1819 kvm_main_loop_wait(env
, 0);
1821 kvm_main_loop_wait(env
, 1000);
/* Loop exit: release the global mutex before the thread winds down. */
1824 pthread_mutex_unlock(&qemu_mutex
);
/* Thread entry point for a vCPU (started from kvm_init_vcpu): set up signal
 * masks, create the kernel vCPU, signal creation back to the spawning
 * thread, wait for machine init, then enter kvm_main_loop_cpu().
 * NOTE(review): mangled fragment — the sigset_t declaration, #endif lines,
 * and the trailing return are missing from this view. */
1828 static void *ap_main_loop(void *_env
)
1830 CPUState
*env
= _env
;
1832 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1833 struct ioperm_data
*data
= NULL
;
1837 env
->thread_id
= kvm_get_thread_id();
/* Block everything; setup_kernel_sigmask() later opens selective holes. */
1838 sigfillset(&signals
);
1839 sigprocmask(SIG_BLOCK
, &signals
, NULL
);
1841 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
1842 /* do ioperm for io ports of assigned devices */
1843 QLIST_FOREACH(data
, &ioperm_head
, entries
)
1844 on_vcpu(env
, kvm_arch_do_ioperm
, data
);
/* The global mutex is held for the rest of this thread's setup. */
1847 pthread_mutex_lock(&qemu_mutex
);
1848 cpu_single_env
= env
;
1850 kvm_create_vcpu(env
, env
->cpu_index
);
1851 setup_kernel_sigmask(env
);
1853 /* signal VCPU creation */
1854 current_env
->created
= 1;
1855 pthread_cond_signal(&qemu_vcpu_cond
);
1857 /* and wait for machine initialization */
1858 while (!qemu_system_ready
)
1859 qemu_cond_wait(&qemu_system_cond
);
1861 /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
1862 cpu_single_env
= env
;
1864 kvm_main_loop_cpu(env
);
1868 int kvm_init_vcpu(CPUState
*env
)
1870 pthread_create(&env
->kvm_cpu_state
.thread
, NULL
, ap_main_loop
, env
);
1872 while (env
->created
== 0)
1873 qemu_cond_wait(&qemu_vcpu_cond
);
1878 int kvm_vcpu_inited(CPUState
*env
)
1880 return env
->created
;
1884 void kvm_hpet_disable_kpit(void)
1886 struct kvm_pit_state2 ps2
;
1888 kvm_get_pit2(kvm_context
, &ps2
);
1889 ps2
.flags
|= KVM_PIT_FLAGS_HPET_LEGACY
;
1890 kvm_set_pit2(kvm_context
, &ps2
);
1893 void kvm_hpet_enable_kpit(void)
1895 struct kvm_pit_state2 ps2
;
1897 kvm_get_pit2(kvm_context
, &ps2
);
1898 ps2
.flags
&= ~KVM_PIT_FLAGS_HPET_LEGACY
;
1899 kvm_set_pit2(kvm_context
, &ps2
);
1903 int kvm_init_ap(void)
1905 struct sigaction action
;
1907 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler
, NULL
);
1909 signal(SIG_IPI
, sig_ipi_handler
);
1911 memset(&action
, 0, sizeof(action
));
1912 action
.sa_flags
= SA_SIGINFO
;
1913 action
.sa_sigaction
= (void (*)(int, siginfo_t
*, void*))sigbus_handler
;
1914 sigaction(SIGBUS
, &action
, NULL
);
1915 prctl(PR_MCE_KILL
, 1, 1, 0, 0);
/* Kick the I/O thread out of select() by writing to its notify fd.
 * NOTE(review): mangled fragment — the `int ret;` declaration, the `do {`
 * opening the retry loop, the early return, and the tail of the error
 * fprintf (its strerror argument) are missing from this view.  Also note
 * "filed" in the error string is a typo for "failed"; it is runtime text
 * and is left unchanged by this doc-only edit. */
1919 void qemu_kvm_notify_work(void)
1921 /* Write 8 bytes to be compatible with eventfd. */
1922 static uint64_t val
= 1;
/* No notify fd yet (I/O thread not initialized): nothing to do. */
1925 if (io_thread_fd
== -1)
/* Retry the write while interrupted by signals. */
1929 ret
= write(io_thread_fd
, &val
, sizeof(val
));
1930 } while (ret
< 0 && errno
== EINTR
);
1932 /* EAGAIN is fine in case we have a pipe. */
1933 if (ret
< 0 && errno
!= EAGAIN
) {
1934 fprintf(stderr
, "qemu_kvm_notify_work: write() filed: %s\n",
1940 /* If we have signalfd, we mask out the signals we want to handle and then
1941 * use signalfd to listen for them. We rely on whatever the current signal
1942 * handler is to dispatch the signals when we receive them.
/* fd-handler for the I/O thread's signalfd: drain queued siginfo records
 * and re-dispatch each one to whatever handler is currently installed for
 * that signal number.
 * NOTE(review): mangled fragment — the `ssize_t len;` declaration, the
 * `while (1) {` / `do {` skeleton, and the returns/breaks are missing from
 * this view. */
1945 static void sigfd_handler(void *opaque
)
1947 int fd
= (unsigned long) opaque
;
1948 struct qemu_signalfd_siginfo info
;
1949 struct sigaction action
;
/* Read one siginfo record, retrying on EINTR. */
1954 len
= read(fd
, &info
, sizeof(info
));
1955 } while (len
== -1 && errno
== EINTR
);
/* Non-blocking fd drained: nothing more pending. */
1957 if (len
== -1 && errno
== EAGAIN
)
1960 if (len
!= sizeof(info
)) {
1961 printf("read from sigfd returned %zd: %m\n", len
);
/* Look up the currently-installed disposition and invoke it, preferring
 * the three-argument SA_SIGINFO form when present. */
1965 sigaction(info
.ssi_signo
, NULL
, &action
);
1966 if ((action
.sa_flags
& SA_SIGINFO
) && action
.sa_sigaction
)
1967 action
.sa_sigaction(info
.ssi_signo
,
1968 (siginfo_t
*)&info
, NULL
);
1969 else if (action
.sa_handler
)
1970 action
.sa_handler(info
.ssi_signo
);
1975 /* Used to break IO thread out of select */
/* NOTE(review): mangled fragment — the buffer declaration, the `do {`, and
 * the `ssize_t len;` line are missing from this view. */
1976 static void io_thread_wakeup(void *opaque
)
1978 int fd
= (unsigned long) opaque
;
1982 /* Drain the notify pipe. For eventfd, only 8 bytes will be read. */
/* Keep reading while interrupted or while a full buffer suggests more data. */
1984 len
= read(fd
, buffer
, sizeof(buffer
));
1985 } while ((len
== -1 && errno
== EINTR
) || len
== sizeof(buffer
));
/* Main I/O-thread loop: set up the vCPU-wakeup eventfd/pipe and the
 * signalfd, release the vCPU threads (qemu_system_cond broadcast), then
 * service main-loop events and shutdown / powerdown / reset / debug / exit
 * requests until the process ends.
 * NOTE(review): mangled fragment — local declarations (fds, mask, sigfd),
 * sigemptyset, error returns, the main_loop_wait loop skeleton, and the
 * shutdown/exit tails are missing from this view. */
1988 int kvm_main_loop(void)
1994 io_thread
= pthread_self();
1995 qemu_system_ready
= 1;
/* Eventfd (or pipe fallback) used by qemu_kvm_notify_work() to wake us. */
1997 if (qemu_eventfd(fds
) == -1) {
1998 fprintf(stderr
, "failed to create eventfd\n");
2002 fcntl(fds
[0], F_SETFL
, O_NONBLOCK
);
2003 fcntl(fds
[1], F_SETFL
, O_NONBLOCK
);
2005 qemu_set_fd_handler2(fds
[0], NULL
, io_thread_wakeup
, NULL
,
2006 (void *)(unsigned long) fds
[0]);
2008 io_thread_fd
= fds
[1];
/* Route SIGIO/SIGALRM/SIGBUS through signalfd instead of async handlers. */
2011 sigaddset(&mask
, SIGIO
);
2012 sigaddset(&mask
, SIGALRM
);
2013 sigaddset(&mask
, SIGBUS
);
2014 sigprocmask(SIG_BLOCK
, &mask
, NULL
);
2016 sigfd
= qemu_signalfd(&mask
);
2018 fprintf(stderr
, "failed to create signalfd\n");
2022 fcntl(sigfd
, F_SETFL
, O_NONBLOCK
);
2024 qemu_set_fd_handler2(sigfd
, NULL
, sigfd_handler
, NULL
,
2025 (void *)(unsigned long) sigfd
);
/* Machine is initialized: release vCPU threads blocked in ap_main_loop. */
2027 pthread_cond_broadcast(&qemu_system_cond
);
2029 io_thread_sigfd
= sigfd
;
2030 cpu_single_env
= NULL
;
/* Event dispatch: handle queued shutdown/powerdown/reset/debug requests. */
2034 if (qemu_shutdown_requested()) {
2035 monitor_protocol_event(QEVENT_SHUTDOWN
, NULL
);
2036 if (qemu_no_shutdown()) {
2040 } else if (qemu_powerdown_requested()) {
2041 monitor_protocol_event(QEVENT_POWERDOWN
, NULL
);
2042 qemu_irq_raise(qemu_system_powerdown
);
2043 } else if (qemu_reset_requested()) {
2044 qemu_kvm_system_reset();
2045 } else if (kvm_debug_cpu_requested
) {
2046 gdb_set_stop_cpu(kvm_debug_cpu_requested
);
2047 vm_stop(EXCP_DEBUG
);
2048 kvm_debug_cpu_requested
= NULL
;
2050 if (qemu_exit_requested()) {
/* Teardown: park all vCPUs and drop the global mutex before exiting. */
2055 pause_all_threads();
2056 pthread_mutex_unlock(&qemu_mutex
);
/* Set at context creation from kvm_destroy_memory_region_works(); gates the
 * must_use_aliases_* workarounds for kernels that cannot destroy memory
 * regions. */
2062 static int destroy_region_works
= 0;
/* Stub for non-x86 targets; body (presumably `return 0;`) is missing from
 * this mangled view. */
2066 #if !defined(TARGET_I386)
2067 int kvm_arch_init_irq_routing(void)
/* Create and configure the VM-level KVM context: optional irqchip/PIT
 * creation, arch-specific setup, PIT reinjection control, capability probes
 * (VCPU_EVENTS, DEBUGREGS), IRQ-routing init, and HPET/PIT interaction
 * checks.
 * NOTE(review): mangled fragment — the conditions guarding irqchip/PIT
 * disabling, error returns, several #endif lines, and the function tail are
 * missing from this view. */
2075 static int kvm_create_context(void)
2080 kvm_disable_irqchip_creation(kvm_context
);
2083 kvm_disable_pit_creation(kvm_context
);
2085 if (kvm_create(kvm_context
, 0, NULL
) < 0) {
2086 kvm_finalize(kvm_state
);
2089 r
= kvm_arch_qemu_create_context();
2091 kvm_finalize(kvm_state
);
/* Optionally turn off in-kernel PIT interrupt reinjection. */
2094 if (kvm_pit
&& !kvm_pit_reinject
) {
2095 if (kvm_reinject_control(kvm_context
, 0)) {
2096 fprintf(stderr
, "failure to disable in-kernel PIT reinjection\n");
/* Remember whether this kernel can destroy memory regions (see the
 * must_use_aliases_* helpers). */
2101 destroy_region_works
= kvm_destroy_memory_region_works(kvm_context
);
2104 r
= kvm_arch_init_irq_routing();
/* Probe optional state-save capabilities, defaulting to unsupported. */
2109 kvm_state
->vcpu_events
= 0;
2110 #ifdef KVM_CAP_VCPU_EVENTS
2111 kvm_state
->vcpu_events
= kvm_check_extension(kvm_state
, KVM_CAP_VCPU_EVENTS
);
2114 kvm_state
->debugregs
= 0;
2115 #ifdef KVM_CAP_DEBUGREGS
2116 kvm_state
->debugregs
= kvm_check_extension(kvm_state
, KVM_CAP_DEBUGREGS
);
2121 if (!qemu_kvm_has_gsi_routing()) {
2124 /* if kernel can't do irq routing, interrupt source
2125 * override 0->2 can not be set up as required by hpet,
2129 } else if (!qemu_kvm_has_pit_state2()) {
/* Decide whether a mapping at @addr (source side) must be implemented with
 * a KVM memory alias — only relevant on kernels where destroying memory
 * regions does not work.  The listed addresses are the VGA windows.
 * NOTE(review): mangled fragment — the return statements for both branches
 * and the final return are missing from this view. */
2141 static int must_use_aliases_source(target_phys_addr_t addr
)
2143 if (destroy_region_works
)
2145 if (addr
== 0xa0000 || addr
== 0xa8000)
/* Decide whether @addr (target side, the PCI hole 0xe0000000..4G) must use
 * a KVM memory alias on kernels that cannot destroy memory regions.
 * NOTE(review): mangled fragment — the return statements are missing from
 * this view. */
2150 static int must_use_aliases_target(target_phys_addr_t addr
)
2152 if (destroy_region_works
)
2154 if (addr
>= 0xe0000000 && addr
< 0x100000000ull
)
/* Bookkeeping of guest-physical -> ram_addr slot registrations, used by
 * kvm_set_phys_mem() and the alias helpers.
 * NOTE(review): mangled fragment — the remaining struct members (presumably
 * ram and len) and the mappings[] array definition are missing from this
 * view. */
2159 static struct mapping
{
2160 target_phys_addr_t phys
;
/* Number of live entries in the mappings table. */
2164 static int nr_mappings
;
/* Linear search of the mappings table for the entry whose ram range
 * contains @ram_addr.
 * NOTE(review): mangled fragment — the `struct mapping *p;` declaration,
 * `return p;`, and the not-found return are missing from this view. */
2166 static struct mapping
*find_ram_mapping(ram_addr_t ram_addr
)
2170 for (p
= mappings
; p
< mappings
+ nr_mappings
; ++p
) {
2171 if (p
->ram
<= ram_addr
&& ram_addr
< p
->ram
+ p
->len
) {
/* Linear search of the mappings table for the entry whose guest-physical
 * range contains @start_addr.
 * NOTE(review): mangled fragment — the `struct mapping *p;` declaration,
 * `return p;`, and the not-found return are missing from this view. */
2178 static struct mapping
*find_mapping(target_phys_addr_t start_addr
)
2182 for (p
= mappings
; p
< mappings
+ nr_mappings
; ++p
) {
2183 if (p
->phys
<= start_addr
&& start_addr
< p
->phys
+ p
->len
) {
/* Remove the mapping covering @start_addr by overwriting it with the last
 * table entry (order is not preserved).
 * NOTE(review): mangled fragment — the null-check guarding the overwrite is
 * missing from this view; presumably `if (p)` — confirm against the full
 * source. */
2190 static void drop_mapping(target_phys_addr_t start_addr
)
2192 struct mapping
*p
= find_mapping(start_addr
);
2195 *p
= mappings
[--nr_mappings
];
/* Register, alias, or unregister a guest-physical memory range with KVM,
 * mirroring QEMU's phys-memory layout (called from the memory API).
 * Non-RAM flags trigger unregistration (with alias workarounds); RAM is
 * registered via kvm_register_phys_mem and recorded in the mappings table.
 * NOTE(review): mangled fragment — local declarations (r, p), several
 * returns, closing braces, and parts of the unregister path are missing
 * from this view. */
2199 void kvm_set_phys_mem(target_phys_addr_t start_addr
, ram_addr_t size
,
2200 ram_addr_t phys_offset
)
2203 unsigned long area_flags
;
/* Track the highest guest-physical address ever mapped (used to size the
 * dirty bitmap). */
2208 if (start_addr
+ size
> phys_ram_size
) {
2209 phys_ram_size
= start_addr
+ size
;
/* ROM is handled like RAM for slot purposes; flags live in the low bits. */
2212 phys_offset
&= ~IO_MEM_ROM
;
2213 area_flags
= phys_offset
& ~TARGET_PAGE_MASK
;
/* Non-RAM: tear down whatever was previously registered there. */
2215 if (area_flags
!= IO_MEM_RAM
) {
2217 if (must_use_aliases_source(start_addr
)) {
2218 kvm_destroy_memory_alias(kvm_context
, start_addr
);
2221 if (must_use_aliases_target(start_addr
))
2225 p
= find_mapping(start_addr
);
2227 kvm_unregister_memory_area(kvm_context
, p
->phys
, p
->len
);
2228 drop_mapping(p
->phys
);
/* Advance page by page through the range being torn down. */
2230 start_addr
+= TARGET_PAGE_SIZE
;
2231 if (size
> TARGET_PAGE_SIZE
) {
2232 size
-= TARGET_PAGE_SIZE
;
/* Skip work when the range is already inside a registered region. */
2240 r
= kvm_is_containing_region(kvm_context
, start_addr
, size
);
2244 if (area_flags
>= TLB_MMIO
)
/* VGA-window workaround: map via an alias into the existing RAM slot. */
2248 if (must_use_aliases_source(start_addr
)) {
2249 p
= find_ram_mapping(phys_offset
);
2251 kvm_create_memory_alias(kvm_context
, start_addr
, size
,
2252 p
->phys
+ (phys_offset
- p
->ram
));
/* Normal RAM: register the host pointer for this range with KVM. */
2258 r
= kvm_register_phys_mem(kvm_context
, start_addr
,
2259 qemu_get_ram_ptr(phys_offset
), size
, 0);
2261 printf("kvm_cpu_register_physical_memory: failed\n");
/* Record the new slot in the mappings table (replacing any stale entry). */
2265 drop_mapping(start_addr
);
2266 p
= &mappings
[nr_mappings
++];
2267 p
->phys
= start_addr
;
2268 p
->ram
= phys_offset
;
2276 * dirty pages logging
2278 /* FIXME: use unsigned long pointer instead of unsigned char */
/* Scratch bitmap used while pulling KVM's dirty log; allocated on enable,
 * freed on disable. */
2279 unsigned char *kvm_dirty_bitmap
= NULL
;
/* Enable or disable KVM dirty-page logging for all slots, managing the
 * lifetime of kvm_dirty_bitmap.
 * NOTE(review): mangled fragment — the `int r = 0;`, the if (enable)/else
 * structure, and the return are missing from this view. */
2280 int kvm_physical_memory_set_dirty_tracking(int enable
)
2288 if (!kvm_dirty_bitmap
) {
2289 unsigned bitmap_size
= BITMAP_SIZE(phys_ram_size
);
2290 kvm_dirty_bitmap
= qemu_malloc(bitmap_size
);
2291 r
= kvm_dirty_pages_log_enable_all(kvm_context
);
/* Disable path: reset logging and release the scratch bitmap. */
2294 if (kvm_dirty_bitmap
) {
2295 r
= kvm_dirty_pages_log_reset(kvm_context
);
2296 qemu_free(kvm_dirty_bitmap
);
2297 kvm_dirty_bitmap
= NULL
;
2303 /* get kvm's dirty pages bitmap and update qemu's */
/* Walk KVM's dirty bitmap one host-long at a time; for each set bit,
 * translate page number -> guest-physical -> ram_addr and mark the page
 * dirty in QEMU's tracking.
 * NOTE(review): mangled fragment — the loop counters (i, j), the inner
 * bit-extraction loop over c, closing braces, and the return are missing
 * from this view. */
2304 static int kvm_get_dirty_pages_log_range(unsigned long start_addr
,
2305 unsigned long *bitmap
,
2306 unsigned long offset
,
2307 unsigned long mem_size
)
2310 unsigned long page_number
, addr
, addr1
, c
;
2311 ram_addr_t ram_addr
;
/* Number of host-long words covering mem_size worth of pages, rounded up. */
2312 unsigned int len
= ((mem_size
/ TARGET_PAGE_SIZE
) + HOST_LONG_BITS
- 1) /
2316 * bitmap-traveling is faster than memory-traveling (for addr...)
2317 * especially when most of the memory is not dirty.
2319 for (i
= 0; i
< len
; i
++) {
2320 if (bitmap
[i
] != 0) {
/* Fix endianness once per word, then peel bits off it. */
2321 c
= leul_to_cpu(bitmap
[i
]);
2325 page_number
= i
* HOST_LONG_BITS
+ j
;
2326 addr1
= page_number
* TARGET_PAGE_SIZE
;
2327 addr
= offset
+ addr1
;
2328 ram_addr
= cpu_get_physical_page_desc(addr
);
2329 cpu_physical_memory_set_dirty(ram_addr
);
/*
 * kvm_get_dirty_bitmap_cb: per-slot callback handed to
 * kvm_get_dirty_pages_range(); forwards the slot's dirty @bitmap into
 * kvm_get_dirty_pages_log_range(), using @start both as the bitmap's base
 * address and as the guest-physical offset.  @opaque is unused.
 */
static int kvm_get_dirty_bitmap_cb(unsigned long start, unsigned long len,
                                   void *bitmap, void *opaque)
{
    return kvm_get_dirty_pages_log_range(start, bitmap, start, len);
}
/* Enable or disable dirty-page logging for the slot covering [start,
 * start+size).
 * NOTE(review): mangled fragment — the third parameter (the log on/off
 * flag), the if/else between enable and disable, and the alias-target early
 * return are missing from this view. */
2342 void kvm_qemu_log_memory(target_phys_addr_t start
, target_phys_addr_t size
,
2346 kvm_dirty_pages_log_enable_slot(kvm_context
, start
, size
);
/* Aliased targets have no slot of their own to disable logging on. */
2349 if (must_use_aliases_target(start
))
2352 kvm_dirty_pages_log_disable_slot(kvm_context
, start
, size
);
2356 #ifdef KVM_CAP_IRQCHIP
2358 int kvm_set_irq(int irq
, int level
, int *status
)
2360 return kvm_set_irq_level(kvm_context
, irq
, level
, status
);
2365 void kvm_mutex_unlock(void)
2367 assert(!cpu_single_env
);
2368 pthread_mutex_unlock(&qemu_mutex
);
2371 void kvm_mutex_lock(void)
2373 pthread_mutex_lock(&qemu_mutex
);
2374 cpu_single_env
= NULL
;
/* Public iothread-lock wrappers.
 * NOTE(review): mangled fragment — both function bodies are missing from
 * this view (presumably they forward to kvm_mutex_unlock()/kvm_mutex_lock()
 * when KVM is enabled — confirm against the full source). */
2377 void qemu_mutex_unlock_iothread(void)
2383 void qemu_mutex_lock_iothread(void)
2389 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2390 void kvm_add_ioperm_data(struct ioperm_data
*data
)
2392 QLIST_INSERT_HEAD(&ioperm_head
, data
, entries
);
/* Remove the ioperm record matching @start_port/@num from the global list.
 * NOTE(review): mangled fragment — the while-loop skeleton, the step to
 * `next`, the free/disable of the removed entry, and the break are missing
 * from this view. */
2395 void kvm_remove_ioperm_data(unsigned long start_port
, unsigned long num
)
2397 struct ioperm_data
*data
;
2399 data
= QLIST_FIRST(&ioperm_head
);
/* Capture the successor before potentially unlinking the current node. */
2401 struct ioperm_data
*next
= QLIST_NEXT(data
, entries
);
2403 if (data
->start_port
== start_port
&& data
->num
== num
) {
2404 QLIST_REMOVE(data
, entries
);
2412 void kvm_ioperm(CPUState
*env
, void *data
)
2414 if (kvm_enabled() && qemu_system_ready
)
2415 on_vcpu(env
, kvm_arch_do_ioperm
, data
);
/* Pull KVM's dirty log for [start_addr, end_addr) into QEMU's dirty
 * tracking via kvm_get_dirty_bitmap_cb.
 * NOTE(review): mangled fragment — the early return for aliased sources and
 * the final return are missing from this view. */
2420 int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr
,
2421 target_phys_addr_t end_addr
)
/* Aliased ranges have no slot of their own to read a dirty log from. */
2426 if (must_use_aliases_source(start_addr
))
2430 kvm_get_dirty_pages_range(kvm_context
, start_addr
,
2431 end_addr
- start_addr
, NULL
,
2432 kvm_get_dirty_bitmap_cb
);
/* Begin dirty logging for [phys_addr, phys_addr+len) (memory-API hook).
 * NOTE(review): mangled fragment — the alias early-return and the final
 * return are missing from this view. */
2437 int kvm_log_start(target_phys_addr_t phys_addr
, ram_addr_t len
)
2440 if (must_use_aliases_source(phys_addr
))
/* Third argument 1 = enable logging. */
2445 kvm_qemu_log_memory(phys_addr
, len
, 1);
/* Stop dirty logging for [phys_addr, phys_addr+len) (memory-API hook).
 * NOTE(review): mangled fragment — the alias early-return and the final
 * return are missing from this view. */
2450 int kvm_log_stop(target_phys_addr_t phys_addr
, ram_addr_t len
)
2453 if (must_use_aliases_source(phys_addr
))
/* Third argument 0 = disable logging. */
2458 kvm_qemu_log_memory(phys_addr
, len
, 0);
2463 int kvm_set_boot_cpu_id(uint32_t id
)
2465 return kvm_set_boot_vcpu_id(kvm_context
, id
);
/* Argument bundle for kvm_do_inject_x86_mce() when run via on_vcpu().
 * NOTE(review): mangled fragment — the remaining members (the target env
 * and the abort_on_error flag, both referenced by kvm_do_inject_x86_mce)
 * are missing from this view. */
2470 struct kvm_x86_mce_data
{
/* The machine-check event to inject. */
2472 struct kvm_x86_mce
*mce
;
/* on_vcpu() trampoline: inject the MCE described by a kvm_x86_mce_data into
 * its target vCPU, skipping SRAO (non-action-required) events while another
 * MCE is already being processed.
 * NOTE(review): mangled fragment — the `int r;` declaration, the skip
 * return, closing braces, and the abort call are missing from this view. */
2476 static void kvm_do_inject_x86_mce(void *_data
)
2478 struct kvm_x86_mce_data
*data
= _data
;
2481 /* If there is an MCE exception being processed, ignore this SRAO MCE */
2482 r
= kvm_mce_in_exception(data
->env
);
2484 fprintf(stderr
, "Failed to get MCE status\n");
/* An MCE is in progress and this event is not action-required: drop it. */
2485 else if (r
&& !(data
->mce
->status
& MCI_STATUS_AR
))
2487 r
= kvm_set_mce(data
->env
, data
->mce
);
2489 perror("kvm_set_mce FAILED");
/* Caller may request a hard abort when injection fails. */
2490 if (data
->abort_on_error
)
2496 void kvm_inject_x86_mce(CPUState
*cenv
, int bank
, uint64_t status
,
2497 uint64_t mcg_status
, uint64_t addr
, uint64_t misc
,
2501 struct kvm_x86_mce mce
= {
2504 .mcg_status
= mcg_status
,
2508 struct kvm_x86_mce_data data
= {
2511 .abort_on_error
= abort_on_error
,
2514 if (!cenv
->mcg_cap
) {
2515 fprintf(stderr
, "MCE support is not enabled!\n");
2518 on_vcpu(cenv
, kvm_do_inject_x86_mce
, &data
);