4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
9 #include "config-host.h"
18 #include "qemu-common.h"
25 #include <sys/utsname.h>
26 #include <sys/syscall.h>
28 extern void perror(const char *s
);
30 kvm_context_t kvm_context
;
34 static int qemu_kvm_reset_requested
;
36 pthread_mutex_t qemu_mutex
= PTHREAD_MUTEX_INITIALIZER
;
37 pthread_cond_t qemu_aio_cond
= PTHREAD_COND_INITIALIZER
;
38 pthread_cond_t qemu_vcpu_cond
= PTHREAD_COND_INITIALIZER
;
39 pthread_cond_t qemu_system_cond
= PTHREAD_COND_INITIALIZER
;
40 pthread_cond_t qemu_pause_cond
= PTHREAD_COND_INITIALIZER
;
41 __thread
struct vcpu_info
*vcpu
;
43 static int qemu_system_ready
;
45 #define SIG_IPI (SIGRTMIN+4)
60 static int io_thread_fd
= -1;
61 static int io_thread_sigfd
= -1;
63 static inline unsigned long kvm_get_thread_id(void)
65 return syscall(SYS_gettid
);
68 CPUState
*qemu_kvm_cpu_env(int index
)
70 return vcpu_info
[index
].env
;
73 static void sig_ipi_handler(int n
)
77 void kvm_update_interrupt_request(CPUState
*env
)
84 if (vcpu
&& env
!= vcpu
->env
&& !vcpu_info
[env
->cpu_index
].signalled
)
88 vcpu_info
[env
->cpu_index
].signalled
= 1;
89 if (vcpu_info
[env
->cpu_index
].thread
)
90 pthread_kill(vcpu_info
[env
->cpu_index
].thread
, SIG_IPI
);
95 void kvm_update_after_sipi(CPUState
*env
)
97 vcpu_info
[env
->cpu_index
].sipi_needed
= 1;
98 kvm_update_interrupt_request(env
);
101 void kvm_apic_init(CPUState
*env
)
103 if (env
->cpu_index
!= 0)
104 vcpu_info
[env
->cpu_index
].init
= 1;
105 kvm_update_interrupt_request(env
);
110 static int try_push_interrupts(void *opaque
)
112 return kvm_arch_try_push_interrupts(opaque
);
115 static void post_kvm_run(void *opaque
, int vcpu
)
118 pthread_mutex_lock(&qemu_mutex
);
119 kvm_arch_post_kvm_run(opaque
, vcpu
);
122 static int pre_kvm_run(void *opaque
, int vcpu
)
124 CPUState
*env
= qemu_kvm_cpu_env(vcpu
);
126 kvm_arch_pre_kvm_run(opaque
, vcpu
);
128 if (env
->interrupt_request
& CPU_INTERRUPT_EXIT
)
130 pthread_mutex_unlock(&qemu_mutex
);
134 void kvm_load_registers(CPUState
*env
)
137 kvm_arch_load_regs(env
);
140 void kvm_save_registers(CPUState
*env
)
143 kvm_arch_save_regs(env
);
146 int kvm_cpu_exec(CPUState
*env
)
150 r
= kvm_run(kvm_context
, env
->cpu_index
);
152 printf("kvm_run returned %d\n", r
);
159 extern int vm_running
;
161 static int has_work(CPUState
*env
)
163 if (!vm_running
|| (env
&& vcpu_info
[env
->cpu_index
].stopped
))
165 if (!(env
->hflags
& HF_HALTED_MASK
))
167 return kvm_arch_has_work(env
);
170 static int kvm_eat_signal(CPUState
*env
, int timeout
)
177 ts
.tv_sec
= timeout
/ 1000;
178 ts
.tv_nsec
= (timeout
% 1000) * 1000000;
179 sigemptyset(&waitset
);
180 sigaddset(&waitset
, SIG_IPI
);
182 r
= sigtimedwait(&waitset
, &siginfo
, &ts
);
183 if (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
) && !timeout
)
187 pthread_mutex_lock(&qemu_mutex
);
189 cpu_single_env
= vcpu
->env
;
190 if (r
== -1 && !(errno
== EAGAIN
|| errno
== EINTR
)) {
191 printf("sigtimedwait: %s\n", strerror(e
));
197 if (env
&& vcpu_info
[env
->cpu_index
].stop
) {
198 vcpu_info
[env
->cpu_index
].stop
= 0;
199 vcpu_info
[env
->cpu_index
].stopped
= 1;
200 pthread_cond_signal(&qemu_pause_cond
);
202 pthread_mutex_unlock(&qemu_mutex
);
208 static void kvm_eat_signals(CPUState
*env
, int timeout
)
212 while (kvm_eat_signal(env
, 0))
215 r
= kvm_eat_signal(env
, timeout
);
217 while (kvm_eat_signal(env
, 0))
222 static void kvm_main_loop_wait(CPUState
*env
, int timeout
)
224 pthread_mutex_unlock(&qemu_mutex
);
225 kvm_eat_signals(env
, timeout
);
226 pthread_mutex_lock(&qemu_mutex
);
227 cpu_single_env
= env
;
228 vcpu_info
[env
->cpu_index
].signalled
= 0;
231 static int all_threads_paused(void)
235 for (i
= 0; i
< smp_cpus
; ++i
)
236 if (vcpu_info
[i
].stop
)
241 static void pause_all_threads(void)
245 for (i
= 0; i
< smp_cpus
; ++i
) {
246 vcpu_info
[i
].stop
= 1;
247 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
249 while (!all_threads_paused()) {
250 CPUState
*env
= cpu_single_env
;
251 pthread_cond_wait(&qemu_pause_cond
, &qemu_mutex
);
252 cpu_single_env
= env
;
256 static void resume_all_threads(void)
260 for (i
= 0; i
< smp_cpus
; ++i
) {
261 vcpu_info
[i
].stop
= 0;
262 vcpu_info
[i
].stopped
= 0;
263 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
267 static void kvm_vm_state_change_handler(void *context
, int running
)
270 resume_all_threads();
275 static void update_regs_for_sipi(CPUState
*env
)
277 kvm_arch_update_regs_for_sipi(env
);
278 vcpu_info
[env
->cpu_index
].sipi_needed
= 0;
279 vcpu_info
[env
->cpu_index
].init
= 0;
282 static void update_regs_for_init(CPUState
*env
)
285 kvm_arch_load_regs(env
);
288 static void setup_kernel_sigmask(CPUState
*env
)
293 sigaddset(&set
, SIGUSR2
);
294 sigaddset(&set
, SIGIO
);
295 sigaddset(&set
, SIGALRM
);
296 sigprocmask(SIG_BLOCK
, &set
, NULL
);
298 sigprocmask(SIG_BLOCK
, NULL
, &set
);
299 sigdelset(&set
, SIG_IPI
);
301 kvm_set_signal_mask(kvm_context
, env
->cpu_index
, &set
);
304 void qemu_kvm_system_reset_request(void)
308 for (i
= 0; i
< smp_cpus
; ++i
) {
309 vcpu_info
[i
].reload_regs
= 1;
310 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
315 static int kvm_main_loop_cpu(CPUState
*env
)
317 struct vcpu_info
*info
= &vcpu_info
[env
->cpu_index
];
319 setup_kernel_sigmask(env
);
321 pthread_mutex_lock(&qemu_mutex
);
322 if (kvm_irqchip_in_kernel(kvm_context
))
323 env
->hflags
&= ~HF_HALTED_MASK
;
325 kvm_qemu_init_env(env
);
326 env
->ready_for_interrupt_injection
= 1;
328 kvm_tpr_vcpu_start(env
);
331 cpu_single_env
= env
;
333 while (!has_work(env
))
334 kvm_main_loop_wait(env
, 1000);
335 if (env
->interrupt_request
& CPU_INTERRUPT_HARD
)
336 env
->hflags
&= ~HF_HALTED_MASK
;
337 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->sipi_needed
)
338 update_regs_for_sipi(env
);
339 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->init
)
340 update_regs_for_init(env
);
341 if (!(env
->hflags
& HF_HALTED_MASK
) && !info
->init
)
343 env
->interrupt_request
&= ~CPU_INTERRUPT_EXIT
;
344 kvm_main_loop_wait(env
, 0);
345 if (info
->reload_regs
) {
346 info
->reload_regs
= 0;
347 if (env
->cpu_index
== 0) /* ap needs to be placed in INIT */
348 kvm_arch_load_regs(env
);
351 pthread_mutex_unlock(&qemu_mutex
);
355 static void *ap_main_loop(void *_env
)
357 CPUState
*env
= _env
;
360 vcpu
= &vcpu_info
[env
->cpu_index
];
362 vcpu
->env
->thread_id
= kvm_get_thread_id();
363 sigfillset(&signals
);
364 sigprocmask(SIG_BLOCK
, &signals
, NULL
);
365 kvm_create_vcpu(kvm_context
, env
->cpu_index
);
366 kvm_qemu_init_env(env
);
368 /* signal VCPU creation */
369 pthread_mutex_lock(&qemu_mutex
);
371 pthread_cond_signal(&qemu_vcpu_cond
);
373 /* and wait for machine initialization */
374 while (!qemu_system_ready
)
375 pthread_cond_wait(&qemu_system_cond
, &qemu_mutex
);
376 pthread_mutex_unlock(&qemu_mutex
);
378 kvm_main_loop_cpu(env
);
382 void kvm_init_new_ap(int cpu
, CPUState
*env
)
384 pthread_create(&vcpu_info
[cpu
].thread
, NULL
, ap_main_loop
, env
);
386 while (vcpu_info
[cpu
].created
== 0)
387 pthread_cond_wait(&qemu_vcpu_cond
, &qemu_mutex
);
390 int kvm_init_ap(void)
395 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler
, NULL
);
397 signal(SIG_IPI
, sig_ipi_handler
);
401 void qemu_kvm_notify_work(void)
407 if (io_thread_fd
== -1)
410 memcpy(buffer
, &value
, sizeof(value
));
415 len
= write(io_thread_fd
, buffer
+ offset
, 8 - offset
);
416 if (len
== -1 && errno
== EINTR
)
426 fprintf(stderr
, "failed to notify io thread\n");
429 /* If we have signalfd, we mask out the signals we want to handle and then
430 * use signalfd to listen for them. We rely on whatever the current signal
431 * handler is to dispatch the signals when we receive them.
434 static void sigfd_handler(void *opaque
)
436 int fd
= (unsigned long)opaque
;
437 struct signalfd_siginfo info
;
438 struct sigaction action
;
443 len
= read(fd
, &info
, sizeof(info
));
444 } while (len
== -1 && errno
== EINTR
);
446 if (len
== -1 && errno
== EAGAIN
)
449 if (len
!= sizeof(info
)) {
450 printf("read from sigfd returned %ld: %m\n", len
);
454 sigaction(info
.ssi_signo
, NULL
, &action
);
455 if (action
.sa_handler
)
456 action
.sa_handler(info
.ssi_signo
);
458 if (info
.ssi_signo
== SIGUSR2
) {
459 pthread_cond_signal(&qemu_aio_cond
);
464 /* Used to break IO thread out of select */
465 static void io_thread_wakeup(void *opaque
)
467 int fd
= (unsigned long)opaque
;
474 len
= read(fd
, buffer
+ offset
, 8 - offset
);
475 if (len
== -1 && errno
== EINTR
)
485 int kvm_main_loop(void)
491 io_thread
= pthread_self();
492 qemu_system_ready
= 1;
494 if (kvm_eventfd(fds
) == -1) {
495 fprintf(stderr
, "failed to create eventfd\n");
499 qemu_set_fd_handler2(fds
[0], NULL
, io_thread_wakeup
, NULL
,
500 (void *)(unsigned long)fds
[0]);
502 io_thread_fd
= fds
[1];
505 sigaddset(&mask
, SIGIO
);
506 sigaddset(&mask
, SIGALRM
);
507 sigaddset(&mask
, SIGUSR2
);
508 sigprocmask(SIG_BLOCK
, &mask
, NULL
);
510 sigfd
= kvm_signalfd(&mask
);
512 fprintf(stderr
, "failed to create signalfd\n");
516 fcntl(sigfd
, F_SETFL
, O_NONBLOCK
);
518 qemu_set_fd_handler2(sigfd
, NULL
, sigfd_handler
, NULL
,
519 (void *)(unsigned long)sigfd
);
521 pthread_cond_broadcast(&qemu_system_cond
);
523 io_thread_sigfd
= sigfd
;
524 cpu_single_env
= NULL
;
527 main_loop_wait(1000);
528 if (qemu_shutdown_requested())
530 else if (qemu_powerdown_requested())
531 qemu_system_powerdown();
532 else if (qemu_reset_requested()) {
533 pthread_kill(vcpu_info
[0].thread
, SIG_IPI
);
534 qemu_kvm_reset_requested
= 1;
539 pthread_mutex_unlock(&qemu_mutex
);
544 static int kvm_debug(void *opaque
, int vcpu
)
546 CPUState
*env
= cpu_single_env
;
548 env
->exception_index
= EXCP_DEBUG
;
552 static int kvm_inb(void *opaque
, uint16_t addr
, uint8_t *data
)
554 *data
= cpu_inb(0, addr
);
558 static int kvm_inw(void *opaque
, uint16_t addr
, uint16_t *data
)
560 *data
= cpu_inw(0, addr
);
564 static int kvm_inl(void *opaque
, uint16_t addr
, uint32_t *data
)
566 *data
= cpu_inl(0, addr
);
570 #define PM_IO_BASE 0xb000
572 static int kvm_outb(void *opaque
, uint16_t addr
, uint8_t data
)
577 cpu_outb(0, 0xb3, 0);
584 x
= cpu_inw(0, PM_IO_BASE
+ 4);
586 cpu_outw(0, PM_IO_BASE
+ 4, x
);
593 x
= cpu_inw(0, PM_IO_BASE
+ 4);
595 cpu_outw(0, PM_IO_BASE
+ 4, x
);
603 cpu_outb(0, addr
, data
);
607 static int kvm_outw(void *opaque
, uint16_t addr
, uint16_t data
)
609 cpu_outw(0, addr
, data
);
613 static int kvm_outl(void *opaque
, uint16_t addr
, uint32_t data
)
615 cpu_outl(0, addr
, data
);
619 static int kvm_mmio_read(void *opaque
, uint64_t addr
, uint8_t *data
, int len
)
621 cpu_physical_memory_rw(addr
, data
, len
, 0);
625 static int kvm_mmio_write(void *opaque
, uint64_t addr
, uint8_t *data
, int len
)
627 cpu_physical_memory_rw(addr
, data
, len
, 1);
631 static int kvm_io_window(void *opaque
)
637 static int kvm_halt(void *opaque
, int vcpu
)
639 return kvm_arch_halt(opaque
, vcpu
);
642 static int kvm_shutdown(void *opaque
, int vcpu
)
644 qemu_system_reset_request();
648 static struct kvm_callbacks qemu_kvm_ops
= {
656 .mmio_read
= kvm_mmio_read
,
657 .mmio_write
= kvm_mmio_write
,
659 .shutdown
= kvm_shutdown
,
660 .io_window
= kvm_io_window
,
661 .try_push_interrupts
= try_push_interrupts
,
662 .post_kvm_run
= post_kvm_run
,
663 .pre_kvm_run
= pre_kvm_run
,
665 .tpr_access
= handle_tpr_access
,
668 .powerpc_dcr_read
= handle_powerpc_dcr_read
,
669 .powerpc_dcr_write
= handle_powerpc_dcr_write
,
675 /* Try to initialize kvm */
676 kvm_context
= kvm_init(&qemu_kvm_ops
, cpu_single_env
);
680 pthread_mutex_lock(&qemu_mutex
);
685 int kvm_qemu_create_context(void)
689 kvm_disable_irqchip_creation(kvm_context
);
692 kvm_disable_pit_creation(kvm_context
);
694 if (kvm_create(kvm_context
, phys_ram_size
, (void**)&phys_ram_base
) < 0) {
698 r
= kvm_arch_qemu_create_context();
704 void kvm_qemu_destroy(void)
706 kvm_finalize(kvm_context
);
709 void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr
,
711 unsigned long phys_offset
)
713 #ifdef KVM_CAP_USER_MEMORY
716 r
= kvm_check_extension(kvm_context
, KVM_CAP_USER_MEMORY
);
718 if (!(phys_offset
& ~TARGET_PAGE_MASK
)) {
719 r
= kvm_is_allocated_mem(kvm_context
, start_addr
, size
);
722 r
= kvm_is_intersecting_mem(kvm_context
, start_addr
);
724 kvm_create_mem_hole(kvm_context
, start_addr
, size
);
725 r
= kvm_register_userspace_phys_mem(kvm_context
, start_addr
,
726 phys_ram_base
+ phys_offset
,
729 if (phys_offset
& IO_MEM_ROM
) {
730 phys_offset
&= ~IO_MEM_ROM
;
731 r
= kvm_is_intersecting_mem(kvm_context
, start_addr
);
733 kvm_create_mem_hole(kvm_context
, start_addr
, size
);
734 r
= kvm_register_userspace_phys_mem(kvm_context
, start_addr
,
735 phys_ram_base
+ phys_offset
,
739 printf("kvm_cpu_register_physical_memory: failed\n");
745 if (phys_offset
& IO_MEM_ROM
) {
746 phys_offset
&= ~IO_MEM_ROM
;
747 memcpy(phys_ram_base
+ start_addr
, phys_ram_base
+ phys_offset
, size
);
751 int kvm_qemu_check_extension(int ext
)
753 return kvm_check_extension(kvm_context
, ext
);
756 int kvm_qemu_init_env(CPUState
*cenv
)
758 return kvm_arch_qemu_init_env(cenv
);
761 int kvm_update_debugger(CPUState
*env
)
763 struct kvm_debug_guest dbg
;
767 if (env
->nb_breakpoints
|| env
->singlestep_enabled
) {
769 for (i
= 0; i
< 4 && i
< env
->nb_breakpoints
; ++i
) {
770 dbg
.breakpoints
[i
].enabled
= 1;
771 dbg
.breakpoints
[i
].address
= env
->breakpoints
[i
];
773 dbg
.singlestep
= env
->singlestep_enabled
;
775 return kvm_guest_debug(kvm_context
, env
->cpu_index
, &dbg
);
780 * dirty pages logging
782 /* FIXME: use unsigned long pointer instead of unsigned char */
783 unsigned char *kvm_dirty_bitmap
= NULL
;
784 int kvm_physical_memory_set_dirty_tracking(int enable
)
792 if (!kvm_dirty_bitmap
) {
793 unsigned bitmap_size
= BITMAP_SIZE(phys_ram_size
);
794 kvm_dirty_bitmap
= qemu_malloc(bitmap_size
);
795 if (kvm_dirty_bitmap
== NULL
) {
796 perror("Failed to allocate dirty pages bitmap");
800 r
= kvm_dirty_pages_log_enable_all(kvm_context
);
805 if (kvm_dirty_bitmap
) {
806 r
= kvm_dirty_pages_log_reset(kvm_context
);
807 qemu_free(kvm_dirty_bitmap
);
808 kvm_dirty_bitmap
= NULL
;
814 /* get kvm's dirty pages bitmap and update qemu's */
815 int kvm_get_dirty_pages_log_range(unsigned long start_addr
,
816 unsigned char *bitmap
,
818 unsigned long mem_size
)
820 unsigned int i
, j
, n
=0;
822 unsigned page_number
, addr
, addr1
;
823 unsigned int len
= ((mem_size
/TARGET_PAGE_SIZE
) + 7) / 8;
826 * bitmap-traveling is faster than memory-traveling (for addr...)
827 * especially when most of the memory is not dirty.
829 for (i
=0; i
<len
; i
++) {
834 page_number
= i
* 8 + j
;
835 addr1
= page_number
* TARGET_PAGE_SIZE
;
836 addr
= offset
+ addr1
;
837 cpu_physical_memory_set_dirty(addr
);
843 int kvm_get_dirty_bitmap_cb(unsigned long start
, unsigned long len
,
844 void *bitmap
, void *opaque
)
846 return kvm_get_dirty_pages_log_range(start
, bitmap
, start
, len
);
850 * get kvm's dirty pages bitmap and update qemu's
851 * we only care about physical ram, which resides in slots 0 and 3
853 int kvm_update_dirty_pages_log(void)
858 r
= kvm_get_dirty_pages_range(kvm_context
, 0, phys_ram_size
,
859 kvm_dirty_bitmap
, NULL
,
860 kvm_get_dirty_bitmap_cb
);
864 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap
)
866 unsigned int bsize
= BITMAP_SIZE(phys_ram_size
);
867 unsigned int brsize
= BITMAP_SIZE(ram_size
);
868 unsigned int extra_pages
= (phys_ram_size
- ram_size
) / TARGET_PAGE_SIZE
;
869 unsigned int extra_bytes
= (extra_pages
+7)/8;
870 unsigned int hole_start
= BITMAP_SIZE(0xa0000);
871 unsigned int hole_end
= BITMAP_SIZE(0xc0000);
873 memset(bitmap
, 0xFF, brsize
+ extra_bytes
);
874 memset(bitmap
+ hole_start
, 0, hole_end
- hole_start
);
875 memset(bitmap
+ brsize
+ extra_bytes
, 0, bsize
- brsize
- extra_bytes
);
880 #ifdef KVM_CAP_IRQCHIP
882 int kvm_set_irq(int irq
, int level
)
884 return kvm_set_irq_level(kvm_context
, irq
, level
);
889 void qemu_kvm_aio_wait_start(void)
893 void qemu_kvm_aio_wait(void)
895 CPUState
*cpu_single
= cpu_single_env
;
897 if (!cpu_single_env
) {
898 if (io_thread_sigfd
!= -1) {
903 FD_SET(io_thread_sigfd
, &rfds
);
905 /* this is a rare case where we do want to hold qemu_mutex
906 * while sleeping. We cannot allow anything else to run
908 ret
= select(io_thread_sigfd
+ 1, &rfds
, NULL
, NULL
, NULL
);
909 if (ret
> 0 && FD_ISSET(io_thread_sigfd
, &rfds
))
910 sigfd_handler((void *)(unsigned long)io_thread_sigfd
);
914 pthread_cond_wait(&qemu_aio_cond
, &qemu_mutex
);
915 cpu_single_env
= cpu_single
;
919 void qemu_kvm_aio_wait_end(void)
923 int qemu_kvm_get_dirty_pages(unsigned long phys_addr
, void *buf
)
925 return kvm_get_dirty_pages(kvm_context
, phys_addr
, buf
);
928 void *kvm_cpu_create_phys_mem(target_phys_addr_t start_addr
,
929 unsigned long size
, int log
, int writable
)
931 return kvm_create_phys_mem(kvm_context
, start_addr
, size
, log
, writable
);
934 void kvm_cpu_destroy_phys_mem(target_phys_addr_t start_addr
,
937 kvm_destroy_phys_mem(kvm_context
, start_addr
, size
);
940 void kvm_mutex_unlock(void)
942 pthread_mutex_unlock(&qemu_mutex
);
945 void kvm_mutex_lock(void)
947 pthread_mutex_lock(&qemu_mutex
);
948 cpu_single_env
= NULL
;