/*
 * Copyright (C) 2006-2008 Qumranet Technologies
 *
 * Licensed under the terms of the GNU GPL version 2 or higher.
 */

#include "config-host.h"

#include "qemu-common.h"

#include <sys/utsname.h>
#include <sys/syscall.h>

extern void perror(const char *s);

kvm_context_t kvm_context;

static int qemu_kvm_reset_requested;

pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t qemu_aio_cond = PTHREAD_COND_INITIALIZER;
pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
__thread struct vcpu_info *vcpu;

static int qemu_system_ready;

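/* SIG_IPI is delivered with pthread_kill() to kick a vcpu thread out of
 * the KVM_RUN ioctl so it re-examines its queues; a real-time signal is
 * used so it cannot clash with the signals QEMU already handles. */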
#define SIG_IPI (SIGRTMIN+4)

static pthread_t io_thread;
static int io_thread_fd = -1;
static int io_thread_sigfd = -1;

static inline unsigned long kvm_get_thread_id(void)
{
    return syscall(SYS_gettid);
}

CPUState *qemu_kvm_cpu_env(int index)
{
    return vcpu_info[index].env;
}

static void sig_ipi_handler(int n)
{
}

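/* Let a vcpu know that its CPUState has a pending interrupt: mark it
 * signalled and kick the owning thread with SIG_IPI so it drops out of
 * KVM_RUN and notices. */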
void kvm_update_interrupt_request(CPUState *env)
{
    if (vcpu && env != vcpu->env && !vcpu_info[env->cpu_index].signalled) {
        vcpu_info[env->cpu_index].signalled = 1;
        if (vcpu_info[env->cpu_index].thread)
            pthread_kill(vcpu_info[env->cpu_index].thread, SIG_IPI);
    }
}

void kvm_update_after_sipi(CPUState *env)
{
    vcpu_info[env->cpu_index].sipi_needed = 1;
    kvm_update_interrupt_request(env);
}

void kvm_apic_init(CPUState *env)
{
    if (env->cpu_index != 0)
        vcpu_info[env->cpu_index].init = 1;
    kvm_update_interrupt_request(env);
}

static int try_push_interrupts(void *opaque)
{
    return kvm_arch_try_push_interrupts(opaque);
}

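/* pre_kvm_run/post_kvm_run bracket each KVM_RUN: qemu_mutex is dropped
 * while the guest executes and re-acquired as soon as it exits, so only
 * one thread manipulates QEMU state at a time. */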
static void post_kvm_run(void *opaque, int vcpu)
{
    pthread_mutex_lock(&qemu_mutex);
    kvm_arch_post_kvm_run(opaque, vcpu);
}

static int pre_kvm_run(void *opaque, int vcpu)
{
    CPUState *env = qemu_kvm_cpu_env(vcpu);

    kvm_arch_pre_kvm_run(opaque, vcpu);

    if (env->interrupt_request & CPU_INTERRUPT_EXIT)
        return 1;
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}

void kvm_load_registers(CPUState *env)
{
    kvm_arch_load_regs(env);
}

void kvm_save_registers(CPUState *env)
{
    kvm_arch_save_regs(env);
}

int kvm_cpu_exec(CPUState *env)
{
    int r;

    r = kvm_run(kvm_context, env->cpu_index);
    if (r < 0) {
        printf("kvm_run returned %d\n", r);
        exit(1);
    }

    return 0;
}

extern int vm_running;

static int has_work(CPUState *env)
{
    if (!vm_running || (env && vcpu_info[env->cpu_index].stopped))
        return 0;
    if (!(env->hflags & HF_HALTED_MASK))
        return 1;
    return kvm_arch_has_work(env);
}

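/* Wait up to timeout ms for a SIG_IPI via sigtimedwait(); timeout == 0
 * just polls. This is also where a vcpu acknowledges a stop request from
 * pause_all_threads() by flipping .stop into .stopped. */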
static int kvm_eat_signal(CPUState *env, int timeout)
{
    struct timespec ts;
    int r, e, ret = 0;
    siginfo_t siginfo;
    sigset_t waitset;

    ts.tv_sec = timeout / 1000;
    ts.tv_nsec = (timeout % 1000) * 1000000;
    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    r = sigtimedwait(&waitset, &siginfo, &ts);
    if (r == -1 && (errno == EAGAIN || errno == EINTR) && !timeout)
        return 0;
    e = errno;
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = vcpu->env;
    if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
        printf("sigtimedwait: %s\n", strerror(e));
        exit(1);
    }
    if (r != -1)
        ret = 1;

    if (env && vcpu_info[env->cpu_index].stop) {
        vcpu_info[env->cpu_index].stop = 0;
        vcpu_info[env->cpu_index].stopped = 1;
        pthread_cond_signal(&qemu_pause_cond);
    }
    pthread_mutex_unlock(&qemu_mutex);

    return ret;
}

static void kvm_eat_signals(CPUState *env, int timeout)
{
    int r = 0;

    while (kvm_eat_signal(env, 0))
        r = 1;
    if (!r && timeout) {
        r = kvm_eat_signal(env, timeout);
        if (r)
            while (kvm_eat_signal(env, 0))
                ;
    }
}

static void kvm_main_loop_wait(CPUState *env, int timeout)
{
    pthread_mutex_unlock(&qemu_mutex);
    kvm_eat_signals(env, timeout);
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    vcpu_info[env->cpu_index].signalled = 0;
}

static int all_threads_paused(void)
{
    int i;

    for (i = 0; i < smp_cpus; ++i)
        if (vcpu_info[i].stop)
            return 0;
    return 1;
}

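/* Pause protocol: the requester sets .stop and kicks each vcpu with
 * SIG_IPI, then sleeps on qemu_pause_cond until every vcpu has answered
 * by turning .stop into .stopped inside kvm_eat_signal(). */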
static void pause_all_threads(void)
{
    int i;

    for (i = 0; i < smp_cpus; ++i) {
        vcpu_info[i].stop = 1;
        pthread_kill(vcpu_info[i].thread, SIG_IPI);
    }
    while (!all_threads_paused()) {
        CPUState *env = cpu_single_env;
        pthread_cond_wait(&qemu_pause_cond, &qemu_mutex);
        cpu_single_env = env;
    }
}

static void resume_all_threads(void)
{
    int i;

    for (i = 0; i < smp_cpus; ++i) {
        vcpu_info[i].stop = 0;
        vcpu_info[i].stopped = 0;
        pthread_kill(vcpu_info[i].thread, SIG_IPI);
    }
}

static void kvm_vm_state_change_handler(void *context, int running)
{
    if (running)
        resume_all_threads();
    else
        pause_all_threads();
}

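/* With the irqchip in userspace, INIT and SIPI for the APs are emulated
 * here: INIT resets the vcpu state, and the SIPI that follows loads the
 * startup vector before the AP is allowed to run. */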
static void update_regs_for_sipi(CPUState *env)
{
    kvm_arch_update_regs_for_sipi(env);
    vcpu_info[env->cpu_index].sipi_needed = 0;
    vcpu_info[env->cpu_index].init = 0;
}

static void update_regs_for_init(CPUState *env)
{
    cpu_reset(env);
    kvm_arch_load_regs(env);
}

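/* Block the io-thread signals in this vcpu thread, then hand KVM a mask
 * that unblocks only SIG_IPI while the vcpu sits in KVM_RUN, so the kick
 * is delivered exactly there and nowhere else. */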
static void setup_kernel_sigmask(CPUState *env)
{
    sigset_t set;

    sigemptyset(&set);
    sigaddset(&set, SIGUSR2);
    sigaddset(&set, SIGIO);
    sigaddset(&set, SIGALRM);
    sigprocmask(SIG_BLOCK, &set, NULL);

    sigprocmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);

    kvm_set_signal_mask(kvm_context, env->cpu_index, &set);
}

void qemu_kvm_system_reset_request(void)
{
    int i;

    for (i = 0; i < smp_cpus; ++i) {
        vcpu_info[i].reload_regs = 1;
        pthread_kill(vcpu_info[i].thread, SIG_IPI);
    }
    qemu_system_reset();
}

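/* Per-vcpu execution loop: sleep until there is work, fold any pending
 * SIPI/INIT into the register state, run the guest with kvm_cpu_exec(),
 * then drain signals. qemu_mutex is held except while the guest runs. */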
static int kvm_main_loop_cpu(CPUState *env)
{
    struct vcpu_info *info = &vcpu_info[env->cpu_index];

    setup_kernel_sigmask(env);

    pthread_mutex_lock(&qemu_mutex);
    if (kvm_irqchip_in_kernel(kvm_context))
        env->hflags &= ~HF_HALTED_MASK;

    kvm_qemu_init_env(env);
    env->ready_for_interrupt_injection = 1;

    kvm_tpr_vcpu_start(env);

    cpu_single_env = env;
    while (1) {
        while (!has_work(env))
            kvm_main_loop_wait(env, 1000);
        if (env->interrupt_request & CPU_INTERRUPT_HARD)
            env->hflags &= ~HF_HALTED_MASK;
        if (!kvm_irqchip_in_kernel(kvm_context) && info->sipi_needed)
            update_regs_for_sipi(env);
        if (!kvm_irqchip_in_kernel(kvm_context) && info->init)
            update_regs_for_init(env);
        if (!(env->hflags & HF_HALTED_MASK) && !info->init)
            kvm_cpu_exec(env);
        env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
        kvm_main_loop_wait(env, 0);
        if (info->reload_regs) {
            info->reload_regs = 0;
            if (env->cpu_index == 0) /* ap needs to be placed in INIT */
                kvm_arch_load_regs(env);
        }
    }
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}

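/* Thread body for each vcpu: set up thread-local state, create the kernel
 * vcpu, report creation to kvm_init_new_ap(), then wait for machine init
 * to finish before entering kvm_main_loop_cpu(). */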
static void *ap_main_loop(void *_env)
{
    CPUState *env = _env;
    sigset_t signals;

    vcpu = &vcpu_info[env->cpu_index];
    vcpu->env = env;
    vcpu->env->thread_id = kvm_get_thread_id();
    sigfillset(&signals);
    sigprocmask(SIG_BLOCK, &signals, NULL);
    kvm_create_vcpu(kvm_context, env->cpu_index);
    kvm_qemu_init_env(env);

    /* signal VCPU creation */
    pthread_mutex_lock(&qemu_mutex);
    vcpu->created = 1;
    pthread_cond_signal(&qemu_vcpu_cond);

    /* and wait for machine initialization */
    while (!qemu_system_ready)
        pthread_cond_wait(&qemu_system_cond, &qemu_mutex);
    pthread_mutex_unlock(&qemu_mutex);

    kvm_main_loop_cpu(env);
    return NULL;
}

void kvm_init_new_ap(int cpu, CPUState *env)
{
    pthread_create(&vcpu_info[cpu].thread, NULL, ap_main_loop, env);

    while (vcpu_info[cpu].created == 0)
        pthread_cond_wait(&qemu_vcpu_cond, &qemu_mutex);
}

int kvm_init_ap(void)
{
    qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);

    signal(SIG_IPI, sig_ipi_handler);
    return 0;
}

void qemu_kvm_notify_work(void)
{
    uint64_t value = 1;
    char buffer[8];
    size_t offset = 0;

    if (io_thread_fd == -1)
        return;

    memcpy(buffer, &value, sizeof(value));

    while (offset < 8) {
        ssize_t len;

        len = write(io_thread_fd, buffer + offset, 8 - offset);
        if (len == -1 && errno == EINTR)
            continue;

        if (len <= 0)
            break;

        offset += len;
    }

    if (offset != 8)
        fprintf(stderr, "failed to notify io thread\n");
}

static int received_signal;

/* QEMU relies on periodically breaking out of select via EINTR to poll for IO
   and timer signals.  Since we're now using a file descriptor to handle
   signals, select() won't be interrupted by a signal.  We need to forcefully
   break the select() loop when a signal is received hence
   kvm_check_received_signal(). */

int kvm_check_received_signal(void)
{
    if (received_signal) {
        received_signal = 0;
        return 1;
    }

    return 0;
}

/* If we have signalfd, we mask out the signals we want to handle and then
 * use signalfd to listen for them.  We rely on whatever the current signal
 * handler is to dispatch the signals when we receive them.
 */

static void sigfd_handler(void *opaque)
{
    int fd = (unsigned long)opaque;
    struct signalfd_siginfo info;
    struct sigaction action;
    ssize_t len;

    while (1) {
        do {
            len = read(fd, &info, sizeof(info));
        } while (len == -1 && errno == EINTR);

        if (len == -1 && errno == EAGAIN)
            break;

        if (len != sizeof(info)) {
            printf("read from sigfd returned %ld: %m\n", len);
            return;
        }

        sigaction(info.ssi_signo, NULL, &action);
        if (action.sa_handler)
            action.sa_handler(info.ssi_signo);

        if (info.ssi_signo == SIGUSR2) {
            pthread_cond_signal(&qemu_aio_cond);
        }

        received_signal = 1;
    }
}

/* Used to break IO thread out of select */
static void io_thread_wakeup(void *opaque)
{
    int fd = (unsigned long)opaque;
    char buffer[8];
    size_t offset = 0;

    while (offset < 8) {
        ssize_t len;

        len = read(fd, buffer + offset, 8 - offset);
        if (len == -1 && errno == EINTR)
            continue;

        if (len <= 0)
            break;

        offset += len;
    }

    received_signal = 1;
}

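/* Main loop of the io thread: every fd and timer is serviced here while
 * the vcpu threads run the guest; the eventfd and signalfd set up below
 * feed it wakeups so select() need not rely on EINTR. */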
int kvm_main_loop(void)
{
    int fds[2];
    sigset_t mask;
    int sigfd;

    io_thread = pthread_self();
    qemu_system_ready = 1;

    if (kvm_eventfd(fds) == -1) {
        fprintf(stderr, "failed to create eventfd\n");
        return -errno;
    }

    qemu_set_fd_handler2(fds[0], NULL, io_thread_wakeup, NULL,
                         (void *)(unsigned long)fds[0]);

    io_thread_fd = fds[1];

    sigemptyset(&mask);
    sigaddset(&mask, SIGIO);
    sigaddset(&mask, SIGALRM);
    sigaddset(&mask, SIGUSR2);
    sigprocmask(SIG_BLOCK, &mask, NULL);

    sigfd = kvm_signalfd(&mask);
    if (sigfd == -1) {
        fprintf(stderr, "failed to create signalfd\n");
        return -errno;
    }

    fcntl(sigfd, F_SETFL, O_NONBLOCK);

    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
                         (void *)(unsigned long)sigfd);

    pthread_cond_broadcast(&qemu_system_cond);

    io_thread_sigfd = sigfd;
    cpu_single_env = NULL;

    while (1) {
        main_loop_wait(1000);
        if (qemu_shutdown_requested())
            break;
        else if (qemu_powerdown_requested())
            qemu_system_powerdown();
        else if (qemu_reset_requested()) {
            pthread_kill(vcpu_info[0].thread, SIG_IPI);
            qemu_kvm_reset_requested = 1;
        }
    }

    pause_all_threads();
    pthread_mutex_unlock(&qemu_mutex);

    return 0;
}

static int kvm_debug(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    env->exception_index = EXCP_DEBUG;
    return 1;
}

static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}

static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}

static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}

#define PM_IO_BASE 0xb000

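/* Writes to port 0xb2 (the APM/SMI command port) are special-cased below:
 * values 0xf0/0xf1 toggle the SCI_EN bit in PMCNTRL at PM_IO_BASE + 4,
 * switching ACPI off/on directly instead of going through SMM. */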
static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
{
    if (addr == 0xb2) {
        unsigned x;

        switch (data) {
        case 0:
            cpu_outb(0, 0xb3, 0);
            break;
        case 0xf0:
            /* disable ACPI: clear SCI_EN */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x &= ~1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        case 0xf1:
            /* enable ACPI: set SCI_EN */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x |= 1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        default:
            break;
        }
        return 0;
    }
    cpu_outb(0, addr, data);
    return 0;
}

static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}

static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}

static int kvm_mmio_read(void *opaque, uint64_t addr, uint8_t *data, int len)
{
    cpu_physical_memory_rw(addr, data, len, 0);
    return 0;
}

static int kvm_mmio_write(void *opaque, uint64_t addr, uint8_t *data, int len)
{
    cpu_physical_memory_rw(addr, data, len, 1);
    return 0;
}

static int kvm_io_window(void *opaque)
{
    return 1;
}

static int kvm_halt(void *opaque, int vcpu)
{
    return kvm_arch_halt(opaque, vcpu);
}

static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}

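/* Callback table handed to libkvm at kvm_init() time; it routes guest io,
 * mmio and run-loop exits back into QEMU. */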
static struct kvm_callbacks qemu_kvm_ops = {
    .debug = kvm_debug,
    .inb   = kvm_inb,
    .inw   = kvm_inw,
    .inl   = kvm_inl,
    .outb  = kvm_outb,
    .outw  = kvm_outw,
    .outl  = kvm_outl,
    .mmio_read = kvm_mmio_read,
    .mmio_write = kvm_mmio_write,
    .halt  = kvm_halt,
    .shutdown = kvm_shutdown,
    .io_window = kvm_io_window,
    .try_push_interrupts = try_push_interrupts,
    .post_kvm_run = post_kvm_run,
    .pre_kvm_run = pre_kvm_run,
#ifdef TARGET_I386
    .tpr_access = handle_tpr_access,
#endif
#ifdef TARGET_PPC
    .powerpc_dcr_read = handle_powerpc_dcr_read,
    .powerpc_dcr_write = handle_powerpc_dcr_write,
#endif
};

int kvm_qemu_init(void)
{
    /* Try to initialize kvm */
    kvm_context = kvm_init(&qemu_kvm_ops, cpu_single_env);
    if (!kvm_context) {
        return -1;
    }
    pthread_mutex_lock(&qemu_mutex);

    return 0;
}

int kvm_qemu_create_context(void)
{
    int r;

    if (!kvm_irqchip) {
        kvm_disable_irqchip_creation(kvm_context);
    }
    if (!kvm_pit) {
        kvm_disable_pit_creation(kvm_context);
    }
    if (kvm_create(kvm_context, phys_ram_size, (void **)&phys_ram_base) < 0) {
        kvm_qemu_destroy();
        return -1;
    }
    r = kvm_arch_qemu_create_context();
    if (r < 0)
        kvm_qemu_destroy();
    return r;
}

void kvm_qemu_destroy(void)
{
    kvm_finalize(kvm_context);
}

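/* Register a guest physical range with kvm. With KVM_CAP_USER_MEMORY the
 * qemu RAM that backs it is handed directly to the kernel (carving a hole
 * first if it intersects an existing slot); without it, ROM contents are
 * simply copied into the RAM already mapped at start_addr. */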
void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr,
                                      unsigned long size,
                                      unsigned long phys_offset)
{
#ifdef KVM_CAP_USER_MEMORY
    int r = 0;

    r = kvm_check_extension(kvm_context, KVM_CAP_USER_MEMORY);
    if (r) {
        if (!(phys_offset & ~TARGET_PAGE_MASK)) {
            r = kvm_is_allocated_mem(kvm_context, start_addr, size);
            if (r)
                return;
            r = kvm_is_intersecting_mem(kvm_context, start_addr);
            if (r)
                kvm_create_mem_hole(kvm_context, start_addr, size);
            r = kvm_register_userspace_phys_mem(kvm_context, start_addr,
                                                phys_ram_base + phys_offset,
                                                size, 0);
        }
        if (phys_offset & IO_MEM_ROM) {
            phys_offset &= ~IO_MEM_ROM;
            r = kvm_is_intersecting_mem(kvm_context, start_addr);
            if (r)
                kvm_create_mem_hole(kvm_context, start_addr, size);
            r = kvm_register_userspace_phys_mem(kvm_context, start_addr,
                                                phys_ram_base + phys_offset,
                                                size, 0);
        }
        if (r < 0) {
            printf("kvm_cpu_register_physical_memory: failed\n");
            exit(1);
        }
        return;
    }
#endif
    if (phys_offset & IO_MEM_ROM) {
        phys_offset &= ~IO_MEM_ROM;
        memcpy(phys_ram_base + start_addr, phys_ram_base + phys_offset, size);
    }
}

int kvm_qemu_check_extension(int ext)
{
    return kvm_check_extension(kvm_context, ext);
}

int kvm_qemu_init_env(CPUState *cenv)
{
    return kvm_arch_qemu_init_env(cenv);
}

int kvm_update_debugger(CPUState *env)
{
    struct kvm_debug_guest dbg;
    int i;

    memset(&dbg, 0, sizeof dbg);
    if (env->nb_breakpoints || env->singlestep_enabled) {
        dbg.enabled = 1;
        for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
            dbg.breakpoints[i].enabled = 1;
            dbg.breakpoints[i].address = env->breakpoints[i];
        }
        dbg.singlestep = env->singlestep_enabled;
    }
    return kvm_guest_debug(kvm_context, env->cpu_index, &dbg);
}

/*
 * dirty pages logging
 */
/* FIXME: use unsigned long pointer instead of unsigned char */
unsigned char *kvm_dirty_bitmap = NULL;
int kvm_physical_memory_set_dirty_tracking(int enable)
{
    int r = 0;

    if (enable) {
        if (!kvm_dirty_bitmap) {
            unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
            kvm_dirty_bitmap = qemu_malloc(bitmap_size);
            if (kvm_dirty_bitmap == NULL) {
                perror("Failed to allocate dirty pages bitmap");
                r = -1;
            } else {
                r = kvm_dirty_pages_log_enable_all(kvm_context);
            }
        }
    } else {
        if (kvm_dirty_bitmap) {
            r = kvm_dirty_pages_log_reset(kvm_context);
            qemu_free(kvm_dirty_bitmap);
            kvm_dirty_bitmap = NULL;
        }
    }
    return r;
}

/* get kvm's dirty pages bitmap and update qemu's */
int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                  unsigned char *bitmap,
                                  unsigned long offset,
                                  unsigned long mem_size)
{
    unsigned int i, j, n = 0;
    unsigned char c;
    unsigned page_number, addr, addr1;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + 7) / 8;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        c = bitmap[i];
        while (c > 0) {
            j = ffsl(c) - 1;
            c &= ~(1u << j);
            page_number = i * 8 + j;
            addr1 = page_number * TARGET_PAGE_SIZE;
            addr = offset + addr1;
            cpu_physical_memory_set_dirty(addr);
            n++;
        }
    }
    return 0;
}

int kvm_get_dirty_bitmap_cb(unsigned long start, unsigned long len,
                            void *bitmap, void *opaque)
{
    return kvm_get_dirty_pages_log_range(start, bitmap, start, len);
}

/*
 * get kvm's dirty pages bitmap and update qemu's
 * we only care about physical ram, which resides in slots 0 and 3
 */
int kvm_update_dirty_pages_log(void)
{
    int r = 0;

    r = kvm_get_dirty_pages_range(kvm_context, 0, phys_ram_size,
                                  kvm_dirty_bitmap, NULL,
                                  kvm_get_dirty_bitmap_cb);
    return r;
}

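/* Build the "allocated RAM" page bitmap used by migration: every page of
 * real RAM plus the extra (VGA/BIOS) pages is marked, except the
 * 0xa0000-0xc0000 VGA hole and the tail beyond what was allocated. */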
int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap)
{
    unsigned int bsize  = BITMAP_SIZE(phys_ram_size);
    unsigned int brsize = BITMAP_SIZE(ram_size);
    unsigned int extra_pages = (phys_ram_size - ram_size) / TARGET_PAGE_SIZE;
    unsigned int extra_bytes = (extra_pages + 7) / 8;
    unsigned int hole_start = BITMAP_SIZE(0xa0000);
    unsigned int hole_end = BITMAP_SIZE(0xc0000);

    memset(bitmap, 0xFF, brsize + extra_bytes);
    memset(bitmap + hole_start, 0, hole_end - hole_start);
    memset(bitmap + brsize + extra_bytes, 0, bsize - brsize - extra_bytes);

    return 0;
}

#ifdef KVM_CAP_IRQCHIP

int kvm_set_irq(int irq, int level)
{
    return kvm_set_irq_level(kvm_context, irq, level);
}

#endif

void qemu_kvm_aio_wait_start(void)
{
}

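/* Wait for one aio completion: a vcpu thread sleeps on qemu_aio_cond
 * (signalled from the SIGUSR2 path in sigfd_handler), while the io thread
 * itself must keep draining its own signalfd or it would deadlock. */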
void qemu_kvm_aio_wait(void)
{
    CPUState *cpu_single = cpu_single_env;

    if (!cpu_single_env) {
        if (io_thread_sigfd != -1) {
            fd_set rfds;
            int ret;

            FD_ZERO(&rfds);
            FD_SET(io_thread_sigfd, &rfds);

            /* this is a rare case where we do want to hold qemu_mutex
             * while sleeping.  We cannot allow anything else to run
             * right now. */
            ret = select(io_thread_sigfd + 1, &rfds, NULL, NULL, NULL);
            if (ret > 0 && FD_ISSET(io_thread_sigfd, &rfds))
                sigfd_handler((void *)(unsigned long)io_thread_sigfd);
        }
    } else {
        pthread_cond_wait(&qemu_aio_cond, &qemu_mutex);
        cpu_single_env = cpu_single;
    }
}

void qemu_kvm_aio_wait_end(void)
{
}

int qemu_kvm_get_dirty_pages(unsigned long phys_addr, void *buf)
{
    return kvm_get_dirty_pages(kvm_context, phys_addr, buf);
}

void *kvm_cpu_create_phys_mem(target_phys_addr_t start_addr,
                              unsigned long size, int log, int writable)
{
    return kvm_create_phys_mem(kvm_context, start_addr, size, log, writable);
}

void kvm_cpu_destroy_phys_mem(target_phys_addr_t start_addr,
                              unsigned long size)
{
    kvm_destroy_phys_mem(kvm_context, start_addr, size);
}

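/* qemu_mutex is the big lock: anything outside the io thread must take it
 * (and clear cpu_single_env) before touching global QEMU state. */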
void kvm_mutex_unlock(void)
{
    pthread_mutex_unlock(&qemu_mutex);
}

void kvm_mutex_lock(void)
{
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = NULL;
}