Don't wait for aio completion signals outside vcpu > 0
[qemu-kvm/fedora.git] / qemu-kvm.c
blob6c8c39162bef39fb85101ae39b05efa33644fe00
2 #include "config.h"
3 #include "config-host.h"
5 #ifdef USE_KVM
6 #define KVM_ALLOWED_DEFAULT 1
7 #else
8 #define KVM_ALLOWED_DEFAULT 0
9 #endif
11 int kvm_allowed = KVM_ALLOWED_DEFAULT;
12 int kvm_irqchip = 1;
14 #ifdef USE_KVM
16 #include <string.h>
17 #include "hw/hw.h"
18 #include "sysemu.h"
20 #include "qemu-kvm.h"
21 #include <libkvm.h>
22 #include <pthread.h>
23 #include <sys/utsname.h>
25 extern void perror(const char *s);
27 kvm_context_t kvm_context;
29 extern int smp_cpus;
31 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
32 pthread_cond_t qemu_aio_cond = PTHREAD_COND_INITIALIZER;
33 __thread CPUState *vcpu_env;
35 static sigset_t io_sigset, io_negsigset;
37 static int wait_hack;
39 #define SIG_IPI (SIGRTMIN+4)
/* Per-vcpu bookkeeping shared between the io thread (vcpu 0) and the
 * AP vcpu threads.
 * NOTE(review): fixed at 4 slots but indexed by env->cpu_index up to
 * smp_cpus - confirm smp_cpus can never exceed 4. */
struct vcpu_info {
    int sipi_needed;  /* a SIPI must be injected before the next run */
    int init;         /* INIT pending; vcpu held out of execution    */
    pthread_t thread; /* vcpu thread id (0 until the thread starts)  */
    int signalled;    /* SIG_IPI sent and not yet consumed           */
    int stop;         /* pause request from the io thread            */
    int stopped;      /* acknowledgement: this vcpu is paused        */
} vcpu_info[4];
50 static void sig_ipi_handler(int n)
54 void kvm_update_interrupt_request(CPUState *env)
56 if (env && env != vcpu_env) {
57 if (vcpu_info[env->cpu_index].signalled)
58 return;
59 vcpu_info[env->cpu_index].signalled = 1;
60 if (vcpu_info[env->cpu_index].thread)
61 pthread_kill(vcpu_info[env->cpu_index].thread, SIG_IPI);
65 void kvm_update_after_sipi(CPUState *env)
67 vcpu_info[env->cpu_index].sipi_needed = 1;
68 kvm_update_interrupt_request(env);
71 * the qemu bios waits using a busy loop that's much too short for
72 * kvm. add a wait after the first sipi.
75 static int first_sipi = 1;
77 if (first_sipi) {
78 wait_hack = 1;
79 first_sipi = 0;
84 void kvm_apic_init(CPUState *env)
86 if (env->cpu_index != 0)
87 vcpu_info[env->cpu_index].init = 1;
88 kvm_update_interrupt_request(env);
91 #include <signal.h>
/* libkvm callback: delegate interrupt injection to the arch layer. */
static int try_push_interrupts(void *opaque)
{
    return kvm_arch_try_push_interrupts(opaque);
}
98 static void post_kvm_run(void *opaque, int vcpu)
101 pthread_mutex_lock(&qemu_mutex);
102 kvm_arch_post_kvm_run(opaque, vcpu);
105 static int pre_kvm_run(void *opaque, int vcpu)
107 CPUState *env = cpu_single_env;
109 if (env->cpu_index == 0 && wait_hack) {
110 int i;
112 wait_hack = 0;
114 pthread_mutex_unlock(&qemu_mutex);
115 for (i = 0; i < 10; ++i)
116 usleep(1000);
117 pthread_mutex_lock(&qemu_mutex);
120 kvm_arch_pre_kvm_run(opaque, vcpu);
122 if (env->interrupt_request & CPU_INTERRUPT_EXIT)
123 return 1;
124 pthread_mutex_unlock(&qemu_mutex);
125 return 0;
128 void kvm_load_registers(CPUState *env)
130 if (kvm_allowed)
131 kvm_arch_load_regs(env);
134 void kvm_save_registers(CPUState *env)
136 if (kvm_allowed)
137 kvm_arch_save_regs(env);
140 int kvm_cpu_exec(CPUState *env)
142 int r;
144 r = kvm_run(kvm_context, env->cpu_index);
145 if (r < 0) {
146 printf("kvm_run returned %d\n", r);
147 exit(1);
150 return 0;
153 extern int vm_running;
155 static int has_work(CPUState *env)
157 if (!vm_running)
158 return 0;
159 if (!(env->hflags & HF_HALTED_MASK))
160 return 1;
161 return kvm_arch_has_work(env);
164 static int kvm_eat_signal(CPUState *env, int timeout)
166 struct timespec ts;
167 int r, e, ret = 0;
168 siginfo_t siginfo;
169 struct sigaction sa;
171 ts.tv_sec = timeout / 1000;
172 ts.tv_nsec = (timeout % 1000) * 1000000;
173 r = sigtimedwait(&io_sigset, &siginfo, &ts);
174 if (r == -1 && (errno == EAGAIN || errno == EINTR) && !timeout)
175 return 0;
176 e = errno;
177 pthread_mutex_lock(&qemu_mutex);
178 cpu_single_env = vcpu_env;
179 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
180 printf("sigtimedwait: %s\n", strerror(e));
181 exit(1);
183 if (r != -1) {
184 sigaction(siginfo.si_signo, NULL, &sa);
185 sa.sa_handler(siginfo.si_signo);
186 if (siginfo.si_signo == SIGUSR2)
187 pthread_cond_signal(&qemu_aio_cond);
188 ret = 1;
190 pthread_mutex_unlock(&qemu_mutex);
192 return ret;
196 static void kvm_eat_signals(CPUState *env, int timeout)
198 int r = 0;
200 while (kvm_eat_signal(env, 0))
201 r = 1;
202 if (!r && timeout) {
203 r = kvm_eat_signal(env, timeout);
204 if (r)
205 while (kvm_eat_signal(env, 0))
209 * we call select() even if no signal was received, to account for
210 * for which there is no signal handler installed.
212 pthread_mutex_lock(&qemu_mutex);
213 cpu_single_env = vcpu_env;
214 main_loop_wait(0);
215 pthread_mutex_unlock(&qemu_mutex);
218 static void kvm_main_loop_wait(CPUState *env, int timeout)
220 pthread_mutex_unlock(&qemu_mutex);
221 if (env->cpu_index == 0)
222 kvm_eat_signals(env, timeout);
223 else {
224 if (!kvm_irqchip_in_kernel(kvm_context) &&
225 (timeout || vcpu_info[env->cpu_index].stopped)) {
226 sigset_t set;
227 int n;
229 paused:
230 sigemptyset(&set);
231 sigaddset(&set, SIG_IPI);
232 sigwait(&set, &n);
233 } else {
234 struct timespec ts;
235 siginfo_t siginfo;
236 sigset_t set;
238 ts.tv_sec = 0;
239 ts.tv_nsec = 0;
240 sigemptyset(&set);
241 sigaddset(&set, SIG_IPI);
242 sigtimedwait(&set, &siginfo, &ts);
244 if (vcpu_info[env->cpu_index].stop) {
245 vcpu_info[env->cpu_index].stop = 0;
246 vcpu_info[env->cpu_index].stopped = 1;
247 pthread_kill(vcpu_info[0].thread, SIG_IPI);
248 goto paused;
251 pthread_mutex_lock(&qemu_mutex);
252 cpu_single_env = env;
253 vcpu_info[env->cpu_index].signalled = 0;
256 static int all_threads_paused(void)
258 int i;
260 for (i = 1; i < smp_cpus; ++i)
261 if (vcpu_info[i].stopped)
262 return 0;
263 return 1;
266 static void pause_other_threads(void)
268 int i;
270 for (i = 1; i < smp_cpus; ++i) {
271 vcpu_info[i].stop = 1;
272 pthread_kill(vcpu_info[i].thread, SIG_IPI);
274 while (!all_threads_paused())
275 kvm_eat_signals(vcpu_env, 0);
278 static void resume_other_threads(void)
280 int i;
282 for (i = 1; i < smp_cpus; ++i) {
283 vcpu_info[i].stop = 0;
284 vcpu_info[i].stopped = 0;
285 pthread_kill(vcpu_info[i].thread, SIG_IPI);
/* VM run-state callback: keep the AP vcpu threads in step with the
 * machine's running/stopped state. */
static void kvm_vm_state_change_handler(void *context, int running)
{
    if (running)
        resume_other_threads();
    else
        pause_other_threads();
}
297 static void update_regs_for_sipi(CPUState *env)
299 kvm_arch_update_regs_for_sipi(env);
300 vcpu_info[env->cpu_index].sipi_needed = 0;
301 vcpu_info[env->cpu_index].init = 0;
304 static void update_regs_for_init(CPUState *env)
306 cpu_reset(env);
307 kvm_arch_load_regs(env);
310 static void setup_kernel_sigmask(CPUState *env)
312 sigset_t set;
314 sigprocmask(SIG_BLOCK, NULL, &set);
315 sigdelset(&set, SIG_IPI);
316 if (env->cpu_index == 0)
317 sigandset(&set, &set, &io_negsigset);
319 kvm_set_signal_mask(kvm_context, env->cpu_index, &set);
322 static int kvm_main_loop_cpu(CPUState *env)
324 struct vcpu_info *info = &vcpu_info[env->cpu_index];
326 setup_kernel_sigmask(env);
327 pthread_mutex_lock(&qemu_mutex);
329 kvm_qemu_init_env(env);
330 env->ready_for_interrupt_injection = 1;
332 cpu_single_env = env;
333 #ifdef TARGET_I386
334 kvm_tpr_opt_setup(env);
335 #endif
336 while (1) {
337 while (!has_work(env))
338 kvm_main_loop_wait(env, 10);
339 if (env->interrupt_request & CPU_INTERRUPT_HARD)
340 env->hflags &= ~HF_HALTED_MASK;
341 if (!kvm_irqchip_in_kernel(kvm_context) && info->sipi_needed)
342 update_regs_for_sipi(env);
343 if (!kvm_irqchip_in_kernel(kvm_context) && info->init)
344 update_regs_for_init(env);
345 if (!(env->hflags & HF_HALTED_MASK) && !info->init)
346 kvm_cpu_exec(env);
347 env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
348 kvm_main_loop_wait(env, 0);
349 if (qemu_shutdown_requested())
350 break;
351 else if (qemu_powerdown_requested())
352 qemu_system_powerdown();
353 else if (qemu_reset_requested()) {
354 env->interrupt_request = 0;
355 qemu_system_reset();
356 kvm_arch_load_regs(env);
359 pthread_mutex_unlock(&qemu_mutex);
360 return 0;
363 static void *ap_main_loop(void *_env)
365 CPUState *env = _env;
366 sigset_t signals;
368 vcpu_env = env;
369 sigfillset(&signals);
370 //sigdelset(&signals, SIG_IPI);
371 sigprocmask(SIG_BLOCK, &signals, NULL);
372 kvm_create_vcpu(kvm_context, env->cpu_index);
373 kvm_qemu_init_env(env);
374 if (kvm_irqchip_in_kernel(kvm_context))
375 env->hflags &= ~HF_HALTED_MASK;
376 kvm_main_loop_cpu(env);
377 return NULL;
380 static void kvm_add_signal(int signum)
382 sigaddset(&io_sigset, signum);
383 sigdelset(&io_negsigset, signum);
384 sigprocmask(SIG_BLOCK, &io_sigset, NULL);
387 int kvm_init_ap(void)
389 CPUState *env = first_cpu->next_cpu;
390 int i;
392 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
393 sigemptyset(&io_sigset);
394 sigfillset(&io_negsigset);
395 kvm_add_signal(SIGIO);
396 kvm_add_signal(SIGALRM);
397 kvm_add_signal(SIGUSR2);
398 if (!kvm_irqchip_in_kernel(kvm_context))
399 kvm_add_signal(SIG_IPI);
401 vcpu_env = first_cpu;
402 signal(SIG_IPI, sig_ipi_handler);
403 for (i = 1; i < smp_cpus; ++i) {
404 pthread_create(&vcpu_info[i].thread, NULL, ap_main_loop, env);
405 env = env->next_cpu;
407 return 0;
410 int kvm_main_loop(void)
412 vcpu_info[0].thread = pthread_self();
413 pthread_mutex_unlock(&qemu_mutex);
414 return kvm_main_loop_cpu(first_cpu);
417 static int kvm_debug(void *opaque, int vcpu)
419 CPUState *env = cpu_single_env;
421 env->exception_index = EXCP_DEBUG;
422 return 1;
/* 8-bit port read, forwarded to qemu's io dispatch. */
static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}
/* 16-bit port read, forwarded to qemu's io dispatch. */
static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}
/* 32-bit port read, forwarded to qemu's io dispatch. */
static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}
443 #define PM_IO_BASE 0xb000
445 static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
447 if (addr == 0xb2) {
448 switch (data) {
449 case 0: {
450 cpu_outb(0, 0xb3, 0);
451 break;
453 case 0xf0: {
454 unsigned x;
456 /* enable acpi */
457 x = cpu_inw(0, PM_IO_BASE + 4);
458 x &= ~1;
459 cpu_outw(0, PM_IO_BASE + 4, x);
460 break;
462 case 0xf1: {
463 unsigned x;
465 /* enable acpi */
466 x = cpu_inw(0, PM_IO_BASE + 4);
467 x |= 1;
468 cpu_outw(0, PM_IO_BASE + 4, x);
469 break;
471 default:
472 break;
474 return 0;
476 cpu_outb(0, addr, data);
477 return 0;
/* 16-bit port write, forwarded to qemu's io dispatch. */
static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}
/* 32-bit port write, forwarded to qemu's io dispatch. */
static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}
/* MMIO read: satisfy it through qemu's physical-memory dispatch. */
static int kvm_mmio_read(void *opaque, uint64_t addr, uint8_t *data, int len)
{
    cpu_physical_memory_rw(addr, data, len, 0);
    return 0;
}
/* MMIO write: apply it through qemu's physical-memory dispatch. */
static int kvm_mmio_write(void *opaque, uint64_t addr, uint8_t *data, int len)
{
    cpu_physical_memory_rw(addr, data, len, 1);
    return 0;
}
/* libkvm callback when the io window opens: always request an exit to
 * userspace. */
static int kvm_io_window(void *opaque)
{
    return 1;
}
/* libkvm halt callback: delegate to the arch layer. */
static int kvm_halt(void *opaque, int vcpu)
{
    return kvm_arch_halt(opaque, vcpu);
}
/* libkvm shutdown callback (e.g. guest triple fault): request a system
 * reset and stop the run loop. */
static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}
521 static struct kvm_callbacks qemu_kvm_ops = {
522 .debug = kvm_debug,
523 .inb = kvm_inb,
524 .inw = kvm_inw,
525 .inl = kvm_inl,
526 .outb = kvm_outb,
527 .outw = kvm_outw,
528 .outl = kvm_outl,
529 .mmio_read = kvm_mmio_read,
530 .mmio_write = kvm_mmio_write,
531 .halt = kvm_halt,
532 .shutdown = kvm_shutdown,
533 .io_window = kvm_io_window,
534 .try_push_interrupts = try_push_interrupts,
535 .post_kvm_run = post_kvm_run,
536 .pre_kvm_run = pre_kvm_run,
537 #ifdef TARGET_I386
538 .tpr_access = handle_tpr_access,
539 #endif
540 #ifdef TARGET_PPC
541 .powerpc_dcr_read = handle_powerpc_dcr_read,
542 .powerpc_dcr_write = handle_powerpc_dcr_write,
543 #endif
546 int kvm_qemu_init()
548 /* Try to initialize kvm */
549 kvm_context = kvm_init(&qemu_kvm_ops, cpu_single_env);
550 if (!kvm_context) {
551 return -1;
553 pthread_mutex_lock(&qemu_mutex);
555 return 0;
558 int kvm_qemu_create_context(void)
560 int r;
561 if (!kvm_irqchip) {
562 kvm_disable_irqchip_creation(kvm_context);
564 if (kvm_create(kvm_context, phys_ram_size, (void**)&phys_ram_base) < 0) {
565 kvm_qemu_destroy();
566 return -1;
568 r = kvm_arch_qemu_create_context();
569 if(r <0)
570 kvm_qemu_destroy();
571 return 0;
574 void kvm_qemu_destroy(void)
576 kvm_finalize(kvm_context);
579 void kvm_cpu_register_physical_memory(target_phys_addr_t start_addr,
580 unsigned long size,
581 unsigned long phys_offset)
583 #ifdef KVM_CAP_USER_MEMORY
584 int r = 0;
586 r = kvm_check_extension(kvm_context, KVM_CAP_USER_MEMORY);
587 if (r) {
588 if (!(phys_offset & ~TARGET_PAGE_MASK)) {
589 r = kvm_is_allocated_mem(kvm_context, start_addr, size);
590 if (r)
591 return;
592 r = kvm_is_intersecting_mem(kvm_context, start_addr);
593 if (r)
594 kvm_create_mem_hole(kvm_context, start_addr, size);
595 r = kvm_register_userspace_phys_mem(kvm_context, start_addr,
596 phys_ram_base + phys_offset,
597 size, 0);
599 if (phys_offset & IO_MEM_ROM) {
600 phys_offset &= ~IO_MEM_ROM;
601 r = kvm_is_intersecting_mem(kvm_context, start_addr);
602 if (r)
603 kvm_create_mem_hole(kvm_context, start_addr, size);
604 r = kvm_register_userspace_phys_mem(kvm_context, start_addr,
605 phys_ram_base + phys_offset,
606 size, 0);
608 if (r < 0) {
609 printf("kvm_cpu_register_physical_memory: failed\n");
610 exit(1);
612 return;
614 #endif
615 if (phys_offset & IO_MEM_ROM) {
616 phys_offset &= ~IO_MEM_ROM;
617 memcpy(phys_ram_base + start_addr, phys_ram_base + phys_offset, size);
621 int kvm_qemu_check_extension(int ext)
623 return kvm_check_extension(kvm_context, ext);
626 int kvm_qemu_init_env(CPUState *cenv)
628 return kvm_arch_qemu_init_env(cenv);
631 int kvm_update_debugger(CPUState *env)
633 struct kvm_debug_guest dbg;
634 int i;
636 dbg.enabled = 0;
637 if (env->nb_breakpoints || env->singlestep_enabled) {
638 dbg.enabled = 1;
639 for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
640 dbg.breakpoints[i].enabled = 1;
641 dbg.breakpoints[i].address = env->breakpoints[i];
643 dbg.singlestep = env->singlestep_enabled;
645 return kvm_guest_debug(kvm_context, env->cpu_index, &dbg);
650 * dirty pages logging
652 /* FIXME: use unsigned long pointer instead of unsigned char */
653 unsigned char *kvm_dirty_bitmap = NULL;
654 int kvm_physical_memory_set_dirty_tracking(int enable)
656 int r = 0;
658 if (!kvm_allowed)
659 return 0;
661 if (enable) {
662 if (!kvm_dirty_bitmap) {
663 unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
664 kvm_dirty_bitmap = qemu_malloc(bitmap_size);
665 if (kvm_dirty_bitmap == NULL) {
666 perror("Failed to allocate dirty pages bitmap");
667 r=-1;
669 else {
670 r = kvm_dirty_pages_log_enable_all(kvm_context);
674 else {
675 if (kvm_dirty_bitmap) {
676 r = kvm_dirty_pages_log_reset(kvm_context);
677 qemu_free(kvm_dirty_bitmap);
678 kvm_dirty_bitmap = NULL;
681 return r;
684 /* get kvm's dirty pages bitmap and update qemu's */
685 int kvm_get_dirty_pages_log_range(unsigned long start_addr,
686 unsigned char *bitmap,
687 unsigned int offset,
688 unsigned long mem_size)
690 unsigned int i, j, n=0;
691 unsigned char c;
692 unsigned page_number, addr, addr1;
693 unsigned int len = ((mem_size/TARGET_PAGE_SIZE) + 7) / 8;
696 * bitmap-traveling is faster than memory-traveling (for addr...)
697 * especially when most of the memory is not dirty.
699 for (i=0; i<len; i++) {
700 c = bitmap[i];
701 while (c>0) {
702 j = ffsl(c) - 1;
703 c &= ~(1u<<j);
704 page_number = i * 8 + j;
705 addr1 = page_number * TARGET_PAGE_SIZE;
706 addr = offset + addr1;
707 cpu_physical_memory_set_dirty(addr);
708 n++;
711 return 0;
/* libkvm per-slot callback: fold one slot's dirty bitmap into qemu's
 * dirty state (offset equals the slot's start address). */
int kvm_get_dirty_bitmap_cb(unsigned long start, unsigned long len,
                            void *bitmap, void *opaque)
{
    return kvm_get_dirty_pages_log_range(start, bitmap, start, len);
}
720 * get kvm's dirty pages bitmap and update qemu's
721 * we only care about physical ram, which resides in slots 0 and 3
723 int kvm_update_dirty_pages_log(void)
725 int r = 0;
728 r = kvm_get_dirty_pages_range(kvm_context, 0, phys_ram_size,
729 kvm_dirty_bitmap, NULL,
730 kvm_get_dirty_bitmap_cb);
731 return r;
734 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap)
736 unsigned int bsize = BITMAP_SIZE(phys_ram_size);
737 unsigned int brsize = BITMAP_SIZE(ram_size);
738 unsigned int extra_pages = (phys_ram_size - ram_size) / TARGET_PAGE_SIZE;
739 unsigned int extra_bytes = (extra_pages +7)/8;
740 unsigned int hole_start = BITMAP_SIZE(0xa0000);
741 unsigned int hole_end = BITMAP_SIZE(0xc0000);
743 memset(bitmap, 0xFF, brsize + extra_bytes);
744 memset(bitmap + hole_start, 0, hole_end - hole_start);
745 memset(bitmap + brsize + extra_bytes, 0, bsize - brsize - extra_bytes);
747 return 0;
750 #ifdef KVM_CAP_IRQCHIP
752 int kvm_set_irq(int irq, int level)
754 return kvm_set_irq_level(kvm_context, irq, level);
757 #endif
759 void qemu_kvm_aio_wait_start(void)
763 void qemu_kvm_aio_wait(void)
765 if (!cpu_single_env || cpu_single_env->cpu_index == 0) {
766 pthread_mutex_unlock(&qemu_mutex);
767 kvm_eat_signal(cpu_single_env, 1000);
768 pthread_mutex_lock(&qemu_mutex);
769 } else {
770 pthread_cond_wait(&qemu_aio_cond, &qemu_mutex);
774 void qemu_kvm_aio_wait_end(void)
778 #endif