kvm: external module: update for slab updates
[qemu-kvm/fedora.git] / qemu-kvm.c
blob 4ba93d8dc1d1d437e048bc9adecd1affea5564f2
2 #include "config.h"
3 #include "config-host.h"
5 #ifdef USE_KVM
6 #define KVM_ALLOWED_DEFAULT 1
7 #else
8 #define KVM_ALLOWED_DEFAULT 0
9 #endif
11 int kvm_allowed = KVM_ALLOWED_DEFAULT;
13 #ifdef USE_KVM
15 #include <string.h>
16 #include "vl.h"
18 #include "qemu-kvm.h"
19 #include <kvmctl.h>
20 #include <pthread.h>
21 #include <sys/utsname.h>
23 #define MSR_IA32_TSC 0x10
25 extern void perror(const char *s);
27 kvm_context_t kvm_context;
28 static struct kvm_msr_list *kvm_msr_list;
29 static int kvm_has_msr_star;
31 extern int smp_cpus;
33 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
34 static __thread CPUState *vcpu_env;
36 static sigset_t io_sigset, io_negsigset;
38 static int wait_hack;
40 #define SIG_IPI (SIGRTMIN+4)
42 struct vcpu_info {
43 int sipi_needed;
44 int init;
45 pthread_t thread;
46 int signalled;
47 } vcpu_info[4];
static void sig_ipi_handler(int n)
{
}
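
/*
 * Prod another vcpu thread out of kvm_run() so it re-evaluates its
 * pending interrupts; the signalled flag avoids sending a redundant
 * SIG_IPI to a thread that has already been poked.
 */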
void kvm_update_interrupt_request(CPUState *env)
{
    if (env && env != vcpu_env) {
        if (vcpu_info[env->cpu_index].signalled)
            return;
        vcpu_info[env->cpu_index].signalled = 1;
        if (vcpu_info[env->cpu_index].thread)
            pthread_kill(vcpu_info[env->cpu_index].thread, SIG_IPI);
    }
}

void kvm_update_after_sipi(CPUState *env)
{
    vcpu_info[env->cpu_index].sipi_needed = 1;
    kvm_update_interrupt_request(env);

    /*
     * the qemu bios waits using a busy loop that's much too short for
     * kvm.  add a wait after the first sipi.
     */
    {
        static int first_sipi = 1;

        if (first_sipi) {
            wait_hack = 1;
            first_sipi = 0;
        }
    }
}

void kvm_apic_init(CPUState *env)
{
    vcpu_info[env->cpu_index].init = 1;
    kvm_update_interrupt_request(env);
}

static void set_msr_entry(struct kvm_msr_entry *entry, uint32_t index,
                          uint64_t data)
{
    entry->index = index;
    entry->data = data;
}

/* returns 0 on success, non-0 on failure */
static int get_msr_entry(struct kvm_msr_entry *entry, CPUState *env)
{
    switch (entry->index) {
    case MSR_IA32_SYSENTER_CS:
        env->sysenter_cs = entry->data;
        break;
    case MSR_IA32_SYSENTER_ESP:
        env->sysenter_esp = entry->data;
        break;
    case MSR_IA32_SYSENTER_EIP:
        env->sysenter_eip = entry->data;
        break;
    case MSR_STAR:
        env->star = entry->data;
        break;
#ifdef TARGET_X86_64
    case MSR_CSTAR:
        env->cstar = entry->data;
        break;
    case MSR_KERNELGSBASE:
        env->kernelgsbase = entry->data;
        break;
    case MSR_FMASK:
        env->fmask = entry->data;
        break;
    case MSR_LSTAR:
        env->lstar = entry->data;
        break;
#endif
    case MSR_IA32_TSC:
        env->tsc = entry->data;
        break;
    default:
        printf("Warning: unknown msr index 0x%x\n", entry->index);
        return 1;
    }
    return 0;
}

#ifdef TARGET_X86_64
#define MSR_COUNT 9
#else
#define MSR_COUNT 5
#endif
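
/*
 * Note: MSR_COUNT must match the number of set_msr_entry() calls in
 * load_regs() below (MSR_STAR is counted even though it is transferred
 * only when the kernel reports it in its MSR list).
 */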

static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}
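
/*
 * set_seg()/get_seg() translate between qemu's packed SegmentCache.flags
 * word and the discrete bitfields of struct kvm_segment; set_v8086_seg()
 * above instead hardcodes the fixed attributes vm86 mode requires.
 */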
static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
    lhs->dpl = rhs->selector & 3;
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
}

static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->flags =
        (rhs->type << DESC_TYPE_SHIFT)
        | (rhs->present * DESC_P_MASK)
        | (rhs->dpl << DESC_DPL_SHIFT)
        | (rhs->db << DESC_B_SHIFT)
        | (rhs->s * DESC_S_MASK)
        | (rhs->l << DESC_L_SHIFT)
        | (rhs->g * DESC_G_MASK)
        | (rhs->avl * DESC_AVL_MASK);
}

/* the reset values of qemu are not compatible with SVM;
 * this function is used to fix the segment descriptor values */
static void fix_realmode_dataseg(struct kvm_segment *seg)
{
    seg->type = 0x02;
    seg->present = 1;
    seg->s = 1;
}
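
/*
 * Copy the complete qemu-side cpu state (general registers, fpu state,
 * special registers and msrs) into the in-kernel vcpu.
 */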
static void load_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_fpu fpu;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    int rc, n, i;

    regs.rax = env->regs[R_EAX];
    regs.rbx = env->regs[R_EBX];
    regs.rcx = env->regs[R_ECX];
    regs.rdx = env->regs[R_EDX];
    regs.rsi = env->regs[R_ESI];
    regs.rdi = env->regs[R_EDI];
    regs.rsp = env->regs[R_ESP];
    regs.rbp = env->regs[R_EBP];
#ifdef TARGET_X86_64
    regs.r8 = env->regs[8];
    regs.r9 = env->regs[9];
    regs.r10 = env->regs[10];
    regs.r11 = env->regs[11];
    regs.r12 = env->regs[12];
    regs.r13 = env->regs[13];
    regs.r14 = env->regs[14];
    regs.r15 = env->regs[15];
#endif

    regs.rflags = env->eflags;
    regs.rip = env->eip;

    kvm_set_regs(kvm_context, env->cpu_index, &regs);

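    /* kvm expects the fxsave-style abridged x87 tag word (one bit per
     * register, 1 = valid), while qemu keeps a byte per register with
     * 1 = empty; hence the inversion below. */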
    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
    for (i = 0; i < 8; ++i)
        fpu.ftwx |= (!env->fptags[i]) << i;
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;
    kvm_set_fpu(kvm_context, env->cpu_index, &fpu);

    memcpy(sregs.interrupt_bitmap, env->kvm_interrupt_bitmap, sizeof(sregs.interrupt_bitmap));

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);

        if (env->cr[0] & CR0_PE_MASK) {
            /* force ss cpl to cs cpl */
            sregs.ss.selector = (sregs.ss.selector & ~3) |
                                (sregs.cs.selector & 3);
            sregs.ss.dpl = sregs.ss.selector & 3;
        }

        if (!(env->cr[0] & CR0_PG_MASK)) {
            fix_realmode_dataseg(&sregs.cs);
            fix_realmode_dataseg(&sregs.ds);
            fix_realmode_dataseg(&sregs.es);
            fix_realmode_dataseg(&sregs.fs);
            fix_realmode_dataseg(&sregs.gs);
            fix_realmode_dataseg(&sregs.ss);
        }
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.apic_base = cpu_get_apic_base(env);
    sregs.efer = env->efer;
    sregs.cr8 = cpu_get_apic_tpr(env);

    kvm_set_sregs(kvm_context, env->cpu_index, &sregs);

    /* msrs */
    n = 0;
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    if (kvm_has_msr_star)
        set_msr_entry(&msrs[n++], MSR_STAR, env->star);
    set_msr_entry(&msrs[n++], MSR_IA32_TSC, env->tsc);
#ifdef TARGET_X86_64
    set_msr_entry(&msrs[n++], MSR_CSTAR, env->cstar);
    set_msr_entry(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
    set_msr_entry(&msrs[n++], MSR_FMASK, env->fmask);
    set_msr_entry(&msrs[n++], MSR_LSTAR, env->lstar);
#endif

    rc = kvm_set_msrs(kvm_context, env->cpu_index, msrs, n);
    if (rc == -1)
        perror("kvm_set_msrs FAILED");
}
static void save_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_fpu fpu;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    uint32_t hflags;
    uint32_t i, n, rc;

    kvm_get_regs(kvm_context, env->cpu_index, &regs);

    env->regs[R_EAX] = regs.rax;
    env->regs[R_EBX] = regs.rbx;
    env->regs[R_ECX] = regs.rcx;
    env->regs[R_EDX] = regs.rdx;
    env->regs[R_ESI] = regs.rsi;
    env->regs[R_EDI] = regs.rdi;
    env->regs[R_ESP] = regs.rsp;
    env->regs[R_EBP] = regs.rbp;
#ifdef TARGET_X86_64
    env->regs[8] = regs.r8;
    env->regs[9] = regs.r9;
    env->regs[10] = regs.r10;
    env->regs[11] = regs.r11;
    env->regs[12] = regs.r12;
    env->regs[13] = regs.r13;
    env->regs[14] = regs.r14;
    env->regs[15] = regs.r15;
#endif

    env->eflags = regs.rflags;
    env->eip = regs.rip;

    kvm_get_fpu(kvm_context, env->cpu_index, &fpu);
    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
    for (i = 0; i < 8; ++i)
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    kvm_get_sregs(kvm_context, env->cpu_index, &sregs);

    memcpy(env->kvm_interrupt_bitmap, sregs.interrupt_bitmap, sizeof(env->kvm_interrupt_bitmap));

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    cpu_set_apic_base(env, sregs.apic_base);

    env->efer = sregs.efer;
    //cpu_set_apic_tpr(env, sregs.cr8);
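
    /* hflags caches frequently tested bits derived from cr0/cr4, eflags
     * and the segment descriptors; recompute those bits here and keep
     * the rest of env->hflags via HFLAG_COPY_MASK. */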
#define HFLAG_COPY_MASK ~( \
        HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
        HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
        HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
        HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
                  (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
                  (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
                      (DESC_B_SHIFT - HF_CS32_SHIFT);
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
                      (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) ||
            (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base |
                        env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) <<
                          HF_ADDSEG_SHIFT;
        }
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;
    env->cc_src = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
    env->df = 1 - (2 * ((env->eflags >> 10) & 1));
    env->cc_op = CC_OP_EFLAGS;
    env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);

    /* msrs */
    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
    if (kvm_has_msr_star)
        msrs[n++].index = MSR_STAR;
    msrs[n++].index = MSR_IA32_TSC;
#ifdef TARGET_X86_64
    msrs[n++].index = MSR_CSTAR;
    msrs[n++].index = MSR_KERNELGSBASE;
    msrs[n++].index = MSR_FMASK;
    msrs[n++].index = MSR_LSTAR;
#endif
    rc = kvm_get_msrs(kvm_context, env->cpu_index, msrs, n);
    if (rc == -1) {
        perror("kvm_get_msrs FAILED");
    } else {
        n = rc; /* actual number of MSRs */
        for (i = 0; i < n; ++i) {
            if (get_msr_entry(&msrs[i], env))
                return;
        }
    }
}

#include <signal.h>

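/*
 * libkvm callback, invoked with qemu_mutex held just before entering the
 * guest: inject at most one pending PIC interrupt if the guest can accept
 * it, and return nonzero if more interrupts remain so that an
 * interrupt-window exit will be requested.
 */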
static int try_push_interrupts(void *opaque)
{
    CPUState *env = cpu_single_env;
    int r, irq;

    if (env->ready_for_interrupt_injection &&
        (env->interrupt_request & CPU_INTERRUPT_HARD) &&
        (env->eflags & IF_MASK)) {
        env->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            r = kvm_inject_irq(kvm_context, env->cpu_index, irq);
            if (r < 0)
                printf("cpu %d fail inject %x\n", env->cpu_index, irq);
        }
    }

    return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0;
}

static void post_kvm_run(void *opaque, int vcpu)
{
    CPUState *env = vcpu_env;

    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    env->eflags = kvm_get_interrupt_flag(kvm_context, vcpu)
        ? env->eflags | IF_MASK : env->eflags & ~IF_MASK;
    env->ready_for_interrupt_injection
        = kvm_is_ready_for_interrupt_injection(kvm_context, vcpu);
    //cpu_set_apic_tpr(env, kvm_run->cr8);
    cpu_set_apic_base(env, kvm_get_apic_base(kvm_context, vcpu));
}
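
/*
 * Locking protocol: qemu_mutex is held whenever a thread executes qemu
 * code.  pre_kvm_run() drops it just before the guest runs and
 * post_kvm_run() (above) reacquires it on the way back out.
 */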
static int pre_kvm_run(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    if (env->cpu_index == 0 && wait_hack) {
        int i;

        wait_hack = 0;

        pthread_mutex_unlock(&qemu_mutex);
        for (i = 0; i < 10; ++i)
            usleep(1000);
        pthread_mutex_lock(&qemu_mutex);
    }

    kvm_set_cr8(kvm_context, vcpu, cpu_get_apic_tpr(env));
    if (env->interrupt_request & CPU_INTERRUPT_EXIT)
        return 1;
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}

void kvm_load_registers(CPUState *env)
{
    if (kvm_allowed)
        load_regs(env);
}

void kvm_save_registers(CPUState *env)
{
    if (kvm_allowed)
        save_regs(env);
}

int kvm_cpu_exec(CPUState *env)
{
    int r;

    r = kvm_run(kvm_context, env->cpu_index);
    if (r < 0) {
        printf("kvm_run returned %d\n", r);
        exit(1);
    }

    return 0;
}

extern int vm_running;

static int has_work(CPUState *env)
{
    if (!vm_running)
        return 0;
    if (!(env->hflags & HF_HALTED_MASK))
        return 1;
    if (env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXIT))
        return 1;
    return 0;
}
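
/*
 * Wait up to 'timeout' milliseconds for one of the I/O signals and run
 * its registered handler under qemu_mutex; returns 1 if a signal was
 * dispatched, 0 on timeout.
 */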
static int kvm_eat_signal(CPUState *env, int timeout)
{
    struct timespec ts;
    int r, e, ret = 0;
    siginfo_t siginfo;
    struct sigaction sa;

    ts.tv_sec = timeout / 1000;
    ts.tv_nsec = (timeout % 1000) * 1000000;
    r = sigtimedwait(&io_sigset, &siginfo, &ts);
    if (r == -1 && (errno == EAGAIN || errno == EINTR) && !timeout)
        return 0;
    e = errno;
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = vcpu_env;
    if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
        printf("sigtimedwait: %s\n", strerror(e));
        exit(1);
    }
    if (r != -1) {
        sigaction(siginfo.si_signo, NULL, &sa);
        sa.sa_handler(siginfo.si_signo);
        ret = 1;
    }
    pthread_mutex_unlock(&qemu_mutex);

    return ret;
}

static int kvm_eat_signals(CPUState *env, int timeout)
{
    int r = 0;

    while (kvm_eat_signal(env, 0))
        r = 1;
    if (!r && timeout) {
        r = kvm_eat_signal(env, timeout);
        if (r)
            while (kvm_eat_signal(env, 0))
                ;
    }

    /*
     * we call select() even if no signal was received, to account for
     * fds for which there is no signal handler installed.
     */
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = vcpu_env;
    main_loop_wait(0);
    pthread_mutex_unlock(&qemu_mutex);
}

static void kvm_main_loop_wait(CPUState *env, int timeout)
{
    if (vcpu_info[env->cpu_index].signalled && timeout)
        goto shortcut;
    pthread_mutex_unlock(&qemu_mutex);
    if (env->cpu_index == 0)
        kvm_eat_signals(env, timeout);
    else
        if (timeout) {
            sigset_t set;
            int n;

            sigemptyset(&set);
            sigaddset(&set, SIG_IPI);
            sigwait(&set, &n);
        }
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
shortcut:
    vcpu_info[env->cpu_index].signalled = 0;
}
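
/*
 * A SIPI puts the vcpu into real mode with CS derived from the vector
 * (already stored in env by the apic emulation): resync everything else
 * from the kernel, then restart execution at offset 0.
 */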
static void update_regs_for_sipi(CPUState *env)
{
    SegmentCache cs = env->segs[R_CS];

    save_regs(env);
    env->segs[R_CS] = cs;
    env->eip = 0;
    load_regs(env);
    vcpu_info[env->cpu_index].sipi_needed = 0;
    vcpu_info[env->cpu_index].init = 0;
}

static void update_regs_for_init(CPUState *env)
{
    cpu_reset(env);
    load_regs(env);
}
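
/*
 * Choose which signals the kernel may deliver to this thread while it is
 * inside kvm_run(): SIG_IPI for every vcpu, plus the I/O signals for
 * vcpu 0, which services them in kvm_eat_signals().
 */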
static void setup_kernel_sigmask(CPUState *env)
{
    sigset_t set;

    sigprocmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    if (env->cpu_index == 0)
        sigandset(&set, &set, &io_negsigset);

    kvm_set_signal_mask(kvm_context, env->cpu_index, &set);
}

static int kvm_main_loop_cpu(CPUState *env)
{
    struct vcpu_info *info = &vcpu_info[env->cpu_index];

    setup_kernel_sigmask(env);
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    while (1) {
        while (!has_work(env))
            kvm_main_loop_wait(env, 10);
        if (env->interrupt_request & CPU_INTERRUPT_HARD)
            env->hflags &= ~HF_HALTED_MASK;
        if (info->sipi_needed)
            update_regs_for_sipi(env);
        if (info->init)
            update_regs_for_init(env);
        if (!(env->hflags & HF_HALTED_MASK) && !info->init)
            kvm_cpu_exec(env);
        env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
        kvm_main_loop_wait(env, 0);
        if (qemu_shutdown_requested())
            break;
        else if (qemu_powerdown_requested())
            qemu_system_powerdown();
        else if (qemu_reset_requested()) {
            env->interrupt_request = 0;
            qemu_system_reset();
            load_regs(env);
        }
    }
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
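
/*
 * Thread body for each secondary (AP) vcpu.  All signals are blocked in
 * userspace; they are delivered through the in-kernel signal mask set up
 * by kvm_main_loop_cpu() instead.
 */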
static void *ap_main_loop(void *_env)
{
    CPUState *env = _env;
    sigset_t signals;

    vcpu_env = env;
    sigfillset(&signals);
    //sigdelset(&signals, SIG_IPI);
    sigprocmask(SIG_BLOCK, &signals, NULL);
    kvm_create_vcpu(kvm_context, env->cpu_index);
    kvm_qemu_init_env(env);
    kvm_main_loop_cpu(env);
    return NULL;
}

static void kvm_add_signal(int signum)
{
    sigaddset(&io_sigset, signum);
    sigdelset(&io_negsigset, signum);
    sigprocmask(SIG_BLOCK, &io_sigset, NULL);
}
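
/*
 * vcpu 0 runs on the caller's thread and doubles as the I/O thread;
 * each additional vcpu gets a pthread of its own.
 */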
int kvm_main_loop(void)
{
    CPUState *env = first_cpu->next_cpu;
    int i;

    sigemptyset(&io_sigset);
    sigfillset(&io_negsigset);
    kvm_add_signal(SIGIO);
    kvm_add_signal(SIGALRM);
    kvm_add_signal(SIGUSR2);
    kvm_add_signal(SIG_IPI);

    vcpu_env = first_cpu;
    signal(SIG_IPI, sig_ipi_handler);
    for (i = 1; i < smp_cpus; ++i) {
        pthread_create(&vcpu_info[i].thread, NULL, ap_main_loop, env);
        env = env->next_cpu;
    }
    vcpu_info[0].thread = pthread_self();
    return kvm_main_loop_cpu(first_cpu);
}

static int kvm_debug(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    env->exception_index = EXCP_DEBUG;
    return 1;
}

static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}

static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}

static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}

#define PM_IO_BASE 0xb000
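
/*
 * Port 0xb2 is the PIIX4 APM control port.  Presumably because there is
 * no SMM emulation here, the BIOS's acpi enable (0xf1) and disable (0xf0)
 * commands are handled directly by toggling SCI_EN in PMCNTRL at
 * PM_IO_BASE + 4.
 */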
static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
{
    if (addr == 0xb2) {
        switch (data) {
        case 0: {
            cpu_outb(0, 0xb3, 0);
            break;
        }
        case 0xf0: {
            unsigned x;

            /* disable acpi */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x &= ~1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        }
        case 0xf1: {
            unsigned x;

            /* enable acpi */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x |= 1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        }
        default:
            break;
        }
        return 0;
    }
    cpu_outb(0, addr, data);
    return 0;
}

static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}

static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}

static int kvm_readb(void *opaque, uint64_t addr, uint8_t *data)
{
    *data = ldub_phys(addr);
    return 0;
}

static int kvm_readw(void *opaque, uint64_t addr, uint16_t *data)
{
    *data = lduw_phys(addr);
    return 0;
}

static int kvm_readl(void *opaque, uint64_t addr, uint32_t *data)
{
    *data = ldl_phys(addr);
    return 0;
}

static int kvm_readq(void *opaque, uint64_t addr, uint64_t *data)
{
    *data = ldq_phys(addr);
    return 0;
}

static int kvm_writeb(void *opaque, uint64_t addr, uint8_t data)
{
    stb_phys(addr, data);
    return 0;
}

static int kvm_writew(void *opaque, uint64_t addr, uint16_t data)
{
    stw_phys(addr, data);
    return 0;
}

static int kvm_writel(void *opaque, uint64_t addr, uint32_t data)
{
    stl_phys(addr, data);
    return 0;
}

static int kvm_writeq(void *opaque, uint64_t addr, uint64_t data)
{
    stq_phys(addr, data);
    return 0;
}

static int kvm_io_window(void *opaque)
{
    return 1;
}

static int kvm_halt(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK))) {
        env->hflags |= HF_HALTED_MASK;
        env->exception_index = EXCP_HLT;
    }

    return 1;
}

static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}
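
/*
 * Callback table handed to libkvm: these hooks service programmed I/O,
 * MMIO and the other exit reasons raised while kvm_run() executes.
 */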
static struct kvm_callbacks qemu_kvm_ops = {
    .debug = kvm_debug,
    .inb = kvm_inb,
    .inw = kvm_inw,
    .inl = kvm_inl,
    .outb = kvm_outb,
    .outw = kvm_outw,
    .outl = kvm_outl,
    .readb = kvm_readb,
    .readw = kvm_readw,
    .readl = kvm_readl,
    .readq = kvm_readq,
    .writeb = kvm_writeb,
    .writew = kvm_writew,
    .writel = kvm_writel,
    .writeq = kvm_writeq,
    .halt = kvm_halt,
    .shutdown = kvm_shutdown,
    .io_window = kvm_io_window,
    .try_push_interrupts = try_push_interrupts,
    .post_kvm_run = post_kvm_run,
    .pre_kvm_run = pre_kvm_run,
};

int kvm_qemu_init()
{
    /* Try to initialize kvm */
    kvm_context = kvm_init(&qemu_kvm_ops, cpu_single_env);
    if (!kvm_context) {
        return -1;
    }

    return 0;
}

int kvm_qemu_create_context(void)
{
    int i;

    if (kvm_create(kvm_context, phys_ram_size, (void**)&phys_ram_base) < 0) {
        kvm_qemu_destroy();
        return -1;
    }
    kvm_msr_list = kvm_get_msr_list(kvm_context);
    if (!kvm_msr_list) {
        kvm_qemu_destroy();
        return -1;
    }
    for (i = 0; i < kvm_msr_list->nmsrs; ++i)
        if (kvm_msr_list->indices[i] == MSR_STAR)
            kvm_has_msr_star = 1;
    return 0;
}

void kvm_qemu_destroy(void)
{
    kvm_finalize(kvm_context);
}
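
/*
 * Execute cpuid on the host.  All data moves go through the vec array so
 * the asm needs no register outputs, and every register it touches is
 * saved and restored by hand (presumably to keep %ebx intact for PIC
 * builds on i386).
 */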
static void host_cpuid(uint32_t function, uint32_t *eax, uint32_t *ebx,
                       uint32_t *ecx, uint32_t *edx)
{
    uint32_t vec[4];

    vec[0] = function;
    asm volatile (
#ifdef __x86_64__
        "sub $128, %%rsp \n\t"  /* skip red zone */
        "push %0; push %%rsi \n\t"
        "push %%rax; push %%rbx; push %%rcx; push %%rdx \n\t"
        "mov 8*5(%%rsp), %%rsi \n\t"
        "mov (%%rsi), %%eax \n\t"
        "cpuid \n\t"
        "mov %%eax, (%%rsi) \n\t"
        "mov %%ebx, 4(%%rsi) \n\t"
        "mov %%ecx, 8(%%rsi) \n\t"
        "mov %%edx, 12(%%rsi) \n\t"
        "pop %%rdx; pop %%rcx; pop %%rbx; pop %%rax \n\t"
        "pop %%rsi; pop %0 \n\t"
        "add $128, %%rsp"
#else
        "push %0; push %%esi \n\t"
        "push %%eax; push %%ebx; push %%ecx; push %%edx \n\t"
        "mov 4*5(%%esp), %%esi \n\t"
        "mov (%%esi), %%eax \n\t"
        "cpuid \n\t"
        "mov %%eax, (%%esi) \n\t"
        "mov %%ebx, 4(%%esi) \n\t"
        "mov %%ecx, 8(%%esi) \n\t"
        "mov %%edx, 12(%%esi) \n\t"
        "pop %%edx; pop %%ecx; pop %%ebx; pop %%eax \n\t"
        "pop %%esi; pop %0 \n\t"
#endif
        : : "rm"(vec) : "memory");
    if (eax)
        *eax = vec[0];
    if (ebx)
        *ebx = vec[1];
    if (ecx)
        *ecx = vec[2];
    if (edx)
        *edx = vec[3];
}
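
/*
 * Build one kvm_cpuid_entry by running qemu's emulated cpuid, then mask
 * off leaf 0x80000001 features (long mode, syscall, nx) that the host
 * cpu or kernel cannot back up.
 */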
static void do_cpuid_ent(struct kvm_cpuid_entry *e, uint32_t function,
                         CPUState *env)
{
    env->regs[R_EAX] = function;
    qemu_kvm_cpuid_on_env(env);
    e->function = function;
    e->eax = env->regs[R_EAX];
    e->ebx = env->regs[R_EBX];
    e->ecx = env->regs[R_ECX];
    e->edx = env->regs[R_EDX];
    if (function == 0x80000001) {
        uint32_t h_eax, h_edx;
        struct utsname utsname;
        int lm_capable_kernel;

        host_cpuid(function, &h_eax, NULL, NULL, &h_edx);
        uname(&utsname);
        lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;

        // long mode
        if ((h_edx & 0x20000000) == 0 || !lm_capable_kernel)
            e->edx &= ~0x20000000u;
        // syscall
        if ((h_edx & 0x00000800) == 0)
            e->edx &= ~0x00000800u;
        // nx
        if ((h_edx & 0x00100000) == 0)
            e->edx &= ~0x00100000u;
    }
    // sysenter isn't supported in compatibility mode on AMD, and syscall
    // isn't supported in compatibility mode on Intel.  So advertise the
    // actual cpu, and say goodbye to migration between different vendors
    // if you use compatibility mode.
    if (function == 0) {
        uint32_t bcd[3];

        host_cpuid(0, NULL, &bcd[0], &bcd[1], &bcd[2]);
        e->ebx = bcd[0];
        e->ecx = bcd[1];
        e->edx = bcd[2];
    }
}
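
/*
 * Enumerate the standard (0..limit) and extended (0x80000000..limit)
 * cpuid leaves on a scratch copy of the vcpu state and hand the
 * resulting table to the kernel.
 */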
int kvm_qemu_init_env(CPUState *cenv)
{
    struct kvm_cpuid_entry cpuid_ent[100];
    int cpuid_nent = 0;
    CPUState copy;
    uint32_t i, limit;

    copy = *cenv;

    copy.regs[R_EAX] = 0;
    qemu_kvm_cpuid_on_env(&copy);
    limit = copy.regs[R_EAX];

    for (i = 0; i <= limit; ++i)
        do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);

    copy.regs[R_EAX] = 0x80000000;
    qemu_kvm_cpuid_on_env(&copy);
    limit = copy.regs[R_EAX];

    for (i = 0x80000000; i <= limit; ++i)
        do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);

    kvm_setup_cpuid(kvm_context, cenv->cpu_index, cpuid_nent, cpuid_ent);

    return 0;
}

int kvm_update_debugger(CPUState *env)
{
    struct kvm_debug_guest dbg;
    int i;

    dbg.enabled = 0;
    if (env->nb_breakpoints || env->singlestep_enabled) {
        dbg.enabled = 1;
        for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
            dbg.breakpoints[i].enabled = 1;
            dbg.breakpoints[i].address = env->breakpoints[i];
        }
        dbg.singlestep = env->singlestep_enabled;
    }
    return kvm_guest_debug(kvm_context, env->cpu_index, &dbg);
}

/*
 * dirty pages logging
 */
/* FIXME: use unsigned long pointer instead of unsigned char */
unsigned char *kvm_dirty_bitmap = NULL;
int kvm_physical_memory_set_dirty_tracking(int enable)
{
    int r = 0;

    if (!kvm_allowed)
        return 0;

    if (enable) {
        if (!kvm_dirty_bitmap) {
            unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
            kvm_dirty_bitmap = qemu_malloc(bitmap_size);
            if (kvm_dirty_bitmap == NULL) {
                perror("Failed to allocate dirty pages bitmap");
                r = -1;
            } else {
                r = kvm_dirty_pages_log_enable_all(kvm_context);
            }
        }
    } else {
        if (kvm_dirty_bitmap) {
            r = kvm_dirty_pages_log_reset(kvm_context);
            qemu_free(kvm_dirty_bitmap);
            kvm_dirty_bitmap = NULL;
        }
    }
    return r;
}

/* get kvm's dirty pages bitmap and update qemu's */
int kvm_get_dirty_pages_log_slot(int slot,
                                 unsigned char *bitmap,
                                 unsigned int offset,
                                 unsigned int len)
{
    int r;
    unsigned int i, j, n = 0;
    unsigned char c;
    unsigned page_number, addr, addr1;

    memset(bitmap, 0, len);
    r = kvm_get_dirty_pages(kvm_context, slot, bitmap);
    if (r)
        return r;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...)
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        c = bitmap[i];
        while (c > 0) {
            j = ffsl(c) - 1;
            c &= ~(1u << j);
            page_number = i * 8 + j;
            addr1 = page_number * TARGET_PAGE_SIZE;
            addr = offset + addr1;
            cpu_physical_memory_set_dirty(addr);
            n++;
        }
    }
    return 0;
}

/*
 * get kvm's dirty pages bitmap and update qemu's
 * we only care about physical ram, which resides in slots 0 and 3
 */
int kvm_update_dirty_pages_log(void)
{
    int r = 0, len;

    len = BITMAP_SIZE(0xa0000);
    r = kvm_get_dirty_pages_log_slot(3, kvm_dirty_bitmap, 0, len);
    len = BITMAP_SIZE(phys_ram_size - 0xc0000);
    r = r || kvm_get_dirty_pages_log_slot(0, kvm_dirty_bitmap, 0xc0000, len);
    return r;
}
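
/*
 * Merge the allocation maps of both ram slots into a single bitmap
 * covering all of physical ram (slot 3 below 0xa0000, slot 0 from
 * 0xc0000 up).
 */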
int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap)
{
    int r = 0, len, offset;

    len = BITMAP_SIZE(phys_ram_size);
    memset(bitmap, 0, len);

    r = kvm_get_mem_map(kvm_context, 3, bitmap);
    if (r)
        goto out;

    offset = BITMAP_SIZE(0xc0000);
    r = kvm_get_mem_map(kvm_context, 0, bitmap + offset);

out:
    return r;
}

#endif