Fix configurations with more than 4GB of memory
[qemu-kvm/fedora.git] / qemu-kvm.c
blob 93a7f17b08fd2ec4b5d03553349ce94557e638e3
#include "config.h"
#include "config-host.h"

#ifdef USE_KVM
#define KVM_ALLOWED_DEFAULT 1
#else
#define KVM_ALLOWED_DEFAULT 0
#endif

int kvm_allowed = KVM_ALLOWED_DEFAULT;
static int lm_capable_kernel;
int kvm_irqchip = 1;

#ifdef USE_KVM

#include <string.h>
#include "vl.h"

#include "qemu-kvm.h"
#include <kvmctl.h>
#include <pthread.h>
#include <sys/utsname.h>

#define MSR_IA32_TSC 0x10

extern void perror(const char *s);

kvm_context_t kvm_context;
static struct kvm_msr_list *kvm_msr_list;
static int kvm_has_msr_star;

extern int smp_cpus;
extern unsigned int kvm_shadow_memory;

pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
static __thread CPUState *vcpu_env;

static sigset_t io_sigset, io_negsigset;

static int wait_hack;

#define SIG_IPI (SIGRTMIN+4)
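
/*
 * Threading model: each vcpu runs in its own pthread, and qemu_mutex
 * serializes all access to qemu device state; the mutex is dropped
 * around the blocking kvm_run() call.  SIG_IPI exists only to kick a
 * vcpu thread out of the kernel so it re-examines interrupt and stop
 * requests.
 */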
struct vcpu_info {
    int sipi_needed;    /* deliver a SIPI before the next entry */
    int init;           /* deliver an INIT (cpu reset) before the next entry */
    pthread_t thread;
    int signalled;      /* SIG_IPI sent and not yet consumed */
    int stop;           /* request from vcpu 0: park this thread */
    int stopped;        /* acknowledgment: thread is parked */
} vcpu_info[4];
static void sig_ipi_handler(int n)
{
    /* no-op: delivery of SIG_IPI is enough to interrupt a blocked vcpu */
}
void kvm_update_interrupt_request(CPUState *env)
{
    if (env && env != vcpu_env) {
        if (vcpu_info[env->cpu_index].signalled)
            return;
        vcpu_info[env->cpu_index].signalled = 1;
        if (vcpu_info[env->cpu_index].thread)
            pthread_kill(vcpu_info[env->cpu_index].thread, SIG_IPI);
    }
}
void kvm_update_after_sipi(CPUState *env)
{
    vcpu_info[env->cpu_index].sipi_needed = 1;
    kvm_update_interrupt_request(env);

    /*
     * The qemu BIOS waits for APs using a busy loop that is much too
     * short for kvm.  Add a wait after the first SIPI.
     */
    {
        static int first_sipi = 1;

        if (first_sipi) {
            wait_hack = 1;
            first_sipi = 0;
        }
    }
}
void kvm_apic_init(CPUState *env)
{
    if (env->cpu_index != 0)
        vcpu_info[env->cpu_index].init = 1;
    kvm_update_interrupt_request(env);
}
static void set_msr_entry(struct kvm_msr_entry *entry, uint32_t index,
                          uint64_t data)
{
    entry->index = index;
    entry->data  = data;
}
/* returns 0 on success, non-0 on failure */
static int get_msr_entry(struct kvm_msr_entry *entry, CPUState *env)
{
    switch (entry->index) {
    case MSR_IA32_SYSENTER_CS:
        env->sysenter_cs = entry->data;
        break;
    case MSR_IA32_SYSENTER_ESP:
        env->sysenter_esp = entry->data;
        break;
    case MSR_IA32_SYSENTER_EIP:
        env->sysenter_eip = entry->data;
        break;
    case MSR_STAR:
        env->star = entry->data;
        break;
#ifdef TARGET_X86_64
    case MSR_CSTAR:
        env->cstar = entry->data;
        break;
    case MSR_KERNELGSBASE:
        env->kernelgsbase = entry->data;
        break;
    case MSR_FMASK:
        env->fmask = entry->data;
        break;
    case MSR_LSTAR:
        env->lstar = entry->data;
        break;
#endif
    case MSR_IA32_TSC:
        env->tsc = entry->data;
        break;
    default:
        printf("Warning: unknown msr index 0x%x\n", entry->index);
        return 1;
    }
    return 0;
}
#ifdef TARGET_X86_64
#define MSR_COUNT 9
#else
#define MSR_COUNT 5
#endif
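
/*
 * MSR_COUNT must cover the maximum number of entries queued in
 * load_regs()/save_regs(): SYSENTER_{CS,ESP,EIP}, STAR and TSC (5), plus
 * CSTAR, KERNELGSBASE, FMASK and LSTAR on long-mode-capable kernels (9).
 */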
static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}
static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;

    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
    lhs->dpl = rhs->selector & 3;
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
}
static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->flags =
        (rhs->type << DESC_TYPE_SHIFT)
        | (rhs->present * DESC_P_MASK)
        | (rhs->dpl << DESC_DPL_SHIFT)
        | (rhs->db << DESC_B_SHIFT)
        | (rhs->s * DESC_S_MASK)
        | (rhs->l << DESC_L_SHIFT)
        | (rhs->g * DESC_G_MASK)
        | (rhs->avl * DESC_AVL_MASK);
}
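
/*
 * set_seg() and get_seg() are near-inverses: set_seg() unpacks qemu's
 * cached descriptor flags into discrete kvm_segment fields (taking dpl
 * from the selector's RPL), while get_seg() packs the fields, dpl
 * included, back into the flags word.
 */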
/* the reset values of qemu are not compatible with SVM; this function is
 * used to fix up the segment descriptor values */
static void fix_realmode_dataseg(struct kvm_segment *seg)
{
    seg->type = 0x02;
    seg->present = 1;
    seg->s = 1;
}
static void load_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_fpu fpu;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    int rc, n, i;

    regs.rax = env->regs[R_EAX];
    regs.rbx = env->regs[R_EBX];
    regs.rcx = env->regs[R_ECX];
    regs.rdx = env->regs[R_EDX];
    regs.rsi = env->regs[R_ESI];
    regs.rdi = env->regs[R_EDI];
    regs.rsp = env->regs[R_ESP];
    regs.rbp = env->regs[R_EBP];
#ifdef TARGET_X86_64
    regs.r8 = env->regs[8];
    regs.r9 = env->regs[9];
    regs.r10 = env->regs[10];
    regs.r11 = env->regs[11];
    regs.r12 = env->regs[12];
    regs.r13 = env->regs[13];
    regs.r14 = env->regs[14];
    regs.r15 = env->regs[15];
#endif

    regs.rflags = env->eflags;
    regs.rip = env->eip;

    kvm_set_regs(kvm_context, env->cpu_index, &regs);

    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
    /* qemu's fptags uses 1 = empty; the fxsave-style ftwx uses 1 = valid */
    for (i = 0; i < 8; ++i)
        fpu.ftwx |= (!env->fptags[i]) << i;
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;
    kvm_set_fpu(kvm_context, env->cpu_index, &fpu);

    memcpy(sregs.interrupt_bitmap, env->kvm_interrupt_bitmap, sizeof(sregs.interrupt_bitmap));

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);

        if (env->cr[0] & CR0_PE_MASK) {
            /* force ss cpl to cs cpl */
            sregs.ss.selector = (sregs.ss.selector & ~3) |
                                (sregs.cs.selector & 3);
            sregs.ss.dpl = sregs.ss.selector & 3;
        }

        if (!(env->cr[0] & CR0_PG_MASK)) {
            fix_realmode_dataseg(&sregs.cs);
            fix_realmode_dataseg(&sregs.ds);
            fix_realmode_dataseg(&sregs.es);
            fix_realmode_dataseg(&sregs.fs);
            fix_realmode_dataseg(&sregs.gs);
            fix_realmode_dataseg(&sregs.ss);
        }
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.apic_base = cpu_get_apic_base(env);
    sregs.efer = env->efer;
    sregs.cr8 = cpu_get_apic_tpr(env);

    kvm_set_sregs(kvm_context, env->cpu_index, &sregs);

    /* msrs */
    n = 0;
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_CS,  env->sysenter_cs);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    if (kvm_has_msr_star)
        set_msr_entry(&msrs[n++], MSR_STAR, env->star);
    set_msr_entry(&msrs[n++], MSR_IA32_TSC, env->tsc);
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        set_msr_entry(&msrs[n++], MSR_CSTAR, env->cstar);
        set_msr_entry(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
        set_msr_entry(&msrs[n++], MSR_FMASK, env->fmask);
        set_msr_entry(&msrs[n++], MSR_LSTAR, env->lstar);
    }
#endif

    rc = kvm_set_msrs(kvm_context, env->cpu_index, msrs, n);
    if (rc == -1)
        perror("kvm_set_msrs FAILED");
}
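
/*
 * load_regs()/save_regs() synchronize the complete CPUState with the
 * in-kernel vcpu.  Nothing on the per-exit fast path calls them; they
 * run only from the slow paths (reset, SIPI emulation, and the
 * kvm_load_registers()/kvm_save_registers() entry points below).
 */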
static void save_regs(CPUState *env)
{
    struct kvm_regs regs;
    struct kvm_fpu fpu;
    struct kvm_sregs sregs;
    struct kvm_msr_entry msrs[MSR_COUNT];
    uint32_t hflags;
    uint32_t i, n, rc;

    kvm_get_regs(kvm_context, env->cpu_index, &regs);

    env->regs[R_EAX] = regs.rax;
    env->regs[R_EBX] = regs.rbx;
    env->regs[R_ECX] = regs.rcx;
    env->regs[R_EDX] = regs.rdx;
    env->regs[R_ESI] = regs.rsi;
    env->regs[R_EDI] = regs.rdi;
    env->regs[R_ESP] = regs.rsp;
    env->regs[R_EBP] = regs.rbp;
#ifdef TARGET_X86_64
    env->regs[8] = regs.r8;
    env->regs[9] = regs.r9;
    env->regs[10] = regs.r10;
    env->regs[11] = regs.r11;
    env->regs[12] = regs.r12;
    env->regs[13] = regs.r13;
    env->regs[14] = regs.r14;
    env->regs[15] = regs.r15;
#endif

    env->eflags = regs.rflags;
    env->eip = regs.rip;

    kvm_get_fpu(kvm_context, env->cpu_index, &fpu);
    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
    for (i = 0; i < 8; ++i)
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    kvm_get_sregs(kvm_context, env->cpu_index, &sregs);

    memcpy(env->kvm_interrupt_bitmap, sregs.interrupt_bitmap, sizeof(env->kvm_interrupt_bitmap));

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    cpu_set_apic_base(env, sregs.apic_base);

    env->efer = sregs.efer;
    //cpu_set_apic_tpr(env, sregs.cr8);

#define HFLAG_COPY_MASK ~( \
    HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
    HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
    HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
    HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    /* recompute hflags: qemu-internal derived state that kvm does not
     * track, rebuilt from the segment, control register and efer values
     * we just fetched */
    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
              (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
              (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
                  (DESC_B_SHIFT - HF_CS32_SHIFT);
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
                  (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) ||
            (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base |
                        env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) <<
                      HF_ADDSEG_SHIFT;
        }
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;
    env->cc_src = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
    env->df = 1 - (2 * ((env->eflags >> 10) & 1));
    env->cc_op = CC_OP_EFLAGS;
    env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);

    /* msrs */
    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
    if (kvm_has_msr_star)
        msrs[n++].index = MSR_STAR;
    msrs[n++].index = MSR_IA32_TSC;
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        msrs[n++].index = MSR_CSTAR;
        msrs[n++].index = MSR_KERNELGSBASE;
        msrs[n++].index = MSR_FMASK;
        msrs[n++].index = MSR_LSTAR;
    }
#endif
    rc = kvm_get_msrs(kvm_context, env->cpu_index, msrs, n);
    if (rc == -1) {
        perror("kvm_get_msrs FAILED");
    } else {
        n = rc; /* actual number of MSRs */
        for (i = 0; i < n; i++) {
            if (get_msr_entry(&msrs[i], env))
                return;
        }
    }
}
#include <signal.h>
static int try_push_interrupts(void *opaque)
{
    CPUState *env = cpu_single_env;
    int r, irq;

    if (env->ready_for_interrupt_injection &&
        (env->interrupt_request & CPU_INTERRUPT_HARD) &&
        (env->eflags & IF_MASK)) {
        env->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            r = kvm_inject_irq(kvm_context, env->cpu_index, irq);
            if (r < 0)
                printf("cpu %d fail inject %x\n", env->cpu_index, irq);
        }
    }

    return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0;
}
static void post_kvm_run(void *opaque, int vcpu)
{
    CPUState *env = vcpu_env;

    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    env->eflags = kvm_get_interrupt_flag(kvm_context, vcpu)
        ? env->eflags | IF_MASK : env->eflags & ~IF_MASK;
    env->ready_for_interrupt_injection
        = kvm_is_ready_for_interrupt_injection(kvm_context, vcpu);

    cpu_set_apic_tpr(env, kvm_get_cr8(kvm_context, vcpu));
    cpu_set_apic_base(env, kvm_get_apic_base(kvm_context, vcpu));
}
static int pre_kvm_run(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    if (env->cpu_index == 0 && wait_hack) {
        int i;

        wait_hack = 0;

        pthread_mutex_unlock(&qemu_mutex);
        for (i = 0; i < 10; ++i)
            usleep(1000);
        pthread_mutex_lock(&qemu_mutex);
    }

    if (!kvm_irqchip_in_kernel(kvm_context))
        kvm_set_cr8(kvm_context, vcpu, cpu_get_apic_tpr(env));
    if (env->interrupt_request & CPU_INTERRUPT_EXIT)
        return 1;
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
void kvm_load_registers(CPUState *env)
{
    if (kvm_allowed)
        load_regs(env);
}

void kvm_save_registers(CPUState *env)
{
    if (kvm_allowed)
        save_regs(env);
}

int kvm_cpu_exec(CPUState *env)
{
    int r;

    r = kvm_run(kvm_context, env->cpu_index);
    if (r < 0) {
        printf("kvm_run returned %d\n", r);
        exit(1);
    }

    return 0;
}
extern int vm_running;

static int has_work(CPUState *env)
{
    if (!vm_running)
        return 0;
    if (!(env->hflags & HF_HALTED_MASK))
        return 1;
    if (env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXIT))
        return 1;
    return 0;
}
static int kvm_eat_signal(CPUState *env, int timeout)
{
    struct timespec ts;
    int r, e, ret = 0;
    siginfo_t siginfo;
    struct sigaction sa;

    ts.tv_sec = timeout / 1000;
    ts.tv_nsec = (timeout % 1000) * 1000000;
    r = sigtimedwait(&io_sigset, &siginfo, &ts);
    if (r == -1 && (errno == EAGAIN || errno == EINTR) && !timeout)
        return 0;
    e = errno;
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = vcpu_env;
    if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
        printf("sigtimedwait: %s\n", strerror(e));
        exit(1);
    }
    if (r != -1) {
        sigaction(siginfo.si_signo, NULL, &sa);
        sa.sa_handler(siginfo.si_signo);
        ret = 1;
    }
    pthread_mutex_unlock(&qemu_mutex);

    return ret;
}
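
/*
 * Signals are consumed synchronously with sigtimedwait() and their
 * handlers invoked by hand while qemu_mutex is held, so handlers written
 * for single-threaded qemu never race with a vcpu thread.
 */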
static void kvm_eat_signals(CPUState *env, int timeout)
{
    int r = 0;

    while (kvm_eat_signal(env, 0))
        r = 1;
    if (!r && timeout) {
        r = kvm_eat_signal(env, timeout);
        if (r)
            while (kvm_eat_signal(env, 0))
                ;
    }
    /*
     * we call select() even if no signal was received, to account for
     * events for which there is no signal handler installed.
     */
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = vcpu_env;
    main_loop_wait(0);
    pthread_mutex_unlock(&qemu_mutex);
}
static void kvm_main_loop_wait(CPUState *env, int timeout)
{
    pthread_mutex_unlock(&qemu_mutex);
    if (env->cpu_index == 0)
        kvm_eat_signals(env, timeout);
    else {
        if (!kvm_irqchip_in_kernel(kvm_context) &&
            (timeout || vcpu_info[env->cpu_index].stopped)) {
            sigset_t set;
            int n;

        paused:
            sigemptyset(&set);
            sigaddset(&set, SIG_IPI);
            sigwait(&set, &n);
        } else {
            struct timespec ts;
            siginfo_t siginfo;
            sigset_t set;

            ts.tv_sec = 0;
            ts.tv_nsec = 0;
            sigemptyset(&set);
            sigaddset(&set, SIG_IPI);
            sigtimedwait(&set, &siginfo, &ts);
        }
        if (vcpu_info[env->cpu_index].stop) {
            vcpu_info[env->cpu_index].stop = 0;
            vcpu_info[env->cpu_index].stopped = 1;
            pthread_kill(vcpu_info[0].thread, SIG_IPI);
            goto paused;
        }
    }
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    vcpu_info[env->cpu_index].signalled = 0;
}
static int all_threads_paused(void)
{
    int i;

    for (i = 1; i < smp_cpus; ++i)
        if (!vcpu_info[i].stopped)   /* still running: not everyone is paused */
            return 0;
    return 1;
}
static void pause_other_threads(void)
{
    int i;

    for (i = 1; i < smp_cpus; ++i) {
        vcpu_info[i].stop = 1;
        pthread_kill(vcpu_info[i].thread, SIG_IPI);
    }
    while (!all_threads_paused())
        kvm_eat_signals(vcpu_env, 0);
}

static void resume_other_threads(void)
{
    int i;

    for (i = 1; i < smp_cpus; ++i) {
        vcpu_info[i].stop = 0;
        vcpu_info[i].stopped = 0;
        pthread_kill(vcpu_info[i].thread, SIG_IPI);
    }
}

static void kvm_vm_state_change_handler(void *context, int running)
{
    if (running)
        resume_other_threads();
    else
        pause_other_threads();
}
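
/*
 * Pause handshake: vcpu 0 sets .stop and kicks each AP with SIG_IPI; the
 * AP notices .stop in kvm_main_loop_wait(), converts it into .stopped,
 * kicks vcpu 0 back, and parks in sigwait() until it is resumed.
 */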
static void update_regs_for_sipi(CPUState *env)
{
    SegmentCache cs = env->segs[R_CS];

    save_regs(env);
    env->segs[R_CS] = cs;
    env->eip = 0;
    load_regs(env);
    vcpu_info[env->cpu_index].sipi_needed = 0;
    vcpu_info[env->cpu_index].init = 0;
}

static void update_regs_for_init(CPUState *env)
{
    cpu_reset(env);
    load_regs(env);
}
static void setup_kernel_sigmask(CPUState *env)
{
    sigset_t set;

    sigprocmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    if (env->cpu_index == 0)
        sigandset(&set, &set, &io_negsigset);

    kvm_set_signal_mask(kvm_context, env->cpu_index, &set);
}
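
/*
 * This mask is applied while the vcpu sits in the kernel: every vcpu
 * unblocks SIG_IPI so it can be kicked out of KVM_RUN, and vcpu 0
 * additionally unblocks the I/O signals (io_negsigset is the complement
 * of io_sigset), so host I/O interrupts the thread that also runs the
 * qemu main loop.
 */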
static int kvm_main_loop_cpu(CPUState *env)
{
    struct vcpu_info *info = &vcpu_info[env->cpu_index];

    setup_kernel_sigmask(env);
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;
    while (1) {
        while (!has_work(env))
            kvm_main_loop_wait(env, 10);
        if (env->interrupt_request & CPU_INTERRUPT_HARD)
            env->hflags &= ~HF_HALTED_MASK;
        if (!kvm_irqchip_in_kernel(kvm_context) && info->sipi_needed)
            update_regs_for_sipi(env);
        if (!kvm_irqchip_in_kernel(kvm_context) && info->init)
            update_regs_for_init(env);
        if (!(env->hflags & HF_HALTED_MASK) && !info->init)
            kvm_cpu_exec(env);
        env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
        kvm_main_loop_wait(env, 0);
        if (qemu_shutdown_requested())
            break;
        else if (qemu_powerdown_requested())
            qemu_system_powerdown();
        else if (qemu_reset_requested()) {
            env->interrupt_request = 0;
            qemu_system_reset();
            load_regs(env);
        }
    }
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
static void *ap_main_loop(void *_env)
{
    CPUState *env = _env;
    sigset_t signals;

    vcpu_env = env;
    sigfillset(&signals);
    //sigdelset(&signals, SIG_IPI);
    sigprocmask(SIG_BLOCK, &signals, NULL);
    kvm_create_vcpu(kvm_context, env->cpu_index);
    kvm_qemu_init_env(env);
    if (kvm_irqchip_in_kernel(kvm_context))
        env->hflags &= ~HF_HALTED_MASK;
    kvm_main_loop_cpu(env);
    return NULL;
}
static void kvm_add_signal(int signum)
{
    sigaddset(&io_sigset, signum);
    sigdelset(&io_negsigset, signum);
    sigprocmask(SIG_BLOCK, &io_sigset, NULL);
}

int kvm_init_ap(void)
{
    CPUState *env = first_cpu->next_cpu;
    int i;

    qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
    sigemptyset(&io_sigset);
    sigfillset(&io_negsigset);
    kvm_add_signal(SIGIO);
    kvm_add_signal(SIGALRM);
    kvm_add_signal(SIGUSR2);
    if (!kvm_irqchip_in_kernel(kvm_context))
        kvm_add_signal(SIG_IPI);

    vcpu_env = first_cpu;
    signal(SIG_IPI, sig_ipi_handler);
    for (i = 1; i < smp_cpus; ++i) {
        pthread_create(&vcpu_info[i].thread, NULL, ap_main_loop, env);
        env = env->next_cpu;
    }
    return 0;
}
int kvm_main_loop(void)
{
    vcpu_info[0].thread = pthread_self();
    return kvm_main_loop_cpu(first_cpu);
}

static int kvm_debug(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    env->exception_index = EXCP_DEBUG;
    return 1;
}
static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}

static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}

static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}
#define PM_IO_BASE 0xb000
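
/*
 * Port 0xb2 is the APM/SMI command port of the PC chipset; with no SMM
 * support, the interesting commands are emulated directly below.
 * PM_IO_BASE + 4 is the PIIX4 PMCNTRL register, whose bit 0 (SCI_EN)
 * selects ACPI (SCI) interrupt delivery.
 */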
static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
{
    if (addr == 0xb2) {
        switch (data) {
        case 0: {
            cpu_outb(0, 0xb3, 0);
            break;
        }
        case 0xf0: {
            unsigned x;

            /* disable acpi (clear SCI_EN) */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x &= ~1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        }
        case 0xf1: {
            unsigned x;

            /* enable acpi (set SCI_EN) */
            x = cpu_inw(0, PM_IO_BASE + 4);
            x |= 1;
            cpu_outw(0, PM_IO_BASE + 4, x);
            break;
        }
        default:
            break;
        }
        return 0;
    }
    cpu_outb(0, addr, data);
    return 0;
}
static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}

static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}
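
/*
 * MMIO callbacks: accesses that the kernel cannot handle are forwarded
 * here and satisfied through qemu's physical memory accessors
 * (ld*_phys/st*_phys), which dispatch to the registered device models.
 */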
static int kvm_readb(void *opaque, uint64_t addr, uint8_t *data)
{
    *data = ldub_phys(addr);
    return 0;
}

static int kvm_readw(void *opaque, uint64_t addr, uint16_t *data)
{
    *data = lduw_phys(addr);
    return 0;
}

static int kvm_readl(void *opaque, uint64_t addr, uint32_t *data)
{
    /* hack: Red Hat 7.1 generates some weird accesses. */
    if (addr > 0xa0000 - 4 && addr < 0xa0000) {
        *data = 0;
        return 0;
    }

    *data = ldl_phys(addr);
    return 0;
}

static int kvm_readq(void *opaque, uint64_t addr, uint64_t *data)
{
    *data = ldq_phys(addr);
    return 0;
}
static int kvm_writeb(void *opaque, uint64_t addr, uint8_t data)
{
    stb_phys(addr, data);
    return 0;
}

static int kvm_writew(void *opaque, uint64_t addr, uint16_t data)
{
    stw_phys(addr, data);
    return 0;
}

static int kvm_writel(void *opaque, uint64_t addr, uint32_t data)
{
    stl_phys(addr, data);
    return 0;
}

static int kvm_writeq(void *opaque, uint64_t addr, uint64_t data)
{
    stq_phys(addr, data);
    return 0;
}

static int kvm_io_window(void *opaque)
{
    return 1;
}
static int kvm_halt(void *opaque, int vcpu)
{
    CPUState *env = cpu_single_env;

    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK))) {
        env->hflags |= HF_HALTED_MASK;
        env->exception_index = EXCP_HLT;
    }

    return 1;
}

static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}
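
/*
 * Callback table handed to libkvm: it invokes these hooks for the
 * corresponding exit reasons (pio, mmio, halt, shutdown, debug,
 * interrupt window) raised by the KVM_RUN ioctl.
 */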
static struct kvm_callbacks qemu_kvm_ops = {
    .debug = kvm_debug,
    .inb   = kvm_inb,
    .inw   = kvm_inw,
    .inl   = kvm_inl,
    .outb  = kvm_outb,
    .outw  = kvm_outw,
    .outl  = kvm_outl,
    .readb = kvm_readb,
    .readw = kvm_readw,
    .readl = kvm_readl,
    .readq = kvm_readq,
    .writeb = kvm_writeb,
    .writew = kvm_writew,
    .writel = kvm_writel,
    .writeq = kvm_writeq,
    .halt  = kvm_halt,
    .shutdown = kvm_shutdown,
    .io_window = kvm_io_window,
    .try_push_interrupts = try_push_interrupts,
    .post_kvm_run = post_kvm_run,
    .pre_kvm_run = pre_kvm_run,
};
int kvm_qemu_init()
{
    /* Try to initialize kvm */
    kvm_context = kvm_init(&qemu_kvm_ops, cpu_single_env);
    if (!kvm_context) {
        return -1;
    }

    return 0;
}

int kvm_qemu_create_context(void)
{
    int i;

    if (!kvm_irqchip) {
        kvm_disable_irqchip_creation(kvm_context);
    }
    if (kvm_create(kvm_context, phys_ram_size, (void **)&phys_ram_base) < 0) {
        kvm_qemu_destroy();
        return -1;
    }
    if (kvm_shadow_memory)
        kvm_set_shadow_pages(kvm_context, kvm_shadow_memory);
    kvm_msr_list = kvm_get_msr_list(kvm_context);
    if (!kvm_msr_list) {
        kvm_qemu_destroy();
        return -1;
    }
    /* probe for MSR_STAR: it is absent on some hosts (e.g. 32-bit Intel) */
    for (i = 0; i < kvm_msr_list->nmsrs; ++i)
        if (kvm_msr_list->indices[i] == MSR_STAR)
            kvm_has_msr_star = 1;
    return 0;
}

void kvm_qemu_destroy(void)
{
    kvm_finalize(kvm_context);
}
static void host_cpuid(uint32_t function, uint32_t *eax, uint32_t *ebx,
                       uint32_t *ecx, uint32_t *edx)
{
    uint32_t vec[4];

    vec[0] = function;
    asm volatile (
#ifdef __x86_64__
        "sub $128, %%rsp \n\t"  /* skip red zone */
        "push %0; push %%rsi \n\t"
        "push %%rax; push %%rbx; push %%rcx; push %%rdx \n\t"
        "mov 8*5(%%rsp), %%rsi \n\t"
        "mov (%%rsi), %%eax \n\t"
        "cpuid \n\t"
        "mov %%eax, (%%rsi) \n\t"
        "mov %%ebx, 4(%%rsi) \n\t"
        "mov %%ecx, 8(%%rsi) \n\t"
        "mov %%edx, 12(%%rsi) \n\t"
        "pop %%rdx; pop %%rcx; pop %%rbx; pop %%rax \n\t"
        "pop %%rsi; pop %0 \n\t"
        "add $128, %%rsp"
#else
        "push %0; push %%esi \n\t"
        "push %%eax; push %%ebx; push %%ecx; push %%edx \n\t"
        "mov 4*5(%%esp), %%esi \n\t"
        "mov (%%esi), %%eax \n\t"
        "cpuid \n\t"
        "mov %%eax, (%%esi) \n\t"
        "mov %%ebx, 4(%%esi) \n\t"
        "mov %%ecx, 8(%%esi) \n\t"
        "mov %%edx, 12(%%esi) \n\t"
        "pop %%edx; pop %%ecx; pop %%ebx; pop %%eax \n\t"
        "pop %%esi; pop %0 \n\t"
#endif
        : : "rm"(vec) : "memory");
    if (eax)
        *eax = vec[0];
    if (ebx)
        *ebx = vec[1];
    if (ecx)
        *ecx = vec[2];
    if (edx)
        *edx = vec[3];
}
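
/*
 * The asm saves and restores the cpuid output registers by hand instead
 * of declaring clobbers, presumably so the code also compiles in 32-bit
 * PIC configurations, where %ebx is reserved as the GOT pointer and must
 * not appear in a clobber list.
 */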
static void do_cpuid_ent(struct kvm_cpuid_entry *e, uint32_t function,
                         CPUState *env)
{
    env->regs[R_EAX] = function;
    qemu_kvm_cpuid_on_env(env);
    e->function = function;
    e->eax = env->regs[R_EAX];
    e->ebx = env->regs[R_EBX];
    e->ecx = env->regs[R_ECX];
    e->edx = env->regs[R_EDX];
    if (function == 0x80000001) {
        uint32_t h_eax, h_edx;
        struct utsname utsname;

        host_cpuid(function, &h_eax, NULL, NULL, &h_edx);
        uname(&utsname);
        lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;

        // long mode
        if ((h_edx & 0x20000000) == 0 || !lm_capable_kernel)
            e->edx &= ~0x20000000u;
        // syscall
        if ((h_edx & 0x00000800) == 0)
            e->edx &= ~0x00000800u;
        // nx
        if ((h_edx & 0x00100000) == 0)
            e->edx &= ~0x00100000u;
    }
    // sysenter isn't supported in compatibility mode on AMD, and syscall
    // isn't supported in compatibility mode on Intel.  So advertise the
    // actual cpu, and say goodbye to migration between different vendors
    // if you use compatibility mode.
    if (function == 0) {
        uint32_t bcd[3];

        host_cpuid(0, NULL, &bcd[0], &bcd[1], &bcd[2]);
        e->ebx = bcd[0];
        e->ecx = bcd[1];
        e->edx = bcd[2];
    }
}
int kvm_qemu_init_env(CPUState *cenv)
{
    struct kvm_cpuid_entry cpuid_ent[100];
#ifdef KVM_CPUID_SIGNATURE
    struct kvm_cpuid_entry *pv_ent;
    uint32_t signature[3];
#endif
    int cpuid_nent = 0;
    CPUState copy;
    uint32_t i, limit;

    copy = *cenv;

#ifdef KVM_CPUID_SIGNATURE
    /* Paravirtualization CPUIDs */
    /* "KVMKVMKVM\0\0" plus the implicit NUL is exactly 12 bytes */
    memcpy(signature, "KVMKVMKVM\0\0", 12);
    pv_ent = &cpuid_ent[cpuid_nent++];
    memset(pv_ent, 0, sizeof(*pv_ent));
    pv_ent->function = KVM_CPUID_SIGNATURE;
    pv_ent->eax = 0;
    pv_ent->ebx = signature[0];
    pv_ent->ecx = signature[1];
    pv_ent->edx = signature[2];

    pv_ent = &cpuid_ent[cpuid_nent++];
    memset(pv_ent, 0, sizeof(*pv_ent));
    pv_ent->function = KVM_CPUID_FEATURES;
    pv_ent->eax = 0;
#endif

    copy.regs[R_EAX] = 0;
    qemu_kvm_cpuid_on_env(&copy);
    limit = copy.regs[R_EAX];

    for (i = 0; i <= limit; ++i)
        do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);

    copy.regs[R_EAX] = 0x80000000;
    qemu_kvm_cpuid_on_env(&copy);
    limit = copy.regs[R_EAX];

    for (i = 0x80000000; i <= limit; ++i)
        do_cpuid_ent(&cpuid_ent[cpuid_nent++], i, &copy);

    kvm_setup_cpuid(kvm_context, cenv->cpu_index, cpuid_nent, cpuid_ent);

    return 0;
}
int kvm_update_debugger(CPUState *env)
{
    struct kvm_debug_guest dbg;
    int i;

    dbg.enabled = 0;
    if (env->nb_breakpoints || env->singlestep_enabled) {
        dbg.enabled = 1;
        for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
            dbg.breakpoints[i].enabled = 1;
            dbg.breakpoints[i].address = env->breakpoints[i];
        }
        dbg.singlestep = env->singlestep_enabled;
    }
    return kvm_guest_debug(kvm_context, env->cpu_index, &dbg);
}
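
/*
 * Only the first 4 breakpoints are forwarded, matching the number of x86
 * hardware debug address registers (DR0-DR3) available to the kernel.
 */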
/*
 * dirty pages logging
 */
/* FIXME: use unsigned long pointer instead of unsigned char */
unsigned char *kvm_dirty_bitmap = NULL;
int kvm_physical_memory_set_dirty_tracking(int enable)
{
    int r = 0;

    if (!kvm_allowed)
        return 0;

    if (enable) {
        if (!kvm_dirty_bitmap) {
            unsigned bitmap_size = BITMAP_SIZE(phys_ram_size);
            kvm_dirty_bitmap = qemu_malloc(bitmap_size);
            if (kvm_dirty_bitmap == NULL) {
                perror("Failed to allocate dirty pages bitmap");
                r = -1;
            } else {
                r = kvm_dirty_pages_log_enable_all(kvm_context);
            }
        }
    } else {
        if (kvm_dirty_bitmap) {
            r = kvm_dirty_pages_log_reset(kvm_context);
            qemu_free(kvm_dirty_bitmap);
            kvm_dirty_bitmap = NULL;
        }
    }
    return r;
}
/* get kvm's dirty pages bitmap and update qemu's */
int kvm_get_dirty_pages_log_slot(int slot,
                                 unsigned char *bitmap,
                                 unsigned int offset,
                                 unsigned int len)
{
    int r;
    unsigned int i, j, n = 0;
    unsigned char c;
    unsigned page_number, addr, addr1;

    memset(bitmap, 0, len);
    r = kvm_get_dirty_pages(kvm_context, slot, bitmap);
    if (r)
        return r;

    /*
     * bitmap-traveling is faster than memory-traveling (for addr...),
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        c = bitmap[i];
        while (c > 0) {
            j = ffsl(c) - 1;
            c &= ~(1u << j);
            page_number = i * 8 + j;
            addr1 = page_number * TARGET_PAGE_SIZE;
            addr = offset + addr1;
            cpu_physical_memory_set_dirty(addr);
            n++;
        }
    }
    return 0;
}
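
/*
 * Worked example: if bitmap[2] == 0x05, the inner loop pulls out bits 0
 * and 2, i.e. page numbers 16 and 18, and dirties only the pages at
 * offset + 16*TARGET_PAGE_SIZE and offset + 18*TARGET_PAGE_SIZE, never
 * touching the clean pages in between.
 */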
/*
 * get kvm's dirty pages bitmap and update qemu's.
 * we only care about physical ram, which resides in slots 0 and 3.
 */
int kvm_update_dirty_pages_log(void)
{
    int r = 0, len;

    len = BITMAP_SIZE(0xa0000);
    r = kvm_get_dirty_pages_log_slot(3, kvm_dirty_bitmap, 0, len);
    len = BITMAP_SIZE(phys_ram_size - 0xc0000);
    r = r || kvm_get_dirty_pages_log_slot(0, kvm_dirty_bitmap, 0xc0000, len);
    return r;
}
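
/*
 * Slot 3 holds RAM below 0xa0000 and slot 0 holds RAM from 0xc0000 up;
 * the 0xa0000-0xc0000 VGA window between them is device memory and is
 * not dirty-tracked here.
 */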
int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap)
{
    int r = 0, len, offset;

    len = BITMAP_SIZE(phys_ram_size);
    memset(bitmap, 0, len);

    r = kvm_get_mem_map(kvm_context, 3, bitmap);
    if (r)
        goto out;

    offset = BITMAP_SIZE(0xc0000);
    r = kvm_get_mem_map(kvm_context, 0, bitmap + offset);

out:
    return r;
}
#ifdef KVM_CAP_IRQCHIP

int kvm_set_irq(int irq, int level)
{
    return kvm_set_irq_level(kvm_context, irq, level);
}

#endif

#endif