kvm: support guest reboot
[qemu-kvm/fedora.git] / qemu-kvm.c
blob553b788e9abb8936c2df53cf184c7421f584f603
2 #include "config.h"
3 #include "config-host.h"
5 #ifdef USE_KVM
7 #include "exec.h"
9 #include "qemu-kvm.h"
10 #include <kvmctl.h>
11 #include <string.h>
13 #define MSR_IA32_TSC 0x10
15 extern void perror(const char *s);
17 int kvm_allowed = 1;
18 kvm_context_t kvm_context;
19 static struct kvm_msr_list *kvm_msr_list;
20 static int kvm_has_msr_star;
22 #define NR_CPU 16
23 static CPUState *saved_env[NR_CPU];
25 static void set_msr_entry(struct kvm_msr_entry *entry, uint32_t index,
26 uint64_t data)
28 entry->index = index;
29 entry->data = data;
32 /* returns 0 on success, non-0 on failure */
33 static int get_msr_entry(struct kvm_msr_entry *entry, CPUState *env)
35 switch (entry->index) {
36 case MSR_IA32_SYSENTER_CS:
37 env->sysenter_cs = entry->data;
38 break;
39 case MSR_IA32_SYSENTER_ESP:
40 env->sysenter_esp = entry->data;
41 break;
42 case MSR_IA32_SYSENTER_EIP:
43 env->sysenter_eip = entry->data;
44 break;
45 case MSR_STAR:
46 env->star = entry->data;
47 break;
48 #ifdef TARGET_X86_64
49 case MSR_CSTAR:
50 env->cstar = entry->data;
51 break;
52 case MSR_KERNELGSBASE:
53 env->kernelgsbase = entry->data;
54 break;
55 case MSR_FMASK:
56 env->fmask = entry->data;
57 break;
58 case MSR_LSTAR:
59 env->lstar = entry->data;
60 break;
61 #endif
62 case MSR_IA32_TSC:
63 env->tsc = entry->data;
64 break;
65 default:
66 printf("Warning unknown msr index 0x%x\n", entry->index);
67 return 1;
69 return 0;
72 #ifdef TARGET_X86_64
73 #define MSR_COUNT 9
74 #else
75 #define MSR_COUNT 5
76 #endif
78 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
80 lhs->selector = rhs->selector;
81 lhs->base = rhs->base;
82 lhs->limit = rhs->limit;
83 lhs->type = 3;
84 lhs->present = 1;
85 lhs->dpl = 3;
86 lhs->db = 0;
87 lhs->s = 1;
88 lhs->l = 0;
89 lhs->g = 0;
90 lhs->avl = 0;
91 lhs->unusable = 0;
94 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
96 unsigned flags = rhs->flags;
97 lhs->selector = rhs->selector;
98 lhs->base = rhs->base;
99 lhs->limit = rhs->limit;
100 lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
101 lhs->present = (flags & DESC_P_MASK) != 0;
102 lhs->dpl = rhs->selector & 3;
103 lhs->db = (flags >> DESC_B_SHIFT) & 1;
104 lhs->s = (flags & DESC_S_MASK) != 0;
105 lhs->l = (flags >> DESC_L_SHIFT) & 1;
106 lhs->g = (flags & DESC_G_MASK) != 0;
107 lhs->avl = (flags & DESC_AVL_MASK) != 0;
108 lhs->unusable = 0;
111 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
113 lhs->selector = rhs->selector;
114 lhs->base = rhs->base;
115 lhs->limit = rhs->limit;
116 lhs->flags =
117 (rhs->type << DESC_TYPE_SHIFT)
118 | (rhs->present * DESC_P_MASK)
119 | (rhs->dpl << DESC_DPL_SHIFT)
120 | (rhs->db << DESC_B_SHIFT)
121 | (rhs->s * DESC_S_MASK)
122 | (rhs->l << DESC_L_SHIFT)
123 | (rhs->g * DESC_G_MASK)
124 | (rhs->avl * DESC_AVL_MASK);
127 /* the reset values of qemu are not compatible to SVM
128 * this function is used to fix the segment descriptor values */
129 static void fix_realmode_dataseg(struct kvm_segment *seg)
131 seg->type = 0x02;
132 seg->present = 1;
133 seg->s = 1;
136 static void load_regs(CPUState *env)
138 struct kvm_regs regs;
139 struct kvm_sregs sregs;
140 struct kvm_msr_entry msrs[MSR_COUNT];
141 int rc, n;
143 /* hack: save env */
144 if (!saved_env[0])
145 saved_env[0] = env;
147 regs.rax = env->regs[R_EAX];
148 regs.rbx = env->regs[R_EBX];
149 regs.rcx = env->regs[R_ECX];
150 regs.rdx = env->regs[R_EDX];
151 regs.rsi = env->regs[R_ESI];
152 regs.rdi = env->regs[R_EDI];
153 regs.rsp = env->regs[R_ESP];
154 regs.rbp = env->regs[R_EBP];
155 #ifdef TARGET_X86_64
156 regs.r8 = env->regs[8];
157 regs.r9 = env->regs[9];
158 regs.r10 = env->regs[10];
159 regs.r11 = env->regs[11];
160 regs.r12 = env->regs[12];
161 regs.r13 = env->regs[13];
162 regs.r14 = env->regs[14];
163 regs.r15 = env->regs[15];
164 #endif
166 regs.rflags = env->eflags;
167 regs.rip = env->eip;
169 kvm_set_regs(kvm_context, 0, &regs);
171 memcpy(sregs.interrupt_bitmap, env->kvm_interrupt_bitmap, sizeof(sregs.interrupt_bitmap));
173 if ((env->eflags & VM_MASK)) {
174 set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
175 set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
176 set_v8086_seg(&sregs.es, &env->segs[R_ES]);
177 set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
178 set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
179 set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
180 } else {
181 set_seg(&sregs.cs, &env->segs[R_CS]);
182 set_seg(&sregs.ds, &env->segs[R_DS]);
183 set_seg(&sregs.es, &env->segs[R_ES]);
184 set_seg(&sregs.fs, &env->segs[R_FS]);
185 set_seg(&sregs.gs, &env->segs[R_GS]);
186 set_seg(&sregs.ss, &env->segs[R_SS]);
188 if (env->cr[0] & CR0_PE_MASK) {
189 /* force ss cpl to cs cpl */
190 sregs.ss.selector = (sregs.ss.selector & ~3) |
191 (sregs.cs.selector & 3);
192 sregs.ss.dpl = sregs.ss.selector & 3;
195 if (!(env->cr[0] & CR0_PG_MASK)) {
196 fix_realmode_dataseg(&sregs.ds);
197 fix_realmode_dataseg(&sregs.es);
198 fix_realmode_dataseg(&sregs.ss);
202 set_seg(&sregs.tr, &env->tr);
203 set_seg(&sregs.ldt, &env->ldt);
205 sregs.idt.limit = env->idt.limit;
206 sregs.idt.base = env->idt.base;
207 sregs.gdt.limit = env->gdt.limit;
208 sregs.gdt.base = env->gdt.base;
210 sregs.cr0 = env->cr[0];
211 sregs.cr2 = env->cr[2];
212 sregs.cr3 = env->cr[3];
213 sregs.cr4 = env->cr[4];
215 sregs.apic_base = cpu_get_apic_base(env);
216 sregs.efer = env->efer;
217 sregs.cr8 = cpu_get_apic_tpr(env);
219 kvm_set_sregs(kvm_context, 0, &sregs);
221 /* msrs */
222 n = 0;
223 set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
224 set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
225 set_msr_entry(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
226 if (kvm_has_msr_star)
227 set_msr_entry(&msrs[n++], MSR_STAR, env->star);
228 set_msr_entry(&msrs[n++], MSR_IA32_TSC, env->tsc);
229 #ifdef TARGET_X86_64
230 set_msr_entry(&msrs[n++], MSR_CSTAR, env->cstar);
231 set_msr_entry(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
232 set_msr_entry(&msrs[n++], MSR_FMASK, env->fmask);
233 set_msr_entry(&msrs[n++], MSR_LSTAR , env->lstar);
234 #endif
236 rc = kvm_set_msrs(kvm_context, 0, msrs, n);
237 if (rc == -1)
238 perror("kvm_set_msrs FAILED");
242 static void save_regs(CPUState *env)
244 struct kvm_regs regs;
245 struct kvm_sregs sregs;
246 struct kvm_msr_entry msrs[MSR_COUNT];
247 uint32_t hflags;
248 uint32_t i, n, rc;
250 kvm_get_regs(kvm_context, 0, &regs);
252 env->regs[R_EAX] = regs.rax;
253 env->regs[R_EBX] = regs.rbx;
254 env->regs[R_ECX] = regs.rcx;
255 env->regs[R_EDX] = regs.rdx;
256 env->regs[R_ESI] = regs.rsi;
257 env->regs[R_EDI] = regs.rdi;
258 env->regs[R_ESP] = regs.rsp;
259 env->regs[R_EBP] = regs.rbp;
260 #ifdef TARGET_X86_64
261 env->regs[8] = regs.r8;
262 env->regs[9] = regs.r9;
263 env->regs[10] = regs.r10;
264 env->regs[11] = regs.r11;
265 env->regs[12] = regs.r12;
266 env->regs[13] = regs.r13;
267 env->regs[14] = regs.r14;
268 env->regs[15] = regs.r15;
269 #endif
271 env->eflags = regs.rflags;
272 env->eip = regs.rip;
274 kvm_get_sregs(kvm_context, 0, &sregs);
276 memcpy(env->kvm_interrupt_bitmap, sregs.interrupt_bitmap, sizeof(env->kvm_interrupt_bitmap));
278 get_seg(&env->segs[R_CS], &sregs.cs);
279 get_seg(&env->segs[R_DS], &sregs.ds);
280 get_seg(&env->segs[R_ES], &sregs.es);
281 get_seg(&env->segs[R_FS], &sregs.fs);
282 get_seg(&env->segs[R_GS], &sregs.gs);
283 get_seg(&env->segs[R_SS], &sregs.ss);
285 get_seg(&env->tr, &sregs.tr);
286 get_seg(&env->ldt, &sregs.ldt);
288 env->idt.limit = sregs.idt.limit;
289 env->idt.base = sregs.idt.base;
290 env->gdt.limit = sregs.gdt.limit;
291 env->gdt.base = sregs.gdt.base;
293 env->cr[0] = sregs.cr0;
294 env->cr[2] = sregs.cr2;
295 env->cr[3] = sregs.cr3;
296 env->cr[4] = sregs.cr4;
298 cpu_set_apic_base(env, sregs.apic_base);
300 env->efer = sregs.efer;
301 cpu_set_apic_tpr(env, sregs.cr8);
303 #define HFLAG_COPY_MASK ~( \
304 HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
305 HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
306 HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
307 HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
311 hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
312 hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
313 hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
314 (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
315 hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
316 hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
317 (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);
319 if (env->efer & MSR_EFER_LMA) {
320 hflags |= HF_LMA_MASK;
323 if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
324 hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
325 } else {
326 hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
327 (DESC_B_SHIFT - HF_CS32_SHIFT);
328 hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
329 (DESC_B_SHIFT - HF_SS32_SHIFT);
330 if (!(env->cr[0] & CR0_PE_MASK) ||
331 (env->eflags & VM_MASK) ||
332 !(hflags & HF_CS32_MASK)) {
333 hflags |= HF_ADDSEG_MASK;
334 } else {
335 hflags |= ((env->segs[R_DS].base |
336 env->segs[R_ES].base |
337 env->segs[R_SS].base) != 0) <<
338 HF_ADDSEG_SHIFT;
341 env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;
342 CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
343 DF = 1 - (2 * ((env->eflags >> 10) & 1));
344 CC_OP = CC_OP_EFLAGS;
345 env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
347 tlb_flush(env, 1);
349 /* msrs */
350 n = 0;
351 msrs[n++].index = MSR_IA32_SYSENTER_CS;
352 msrs[n++].index = MSR_IA32_SYSENTER_ESP;
353 msrs[n++].index = MSR_IA32_SYSENTER_EIP;
354 if (kvm_has_msr_star)
355 msrs[n++].index = MSR_STAR;
356 msrs[n++].index = MSR_IA32_TSC;
357 #ifdef TARGET_X86_64
358 msrs[n++].index = MSR_CSTAR;
359 msrs[n++].index = MSR_KERNELGSBASE;
360 msrs[n++].index = MSR_FMASK;
361 msrs[n++].index = MSR_LSTAR;
362 #endif
363 rc = kvm_get_msrs(kvm_context, 0, msrs, n);
364 if (rc == -1) {
365 perror("kvm_get_msrs FAILED");
367 else {
368 n = rc; /* actual number of MSRs */
369 for (i=0 ; i<n; i++) {
370 if (get_msr_entry(&msrs[i], env))
371 return;
376 #include <signal.h>
379 static int try_push_interrupts(void *opaque)
381 CPUState **envs = opaque, *env;
382 env = envs[0];
384 if (env->ready_for_interrupt_injection &&
385 (env->interrupt_request & CPU_INTERRUPT_HARD) &&
386 (env->eflags & IF_MASK)) {
387 env->interrupt_request &= ~CPU_INTERRUPT_HARD;
388 // for now using cpu 0
389 kvm_inject_irq(kvm_context, 0, cpu_get_pic_interrupt(env));
392 return (env->interrupt_request & CPU_INTERRUPT_HARD) != 0;
395 static void post_kvm_run(void *opaque, struct kvm_run *kvm_run)
397 CPUState **envs = opaque, *env;
398 env = envs[0];
400 env->eflags = (kvm_run->if_flag) ? env->eflags | IF_MASK:env->eflags & ~IF_MASK;
401 env->ready_for_interrupt_injection = kvm_run->ready_for_interrupt_injection;
402 cpu_set_apic_tpr(env, kvm_run->cr8);
403 cpu_set_apic_base(env, kvm_run->apic_base);
406 static void pre_kvm_run(void *opaque, struct kvm_run *kvm_run)
408 CPUState **envs = opaque, *env;
409 env = envs[0];
411 kvm_run->cr8 = cpu_get_apic_tpr(env);
414 void kvm_load_registers(CPUState *env)
416 load_regs(env);
419 void kvm_save_registers(CPUState *env)
421 save_regs(env);
424 int kvm_cpu_exec(CPUState *env)
426 int pending = (!env->ready_for_interrupt_injection ||
427 ((env->interrupt_request & CPU_INTERRUPT_HARD) &&
428 (env->eflags & IF_MASK)));
430 if (!pending && (env->interrupt_request & CPU_INTERRUPT_EXIT)) {
431 env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
432 env->exception_index = EXCP_INTERRUPT;
433 cpu_loop_exit();
437 if (!saved_env[0])
438 saved_env[0] = env;
440 kvm_run(kvm_context, 0);
442 return 0;
446 static int kvm_cpuid(void *opaque, uint64_t *rax, uint64_t *rbx,
447 uint64_t *rcx, uint64_t *rdx)
449 CPUState **envs = opaque;
450 CPUState *saved_env;
451 uint32_t eax = *rax;
453 saved_env = env;
454 env = envs[0];
456 env->regs[R_EAX] = *rax;
457 env->regs[R_EBX] = *rbx;
458 env->regs[R_ECX] = *rcx;
459 env->regs[R_EDX] = *rdx;
460 helper_cpuid();
461 *rdx = env->regs[R_EDX];
462 *rcx = env->regs[R_ECX];
463 *rbx = env->regs[R_EBX];
464 *rax = env->regs[R_EAX];
465 // don't report long mode/syscall/nx if no native support
466 if (eax == 0x80000001) {
467 unsigned long h_eax = eax, h_edx;
470 // push/pop hack to workaround gcc 3 register pressure trouble
471 asm (
472 #ifdef __x86_64__
473 "push %%rbx; push %%rcx; cpuid; pop %%rcx; pop %%rbx"
474 #else
475 "push %%ebx; push %%ecx; cpuid; pop %%ecx; pop %%ebx"
476 #endif
477 : "+a"(h_eax), "=d"(h_edx));
479 // long mode
480 if ((h_edx & 0x20000000) == 0)
481 *rdx &= ~0x20000000ull;
482 // syscall
483 if ((h_edx & 0x00000800) == 0)
484 *rdx &= ~0x00000800ull;
485 // nx
486 if ((h_edx & 0x00100000) == 0)
487 *rdx &= ~0x00100000ull;
489 env = saved_env;
490 return 0;
493 static int kvm_debug(void *opaque, int vcpu)
495 CPUState **envs = opaque;
497 env = envs[0];
498 env->exception_index = EXCP_DEBUG;
499 return 1;
/* Port-I/O callbacks: forward guest IN/OUT instructions to qemu's I/O
 * port dispatch.  All handlers report success (return 0). */
static int kvm_inb(void *opaque, uint16_t addr, uint8_t *data)
{
    *data = cpu_inb(0, addr);
    return 0;
}

static int kvm_inw(void *opaque, uint16_t addr, uint16_t *data)
{
    *data = cpu_inw(0, addr);
    return 0;
}

static int kvm_inl(void *opaque, uint16_t addr, uint32_t *data)
{
    *data = cpu_inl(0, addr);
    return 0;
}

static int kvm_outb(void *opaque, uint16_t addr, uint8_t data)
{
    cpu_outb(0, addr, data);
    return 0;
}

static int kvm_outw(void *opaque, uint16_t addr, uint16_t data)
{
    cpu_outw(0, addr, data);
    return 0;
}

static int kvm_outl(void *opaque, uint16_t addr, uint32_t data)
{
    cpu_outl(0, addr, data);
    return 0;
}
/* MMIO callbacks: forward guest physical-memory accesses that KVM could
 * not handle to qemu's physical load/store helpers.  All report success. */
static int kvm_readb(void *opaque, uint64_t addr, uint8_t *data)
{
    *data = ldub_phys(addr);
    return 0;
}

static int kvm_readw(void *opaque, uint64_t addr, uint16_t *data)
{
    *data = lduw_phys(addr);
    return 0;
}

static int kvm_readl(void *opaque, uint64_t addr, uint32_t *data)
{
    *data = ldl_phys(addr);
    return 0;
}

static int kvm_readq(void *opaque, uint64_t addr, uint64_t *data)
{
    *data = ldq_phys(addr);
    return 0;
}

static int kvm_writeb(void *opaque, uint64_t addr, uint8_t data)
{
    stb_phys(addr, data);
    return 0;
}

static int kvm_writew(void *opaque, uint64_t addr, uint16_t data)
{
    stw_phys(addr, data);
    return 0;
}

static int kvm_writel(void *opaque, uint64_t addr, uint32_t data)
{
    stl_phys(addr, data);
    return 0;
}

static int kvm_writeq(void *opaque, uint64_t addr, uint64_t data)
{
    stq_phys(addr, data);
    return 0;
}
/* libkvm callback: always ask to break out to the main loop when an I/O
 * window opens. */
static int kvm_io_window(void *opaque)
{
    return 1;
}
592 static int kvm_halt(void *opaque, int vcpu)
594 CPUState **envs = opaque, *env;
596 env = envs[0];
597 if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
598 (env->eflags & IF_MASK))) {
599 env->hflags |= HF_HALTED_MASK;
600 env->exception_index = EXCP_HLT;
603 return 1;
/* libkvm callback for a guest-initiated shutdown (e.g. triple fault):
 * request a system reset so the guest can reboot. */
static int kvm_shutdown(void *opaque, int vcpu)
{
    qemu_system_reset_request();
    return 1;
}
612 static struct kvm_callbacks qemu_kvm_ops = {
613 .cpuid = kvm_cpuid,
614 .debug = kvm_debug,
615 .inb = kvm_inb,
616 .inw = kvm_inw,
617 .inl = kvm_inl,
618 .outb = kvm_outb,
619 .outw = kvm_outw,
620 .outl = kvm_outl,
621 .readb = kvm_readb,
622 .readw = kvm_readw,
623 .readl = kvm_readl,
624 .readq = kvm_readq,
625 .writeb = kvm_writeb,
626 .writew = kvm_writew,
627 .writel = kvm_writel,
628 .writeq = kvm_writeq,
629 .halt = kvm_halt,
630 .shutdown = kvm_shutdown,
631 .io_window = kvm_io_window,
632 .try_push_interrupts = try_push_interrupts,
633 .post_kvm_run = post_kvm_run,
634 .pre_kvm_run = pre_kvm_run,
637 int kvm_qemu_init()
639 /* Try to initialize kvm */
640 kvm_context = kvm_init(&qemu_kvm_ops, saved_env);
641 if (!kvm_context) {
642 return -1;
645 return 0;
648 int kvm_qemu_create_context(void)
650 int i;
652 if (kvm_create(kvm_context, phys_ram_size, (void**)&phys_ram_base) < 0) {
653 kvm_qemu_destroy();
654 return -1;
656 kvm_msr_list = kvm_get_msr_list(kvm_context);
657 if (!kvm_msr_list) {
658 kvm_qemu_destroy();
659 return -1;
661 for (i = 0; i < kvm_msr_list->nmsrs; ++i)
662 if (kvm_msr_list->indices[i] == MSR_STAR)
663 kvm_has_msr_star = 1;
664 return 0;
667 void kvm_qemu_destroy(void)
669 kvm_finalize(kvm_context);
672 int kvm_update_debugger(CPUState *env)
674 struct kvm_debug_guest dbg;
675 int i;
677 dbg.enabled = 0;
678 if (env->nb_breakpoints || env->singlestep_enabled) {
679 dbg.enabled = 1;
680 for (i = 0; i < 4 && i < env->nb_breakpoints; ++i) {
681 dbg.breakpoints[i].enabled = 1;
682 dbg.breakpoints[i].address = env->breakpoints[i];
684 dbg.singlestep = env->singlestep_enabled;
686 return kvm_guest_debug(kvm_context, 0, &dbg);
690 #endif