2 * qemu/kvm integration, x86 specific code
4 * Copyright (C) 2006-2008 Qumranet Technologies
6 * Licensed under the terms of the GNU GPL version 2 or higher.
10 #include "config-host.h"
18 #include <sys/utsname.h>
19 #include <linux/kvm_para.h>
/* MSR index used for the TSC entries built in kvm_arch_load_regs() and
 * kvm_arch_save_regs(). */
21 #define MSR_IA32_TSC 0x10
/* MSR index list; assigned from kvm_get_msr_list() in
 * kvm_arch_qemu_create_context(). */
23 static struct kvm_msr_list
*kvm_msr_list
;
/* Defined elsewhere; when non-zero it is passed to kvm_set_shadow_pages(). */
24 extern unsigned int kvm_shadow_memory
;
/* Defined elsewhere; the KVM context handed to every kvm_* call in this file. */
25 extern kvm_context_t kvm_context
;
/* Gates the MSR_STAR entry in the MSR load/save lists.
 * NOTE(review): the assignment is not visible in this listing -- presumably
 * set by the MSR_STAR scan in kvm_arch_qemu_create_context(); confirm. */
26 static int kvm_has_msr_star
;
/* Non-zero when utsname.machine compares equal to "x86_64"; gates the
 * CSTAR/KERNELGSBASE/FMASK/LSTAR MSR entries and the 0x20000000 CPUID bit. */
28 static int lm_capable_kernel
;
/*
 * x86-specific part of KVM context creation.
 *
 * Visible steps: derive lm_capable_kernel from whether utsname.machine
 * equals "x86_64", forward a non-zero kvm_shadow_memory to
 * kvm_set_shadow_pages(), fetch the MSR index list with
 * kvm_get_msr_list(), and scan that list for MSR_STAR.
 *
 * NOTE(review): this listing skips source lines (numbering jumps, no braces),
 * so the call that fills utsname, the action taken on an MSR_STAR match
 * (presumably setting kvm_has_msr_star) and the return value are not visible
 * here -- confirm against the full file.
 */
30 int kvm_arch_qemu_create_context(void)
33 struct utsname utsname
;
/* host kernel is treated as long-mode capable iff the machine string is "x86_64" */
36 lm_capable_kernel
= strcmp(utsname
.machine
, "x86_64") == 0;
/* only override the shadow page count when a value was configured */
38 if (kvm_shadow_memory
)
39 kvm_set_shadow_pages(kvm_context
, kvm_shadow_memory
);
41 kvm_msr_list
= kvm_get_msr_list(kvm_context
);
/* look for MSR_STAR among the reported MSR indices */
44 for (i
= 0; i
< kvm_msr_list
->nmsrs
; ++i
)
45 if (kvm_msr_list
->indices
[i
] == MSR_STAR
)
/*
 * Fill one kvm_msr_entry slot.  The call sites in kvm_arch_load_regs() pass
 * the slot, an MSR index and the value taken from CPUState.
 * NOTE(review): the remainder of the parameter list and the body are missing
 * from this listing -- confirm against the full file.
 */
50 static void set_msr_entry(struct kvm_msr_entry
*entry
, uint32_t index
,
57 /*
 * Copy the value of one MSR entry (entry->data) into the CPUState field
 * selected by entry->index.  An index that is not recognised is reported
 * with a warning printed via printf().
 * Returns 0 on success, non-0 on failure.
 * NOTE(review): several case labels, the break/return statements and the
 * default label are absent from this listing -- confirm against the full
 * file before relying on the control flow shown here.
 */
58 static int get_msr_entry(struct kvm_msr_entry
*entry
, CPUState
*env
)
60 switch (entry
->index
) {
61 case MSR_IA32_SYSENTER_CS
:
62 env
->sysenter_cs
= entry
->data
;
64 case MSR_IA32_SYSENTER_ESP
:
65 env
->sysenter_esp
= entry
->data
;
67 case MSR_IA32_SYSENTER_EIP
:
68 env
->sysenter_eip
= entry
->data
;
71 env
->star
= entry
->data
;
75 env
->cstar
= entry
->data
;
77 case MSR_KERNELGSBASE
:
78 env
->kernelgsbase
= entry
->data
;
81 env
->fmask
= entry
->data
;
84 env
->lstar
= entry
->data
;
88 env
->tsc
= entry
->data
;
/* presumably the default case: report the index that had no handler */
91 printf("Warning unknown msr index 0x%x\n", entry
->index
);
/*
 * Fill a kvm_segment (lhs) from a SegmentCache (rhs) for the case where
 * kvm_arch_load_regs() finds VM_MASK set in env->eflags.  The visible
 * statements copy selector, base and limit unchanged.
 * NOTE(review): the listing jumps from source line 107 to 119, so further
 * field assignments may exist -- confirm against the full file.
 */
103 static void set_v8086_seg(struct kvm_segment
*lhs
, const SegmentCache
*rhs
)
105 lhs
->selector
= rhs
->selector
;
106 lhs
->base
= rhs
->base
;
107 lhs
->limit
= rhs
->limit
;
/*
 * Convert a SegmentCache (rhs) into a kvm_segment (lhs).
 * selector, base and limit are copied directly; type, present, db, s, l, g
 * and avl are unpacked from rhs->flags using the DESC_* shifts and masks.
 * NOTE(review): the closing lines of the function are not in this listing,
 * so additional field assignments may follow -- confirm.
 */
119 static void set_seg(struct kvm_segment
*lhs
, const SegmentCache
*rhs
)
121 unsigned flags
= rhs
->flags
;
122 lhs
->selector
= rhs
->selector
;
123 lhs
->base
= rhs
->base
;
124 lhs
->limit
= rhs
->limit
;
/* 4-bit descriptor type field */
125 lhs
->type
= (flags
>> DESC_TYPE_SHIFT
) & 15;
126 lhs
->present
= (flags
& DESC_P_MASK
) != 0;
/* dpl is taken from the low two bits of the selector, not from flags */
127 lhs
->dpl
= rhs
->selector
& 3;
128 lhs
->db
= (flags
>> DESC_B_SHIFT
) & 1;
129 lhs
->s
= (flags
& DESC_S_MASK
) != 0;
130 lhs
->l
= (flags
>> DESC_L_SHIFT
) & 1;
131 lhs
->g
= (flags
& DESC_G_MASK
) != 0;
132 lhs
->avl
= (flags
& DESC_AVL_MASK
) != 0;
/*
 * Convert a kvm_segment (rhs) back into a SegmentCache (lhs); the inverse
 * direction of set_seg().  selector, base and limit are copied directly.
 * The remaining expression packs type, present, dpl, db, s, l, g and avl
 * into one word using the DESC_* shifts and masks.
 * NOTE(review): the line that receives this packed value is missing from the
 * listing -- presumably `lhs->flags =`; confirm against the full file.
 */
136 static void get_seg(SegmentCache
*lhs
, const struct kvm_segment
*rhs
)
138 lhs
->selector
= rhs
->selector
;
139 lhs
->base
= rhs
->base
;
140 lhs
->limit
= rhs
->limit
;
/* single-bit fields are multiplied by their mask, multi-bit fields shifted */
142 (rhs
->type
<< DESC_TYPE_SHIFT
)
143 | (rhs
->present
* DESC_P_MASK
)
144 | (rhs
->dpl
<< DESC_DPL_SHIFT
)
145 | (rhs
->db
<< DESC_B_SHIFT
)
146 | (rhs
->s
* DESC_S_MASK
)
147 | (rhs
->l
<< DESC_L_SHIFT
)
148 | (rhs
->g
* DESC_G_MASK
)
149 | (rhs
->avl
* DESC_AVL_MASK
);
152 /* The reset values used by qemu are not compatible with SVM;
153 * this function fixes up the segment descriptor values.
 * kvm_arch_load_regs() applies it to cs, ds, es, fs, gs and ss when
 * CR0_PG_MASK is clear in env->cr[0].
 * NOTE(review): the body is not part of this listing. */
154 static void fix_realmode_dataseg(struct kvm_segment
*seg
)
161 void kvm_arch_load_regs(CPUState
*env
)
163 struct kvm_regs regs
;
165 struct kvm_sregs sregs
;
166 struct kvm_msr_entry msrs
[MSR_COUNT
];
169 regs
.rax
= env
->regs
[R_EAX
];
170 regs
.rbx
= env
->regs
[R_EBX
];
171 regs
.rcx
= env
->regs
[R_ECX
];
172 regs
.rdx
= env
->regs
[R_EDX
];
173 regs
.rsi
= env
->regs
[R_ESI
];
174 regs
.rdi
= env
->regs
[R_EDI
];
175 regs
.rsp
= env
->regs
[R_ESP
];
176 regs
.rbp
= env
->regs
[R_EBP
];
178 regs
.r8
= env
->regs
[8];
179 regs
.r9
= env
->regs
[9];
180 regs
.r10
= env
->regs
[10];
181 regs
.r11
= env
->regs
[11];
182 regs
.r12
= env
->regs
[12];
183 regs
.r13
= env
->regs
[13];
184 regs
.r14
= env
->regs
[14];
185 regs
.r15
= env
->regs
[15];
188 regs
.rflags
= env
->eflags
;
191 kvm_set_regs(kvm_context
, env
->cpu_index
, &regs
);
193 memset(&fpu
, 0, sizeof fpu
);
194 fpu
.fsw
= env
->fpus
& ~(7 << 11);
195 fpu
.fsw
|= (env
->fpstt
& 7) << 11;
197 for (i
= 0; i
< 8; ++i
)
198 fpu
.ftwx
|= (!env
->fptags
[i
]) << i
;
199 memcpy(fpu
.fpr
, env
->fpregs
, sizeof env
->fpregs
);
200 memcpy(fpu
.xmm
, env
->xmm_regs
, sizeof env
->xmm_regs
);
201 fpu
.mxcsr
= env
->mxcsr
;
202 kvm_set_fpu(kvm_context
, env
->cpu_index
, &fpu
);
204 memcpy(sregs
.interrupt_bitmap
, env
->kvm_interrupt_bitmap
, sizeof(sregs
.interrupt_bitmap
));
206 if ((env
->eflags
& VM_MASK
)) {
207 set_v8086_seg(&sregs
.cs
, &env
->segs
[R_CS
]);
208 set_v8086_seg(&sregs
.ds
, &env
->segs
[R_DS
]);
209 set_v8086_seg(&sregs
.es
, &env
->segs
[R_ES
]);
210 set_v8086_seg(&sregs
.fs
, &env
->segs
[R_FS
]);
211 set_v8086_seg(&sregs
.gs
, &env
->segs
[R_GS
]);
212 set_v8086_seg(&sregs
.ss
, &env
->segs
[R_SS
]);
214 set_seg(&sregs
.cs
, &env
->segs
[R_CS
]);
215 set_seg(&sregs
.ds
, &env
->segs
[R_DS
]);
216 set_seg(&sregs
.es
, &env
->segs
[R_ES
]);
217 set_seg(&sregs
.fs
, &env
->segs
[R_FS
]);
218 set_seg(&sregs
.gs
, &env
->segs
[R_GS
]);
219 set_seg(&sregs
.ss
, &env
->segs
[R_SS
]);
221 if (env
->cr
[0] & CR0_PE_MASK
) {
222 /* force ss cpl to cs cpl */
223 sregs
.ss
.selector
= (sregs
.ss
.selector
& ~3) |
224 (sregs
.cs
.selector
& 3);
225 sregs
.ss
.dpl
= sregs
.ss
.selector
& 3;
228 if (!(env
->cr
[0] & CR0_PG_MASK
)) {
229 fix_realmode_dataseg(&sregs
.cs
);
230 fix_realmode_dataseg(&sregs
.ds
);
231 fix_realmode_dataseg(&sregs
.es
);
232 fix_realmode_dataseg(&sregs
.fs
);
233 fix_realmode_dataseg(&sregs
.gs
);
234 fix_realmode_dataseg(&sregs
.ss
);
238 set_seg(&sregs
.tr
, &env
->tr
);
239 set_seg(&sregs
.ldt
, &env
->ldt
);
241 sregs
.idt
.limit
= env
->idt
.limit
;
242 sregs
.idt
.base
= env
->idt
.base
;
243 sregs
.gdt
.limit
= env
->gdt
.limit
;
244 sregs
.gdt
.base
= env
->gdt
.base
;
246 sregs
.cr0
= env
->cr
[0];
247 sregs
.cr2
= env
->cr
[2];
248 sregs
.cr3
= env
->cr
[3];
249 sregs
.cr4
= env
->cr
[4];
251 sregs
.apic_base
= cpu_get_apic_base(env
);
252 sregs
.efer
= env
->efer
;
253 sregs
.cr8
= cpu_get_apic_tpr(env
);
255 kvm_set_sregs(kvm_context
, env
->cpu_index
, &sregs
);
259 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_CS
, env
->sysenter_cs
);
260 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_ESP
, env
->sysenter_esp
);
261 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_EIP
, env
->sysenter_eip
);
262 if (kvm_has_msr_star
)
263 set_msr_entry(&msrs
[n
++], MSR_STAR
, env
->star
);
264 set_msr_entry(&msrs
[n
++], MSR_IA32_TSC
, env
->tsc
);
266 if (lm_capable_kernel
) {
267 set_msr_entry(&msrs
[n
++], MSR_CSTAR
, env
->cstar
);
268 set_msr_entry(&msrs
[n
++], MSR_KERNELGSBASE
, env
->kernelgsbase
);
269 set_msr_entry(&msrs
[n
++], MSR_FMASK
, env
->fmask
);
270 set_msr_entry(&msrs
[n
++], MSR_LSTAR
, env
->lstar
);
274 rc
= kvm_set_msrs(kvm_context
, env
->cpu_index
, msrs
, n
);
276 perror("kvm_set_msrs FAILED");
280 void kvm_arch_save_regs(CPUState
*env
)
282 struct kvm_regs regs
;
284 struct kvm_sregs sregs
;
285 struct kvm_msr_entry msrs
[MSR_COUNT
];
289 kvm_get_regs(kvm_context
, env
->cpu_index
, &regs
);
291 env
->regs
[R_EAX
] = regs
.rax
;
292 env
->regs
[R_EBX
] = regs
.rbx
;
293 env
->regs
[R_ECX
] = regs
.rcx
;
294 env
->regs
[R_EDX
] = regs
.rdx
;
295 env
->regs
[R_ESI
] = regs
.rsi
;
296 env
->regs
[R_EDI
] = regs
.rdi
;
297 env
->regs
[R_ESP
] = regs
.rsp
;
298 env
->regs
[R_EBP
] = regs
.rbp
;
300 env
->regs
[8] = regs
.r8
;
301 env
->regs
[9] = regs
.r9
;
302 env
->regs
[10] = regs
.r10
;
303 env
->regs
[11] = regs
.r11
;
304 env
->regs
[12] = regs
.r12
;
305 env
->regs
[13] = regs
.r13
;
306 env
->regs
[14] = regs
.r14
;
307 env
->regs
[15] = regs
.r15
;
310 env
->eflags
= regs
.rflags
;
313 kvm_get_fpu(kvm_context
, env
->cpu_index
, &fpu
);
314 env
->fpstt
= (fpu
.fsw
>> 11) & 7;
317 for (i
= 0; i
< 8; ++i
)
318 env
->fptags
[i
] = !((fpu
.ftwx
>> i
) & 1);
319 memcpy(env
->fpregs
, fpu
.fpr
, sizeof env
->fpregs
);
320 memcpy(env
->xmm_regs
, fpu
.xmm
, sizeof env
->xmm_regs
);
321 env
->mxcsr
= fpu
.mxcsr
;
323 kvm_get_sregs(kvm_context
, env
->cpu_index
, &sregs
);
325 memcpy(env
->kvm_interrupt_bitmap
, sregs
.interrupt_bitmap
, sizeof(env
->kvm_interrupt_bitmap
));
327 get_seg(&env
->segs
[R_CS
], &sregs
.cs
);
328 get_seg(&env
->segs
[R_DS
], &sregs
.ds
);
329 get_seg(&env
->segs
[R_ES
], &sregs
.es
);
330 get_seg(&env
->segs
[R_FS
], &sregs
.fs
);
331 get_seg(&env
->segs
[R_GS
], &sregs
.gs
);
332 get_seg(&env
->segs
[R_SS
], &sregs
.ss
);
334 get_seg(&env
->tr
, &sregs
.tr
);
335 get_seg(&env
->ldt
, &sregs
.ldt
);
337 env
->idt
.limit
= sregs
.idt
.limit
;
338 env
->idt
.base
= sregs
.idt
.base
;
339 env
->gdt
.limit
= sregs
.gdt
.limit
;
340 env
->gdt
.base
= sregs
.gdt
.base
;
342 env
->cr
[0] = sregs
.cr0
;
343 env
->cr
[2] = sregs
.cr2
;
344 env
->cr
[3] = sregs
.cr3
;
345 env
->cr
[4] = sregs
.cr4
;
347 cpu_set_apic_base(env
, sregs
.apic_base
);
349 env
->efer
= sregs
.efer
;
350 //cpu_set_apic_tpr(env, sregs.cr8);
352 #define HFLAG_COPY_MASK ~( \
353 HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
354 HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
355 HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
356 HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
360 hflags
= (env
->segs
[R_CS
].flags
>> DESC_DPL_SHIFT
) & HF_CPL_MASK
;
361 hflags
|= (env
->cr
[0] & CR0_PE_MASK
) << (HF_PE_SHIFT
- CR0_PE_SHIFT
);
362 hflags
|= (env
->cr
[0] << (HF_MP_SHIFT
- CR0_MP_SHIFT
)) &
363 (HF_MP_MASK
| HF_EM_MASK
| HF_TS_MASK
);
364 hflags
|= (env
->eflags
& (HF_TF_MASK
| HF_VM_MASK
| HF_IOPL_MASK
));
365 hflags
|= (env
->cr
[4] & CR4_OSFXSR_MASK
) <<
366 (HF_OSFXSR_SHIFT
- CR4_OSFXSR_SHIFT
);
368 if (env
->efer
& MSR_EFER_LMA
) {
369 hflags
|= HF_LMA_MASK
;
372 if ((hflags
& HF_LMA_MASK
) && (env
->segs
[R_CS
].flags
& DESC_L_MASK
)) {
373 hflags
|= HF_CS32_MASK
| HF_SS32_MASK
| HF_CS64_MASK
;
375 hflags
|= (env
->segs
[R_CS
].flags
& DESC_B_MASK
) >>
376 (DESC_B_SHIFT
- HF_CS32_SHIFT
);
377 hflags
|= (env
->segs
[R_SS
].flags
& DESC_B_MASK
) >>
378 (DESC_B_SHIFT
- HF_SS32_SHIFT
);
379 if (!(env
->cr
[0] & CR0_PE_MASK
) ||
380 (env
->eflags
& VM_MASK
) ||
381 !(hflags
& HF_CS32_MASK
)) {
382 hflags
|= HF_ADDSEG_MASK
;
384 hflags
|= ((env
->segs
[R_DS
].base
|
385 env
->segs
[R_ES
].base
|
386 env
->segs
[R_SS
].base
) != 0) <<
390 env
->hflags
= (env
->hflags
& HFLAG_COPY_MASK
) | hflags
;
391 env
->cc_src
= env
->eflags
& (CC_O
| CC_S
| CC_Z
| CC_A
| CC_P
| CC_C
);
392 env
->df
= 1 - (2 * ((env
->eflags
>> 10) & 1));
393 env
->cc_op
= CC_OP_EFLAGS
;
394 env
->eflags
&= ~(DF_MASK
| CC_O
| CC_S
| CC_Z
| CC_A
| CC_P
| CC_C
);
398 msrs
[n
++].index
= MSR_IA32_SYSENTER_CS
;
399 msrs
[n
++].index
= MSR_IA32_SYSENTER_ESP
;
400 msrs
[n
++].index
= MSR_IA32_SYSENTER_EIP
;
401 if (kvm_has_msr_star
)
402 msrs
[n
++].index
= MSR_STAR
;
403 msrs
[n
++].index
= MSR_IA32_TSC
;
405 if (lm_capable_kernel
) {
406 msrs
[n
++].index
= MSR_CSTAR
;
407 msrs
[n
++].index
= MSR_KERNELGSBASE
;
408 msrs
[n
++].index
= MSR_FMASK
;
409 msrs
[n
++].index
= MSR_LSTAR
;
412 rc
= kvm_get_msrs(kvm_context
, env
->cpu_index
, msrs
, n
);
414 perror("kvm_get_msrs FAILED");
417 n
= rc
; /* actual number of MSRs */
418 for (i
=0 ; i
<n
; i
++) {
419 if (get_msr_entry(&msrs
[i
], env
))
/*
 * Run a CPUID query on the host for `function` and hand the four result
 * registers back through eax/ebx/ecx/edx.
 * Callers in this file pass NULL for outputs they do not need, so the
 * out-pointers are presumably optional -- the NULL checks are not visible in
 * this listing; confirm.
 * Two inline-asm variants are visible: one binds the second result directly
 * to ebx ("=b"), the other copies ebx to esi first and returns the second
 * result through a general register ("=r").
 * NOTE(review): the preprocessor condition selecting between them is missing
 * from the listing -- presumably a 64-bit vs 32-bit build split; confirm.
 */
425 static void host_cpuid(uint32_t function
, uint32_t *eax
, uint32_t *ebx
,
426 uint32_t *ecx
, uint32_t *edx
)
432 : "=a"(vec
[0]), "=b"(vec
[1]), "=c"(vec
[2]), "=d"(vec
[3])
435 asm volatile("movl %%ebx, %%esi \n\t"
437 "movl %%ebx, %1 \n\t"
439 : "=a"(vec
[0]), "=r"(vec
[1]), "=c"(vec
[2]), "=d"(vec
[3])
/*
 * Build one kvm_cpuid_entry (e) for CPUID leaf `function`.
 * The leaf is evaluated through qemu_kvm_cpuid_on_env() on env, and the
 * resulting EAX..EDX are copied into e.  For leaf 0x80000001 three EDX bits
 * are cleared when the host's own CPUID does not report them; bit
 * 0x20000000 is also cleared when lm_capable_kernel is zero.
 * NOTE(review): the last parameter (env) and several body lines are missing
 * from this listing; confirm against the full file.
 */
454 static void do_cpuid_ent(struct kvm_cpuid_entry
*e
, uint32_t function
,
457 env
->regs
[R_EAX
] = function
;
458 qemu_kvm_cpuid_on_env(env
);
459 e
->function
= function
;
460 e
->eax
= env
->regs
[R_EAX
];
461 e
->ebx
= env
->regs
[R_EBX
];
462 e
->ecx
= env
->regs
[R_ECX
];
463 e
->edx
= env
->regs
[R_EDX
];
464 if (function
== 0x80000001) {
465 uint32_t h_eax
, h_edx
;
/* query the same leaf on the host; only eax and edx are requested */
467 host_cpuid(function
, &h_eax
, NULL
, NULL
, &h_edx
);
/* presumably the long-mode bit: needs both host support and an x86_64 kernel */
470 if ((h_edx
& 0x20000000) == 0 || !lm_capable_kernel
)
471 e
->edx
&= ~0x20000000u
;
/* NOTE(review): bit 11 -- presumably SYSCALL; verify against the CPUID spec */
473 if ((h_edx
& 0x00000800) == 0)
474 e
->edx
&= ~0x00000800u
;
/* NOTE(review): bit 20 -- presumably NX; verify against the CPUID spec */
476 if ((h_edx
& 0x00100000) == 0)
477 e
->edx
&= ~0x00100000u
;
482 // sysenter isn't supported in compatibility mode on AMD, and syscall
483 // isn't supported in compatibility mode on Intel, so advertise the
484 // actual cpu, and say goodbye to migration between different vendors
485 // if you use compatibility mode.
489 host_cpuid(0, NULL
, &bcd
[0], &bcd
[1], &bcd
[2]);
/*
 * Build the CPUID table for one vcpu and hand it to KVM.
 *
 * Visible steps:
 *  - when KVM_CPUID_SIGNATURE is defined, emit two paravirtual entries:
 *    the "KVMKVMKVM" signature leaf and the KVM_CPUID_FEATURES leaf, whose
 *    eax carries the clocksource bit if KVM_CAP_CLOCKSOURCE is reported;
 *  - read the limit from leaf 0 and add an entry for every leaf 0..limit;
 *  - do the same for the range starting at 0x80000000;
 *  - pass the table to kvm_setup_cpuid() for cenv->cpu_index.
 *
 * Fixes relative to the listing:
 *  - the signature is copied from a literal padded to 12 bytes; the old
 *    "KVMKVMKVM" literal is only 10 bytes long, so memcpy(..., 12) read past
 *    its end and left the last signature bytes indeterminate;
 *  - the `&copy` arguments, which the listing had mangled into a copyright
 *    sign, are restored.
 *
 * NOTE(review): declarations (copy, limit, i, cpuid_nent), the #endif lines
 * and the return statement are missing from this listing.  cpuid_ent has a
 * fixed 100 slots and no visible bound check on cpuid_nent -- confirm that
 * the two limits cannot exceed it.
 */
496 int kvm_arch_qemu_init_env(CPUState
*cenv
)
498 struct kvm_cpuid_entry cpuid_ent
[100];
499 #ifdef KVM_CPUID_SIGNATURE
500 struct kvm_cpuid_entry
*pv_ent
;
501 uint32_t signature
[3];
506 int has_clocksource
= 0;
507 #ifdef KVM_CAP_CLOCKSOURCE
508 has_clocksource
= kvm_check_extension(kvm_context
, KVM_CAP_CLOCKSOURCE
);
513 #ifdef KVM_CPUID_SIGNATURE
514 /* Paravirtualization CPUIDs */
/* literal is NUL-padded so that all 12 copied bytes are inside it */
515 memcpy(signature
, "KVMKVMKVM\0\0\0", 12);
516 pv_ent
= &cpuid_ent
[cpuid_nent
++];
517 memset(pv_ent
, 0, sizeof(*pv_ent
));
518 pv_ent
->function
= KVM_CPUID_SIGNATURE
;
520 pv_ent
->ebx
= signature
[0];
521 pv_ent
->ecx
= signature
[1];
522 pv_ent
->edx
= signature
[2];
524 pv_ent
= &cpuid_ent
[cpuid_nent
++];
525 memset(pv_ent
, 0, sizeof(*pv_ent
));
526 pv_ent
->function
= KVM_CPUID_FEATURES
;
527 pv_ent
->eax
= (has_clocksource
<< KVM_FEATURE_CLOCKSOURCE
);
/* leaf 0 reports the highest leaf of the first range in EAX */
530 copy
.regs
[R_EAX
] = 0;
531 qemu_kvm_cpuid_on_env(&copy
);
532 limit
= copy
.regs
[R_EAX
];
534 for (i
= 0; i
<= limit
; ++i
)
535 do_cpuid_ent(&cpuid_ent
[cpuid_nent
++], i
, &copy
);
/* leaf 0x80000000 reports the highest leaf of the second range in EAX */
537 copy
.regs
[R_EAX
] = 0x80000000;
538 qemu_kvm_cpuid_on_env(&copy
);
539 limit
= copy
.regs
[R_EAX
];
541 for (i
= 0x80000000; i
<= limit
; ++i
)
542 do_cpuid_ent(&cpuid_ent
[cpuid_nent
++], i
, &copy
);
544 kvm_setup_cpuid(kvm_context
, cenv
->cpu_index
, cpuid_nent
, cpuid_ent
);
/*
 * Halt hook.  Operates on cpu_single_env; the opaque and vcpu arguments are
 * not referenced in the visible lines.
 * Unless a hard interrupt is pending (CPU_INTERRUPT_HARD) while IF_MASK is
 * set in eflags, the CPU is flagged HF_HALTED_MASK and exception_index is
 * set to EXCP_HLT.
 * NOTE(review): the return statement is not part of this listing.
 */
548 int kvm_arch_halt(void *opaque
, int vcpu
)
550 CPUState
*env
= cpu_single_env
;
/* a deliverable interrupt means the halt would end immediately: skip it */
552 if (!((env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
553 (env
->eflags
& IF_MASK
))) {
554 env
->hflags
|= HF_HALTED_MASK
;
555 env
->exception_index
= EXCP_HLT
;
/*
 * Hook run before entering KVM for `vcpu`.
 * When kvm_irqchip_in_kernel() reports false, the TPR obtained from
 * cpu_get_apic_tpr() for cpu_single_env is written to the vcpu with
 * kvm_set_cr8().  The opaque argument is not referenced in the visible lines.
 */
560 void kvm_arch_pre_kvm_run(void *opaque
, int vcpu
)
562 CPUState
*env
= cpu_single_env
;
564 if (!kvm_irqchip_in_kernel(kvm_context
))
565 kvm_set_cr8(kvm_context
, vcpu
, cpu_get_apic_tpr(env
));
/*
 * Hook run after returning from KVM for `vcpu`.
 * Looks up the CPUState with qemu_kvm_cpu_env() and makes it cpu_single_env,
 * then refreshes from KVM: the IF_MASK bit of eflags, the
 * ready_for_interrupt_injection flag, the APIC TPR (from kvm_get_cr8) and
 * the APIC base (from kvm_get_apic_base).  The opaque argument is not
 * referenced in the visible lines.
 */
568 void kvm_arch_post_kvm_run(void *opaque
, int vcpu
)
570 CPUState
*env
= qemu_kvm_cpu_env(vcpu
);
571 cpu_single_env
= env
;
/* set or clear IF_MASK to match the interrupt flag reported by KVM */
573 env
->eflags
= kvm_get_interrupt_flag(kvm_context
, vcpu
)
574 ? env
->eflags
| IF_MASK
: env
->eflags
& ~IF_MASK
;
575 env
->ready_for_interrupt_injection
576 = kvm_is_ready_for_interrupt_injection(kvm_context
, vcpu
);
578 cpu_set_apic_tpr(env
, kvm_get_cr8(kvm_context
, vcpu
));
579 cpu_set_apic_base(env
, kvm_get_apic_base(kvm_context
, vcpu
));
/*
 * Tests whether env has a pending CPU_INTERRUPT_HARD or CPU_INTERRUPT_EXIT
 * request while IF_MASK is set in eflags.
 * NOTE(review): the return statements are missing from this listing --
 * presumably non-zero when the condition holds; confirm.
 */
582 int kvm_arch_has_work(CPUState
*env
)
584 if ((env
->interrupt_request
& (CPU_INTERRUPT_HARD
| CPU_INTERRUPT_EXIT
)) &&
585 (env
->eflags
& IF_MASK
))
/*
 * Try to inject a pending hard interrupt into the current vcpu
 * (cpu_single_env).
 * Injection happens only when ready_for_interrupt_injection is set, a
 * CPU_INTERRUPT_HARD request is pending and IF_MASK is set in eflags.  The
 * request bit is cleared, the vector is fetched with cpu_get_pic_interrupt()
 * and passed to kvm_inject_irq(); a failure message is printed via printf().
 * Returns non-zero when a CPU_INTERRUPT_HARD request is still pending on
 * exit.
 * NOTE(review): the check on `r` guarding the printf is not in this listing;
 * the opaque argument is not referenced in the visible lines.
 */
590 int kvm_arch_try_push_interrupts(void *opaque
)
592 CPUState
*env
= cpu_single_env
;
595 if (env
->ready_for_interrupt_injection
&&
596 (env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
597 (env
->eflags
& IF_MASK
)) {
/* consume the request before asking the PIC for the vector */
598 env
->interrupt_request
&= ~CPU_INTERRUPT_HARD
;
599 irq
= cpu_get_pic_interrupt(env
);
601 r
= kvm_inject_irq(kvm_context
, env
->cpu_index
, irq
);
603 printf("cpu %d fail inject %x\n", env
->cpu_index
, irq
);
607 return (env
->interrupt_request
& CPU_INTERRUPT_HARD
) != 0;
/*
 * Push env's current CS segment into KVM while keeping the rest of the
 * register state as KVM has it.
 * The CS SegmentCache is saved first because kvm_arch_save_regs() overwrites
 * env->segs[R_CS]; it is restored afterwards and the combined state is
 * written back with kvm_arch_load_regs().
 */
610 void kvm_arch_update_regs_for_sipi(CPUState
*env
)
612 SegmentCache cs
= env
->segs
[R_CS
];
614 kvm_arch_save_regs(env
);
615 env
->segs
[R_CS
] = cs
;
617 kvm_arch_load_regs(env
);
/*
 * TPR access callback: forwards rip and is_write for cpu_single_env to
 * kvm_tpr_access_report().  The opaque and vcpu arguments are not referenced
 * in the visible lines.
 * NOTE(review): the return statement is not part of this listing.
 */
620 int handle_tpr_access(void *opaque
, int vcpu
,
621 uint64_t rip
, int is_write
)
623 kvm_tpr_access_report(cpu_single_env
, rip
, is_write
);