3 #include "config-host.h"
6 #define KVM_ALLOWED_DEFAULT 1
8 #define KVM_ALLOWED_DEFAULT 0
11 int kvm_allowed
= KVM_ALLOWED_DEFAULT
;
12 static int lm_capable_kernel
;
23 #include <sys/utsname.h>
25 #define MSR_IA32_TSC 0x10
27 extern void perror(const char *s
);
29 kvm_context_t kvm_context
;
30 static struct kvm_msr_list
*kvm_msr_list
;
31 static int kvm_has_msr_star
;
34 extern unsigned int kvm_shadow_memory
;
36 pthread_mutex_t qemu_mutex
= PTHREAD_MUTEX_INITIALIZER
;
37 static __thread CPUState
*vcpu_env
;
39 static sigset_t io_sigset
, io_negsigset
;
43 #define SIG_IPI (SIGRTMIN+4)
54 static void sig_ipi_handler(int n
)
58 void kvm_update_interrupt_request(CPUState
*env
)
60 if (env
&& env
!= vcpu_env
) {
61 if (vcpu_info
[env
->cpu_index
].signalled
)
63 vcpu_info
[env
->cpu_index
].signalled
= 1;
64 if (vcpu_info
[env
->cpu_index
].thread
)
65 pthread_kill(vcpu_info
[env
->cpu_index
].thread
, SIG_IPI
);
69 void kvm_update_after_sipi(CPUState
*env
)
71 vcpu_info
[env
->cpu_index
].sipi_needed
= 1;
72 kvm_update_interrupt_request(env
);
75 * the qemu bios waits using a busy loop that's much too short for
76 * kvm. add a wait after the first sipi.
79 static int first_sipi
= 1;
88 void kvm_apic_init(CPUState
*env
)
90 if (env
->cpu_index
!= 0)
91 vcpu_info
[env
->cpu_index
].init
= 1;
92 kvm_update_interrupt_request(env
);
95 static void set_msr_entry(struct kvm_msr_entry
*entry
, uint32_t index
,
102 /* returns 0 on success, non-0 on failure */
103 static int get_msr_entry(struct kvm_msr_entry
*entry
, CPUState
*env
)
105 switch (entry
->index
) {
106 case MSR_IA32_SYSENTER_CS
:
107 env
->sysenter_cs
= entry
->data
;
109 case MSR_IA32_SYSENTER_ESP
:
110 env
->sysenter_esp
= entry
->data
;
112 case MSR_IA32_SYSENTER_EIP
:
113 env
->sysenter_eip
= entry
->data
;
116 env
->star
= entry
->data
;
120 env
->cstar
= entry
->data
;
122 case MSR_KERNELGSBASE
:
123 env
->kernelgsbase
= entry
->data
;
126 env
->fmask
= entry
->data
;
129 env
->lstar
= entry
->data
;
133 env
->tsc
= entry
->data
;
136 printf("Warning unknown msr index 0x%x\n", entry
->index
);
148 static void set_v8086_seg(struct kvm_segment
*lhs
, const SegmentCache
*rhs
)
150 lhs
->selector
= rhs
->selector
;
151 lhs
->base
= rhs
->base
;
152 lhs
->limit
= rhs
->limit
;
164 static void set_seg(struct kvm_segment
*lhs
, const SegmentCache
*rhs
)
166 unsigned flags
= rhs
->flags
;
167 lhs
->selector
= rhs
->selector
;
168 lhs
->base
= rhs
->base
;
169 lhs
->limit
= rhs
->limit
;
170 lhs
->type
= (flags
>> DESC_TYPE_SHIFT
) & 15;
171 lhs
->present
= (flags
& DESC_P_MASK
) != 0;
172 lhs
->dpl
= rhs
->selector
& 3;
173 lhs
->db
= (flags
>> DESC_B_SHIFT
) & 1;
174 lhs
->s
= (flags
& DESC_S_MASK
) != 0;
175 lhs
->l
= (flags
>> DESC_L_SHIFT
) & 1;
176 lhs
->g
= (flags
& DESC_G_MASK
) != 0;
177 lhs
->avl
= (flags
& DESC_AVL_MASK
) != 0;
181 static void get_seg(SegmentCache
*lhs
, const struct kvm_segment
*rhs
)
183 lhs
->selector
= rhs
->selector
;
184 lhs
->base
= rhs
->base
;
185 lhs
->limit
= rhs
->limit
;
187 (rhs
->type
<< DESC_TYPE_SHIFT
)
188 | (rhs
->present
* DESC_P_MASK
)
189 | (rhs
->dpl
<< DESC_DPL_SHIFT
)
190 | (rhs
->db
<< DESC_B_SHIFT
)
191 | (rhs
->s
* DESC_S_MASK
)
192 | (rhs
->l
<< DESC_L_SHIFT
)
193 | (rhs
->g
* DESC_G_MASK
)
194 | (rhs
->avl
* DESC_AVL_MASK
);
197 /* the reset values of qemu are not compatible with SVM;
198 * this function is used to fix the segment descriptor values */
199 static void fix_realmode_dataseg(struct kvm_segment
*seg
)
206 static void load_regs(CPUState
*env
)
208 struct kvm_regs regs
;
210 struct kvm_sregs sregs
;
211 struct kvm_msr_entry msrs
[MSR_COUNT
];
214 regs
.rax
= env
->regs
[R_EAX
];
215 regs
.rbx
= env
->regs
[R_EBX
];
216 regs
.rcx
= env
->regs
[R_ECX
];
217 regs
.rdx
= env
->regs
[R_EDX
];
218 regs
.rsi
= env
->regs
[R_ESI
];
219 regs
.rdi
= env
->regs
[R_EDI
];
220 regs
.rsp
= env
->regs
[R_ESP
];
221 regs
.rbp
= env
->regs
[R_EBP
];
223 regs
.r8
= env
->regs
[8];
224 regs
.r9
= env
->regs
[9];
225 regs
.r10
= env
->regs
[10];
226 regs
.r11
= env
->regs
[11];
227 regs
.r12
= env
->regs
[12];
228 regs
.r13
= env
->regs
[13];
229 regs
.r14
= env
->regs
[14];
230 regs
.r15
= env
->regs
[15];
233 regs
.rflags
= env
->eflags
;
236 kvm_set_regs(kvm_context
, env
->cpu_index
, ®s
);
238 memset(&fpu
, 0, sizeof fpu
);
239 fpu
.fsw
= env
->fpus
& ~(7 << 11);
240 fpu
.fsw
|= (env
->fpstt
& 7) << 11;
242 for (i
= 0; i
< 8; ++i
)
243 fpu
.ftwx
|= (!env
->fptags
[i
]) << i
;
244 memcpy(fpu
.fpr
, env
->fpregs
, sizeof env
->fpregs
);
245 memcpy(fpu
.xmm
, env
->xmm_regs
, sizeof env
->xmm_regs
);
246 fpu
.mxcsr
= env
->mxcsr
;
247 kvm_set_fpu(kvm_context
, env
->cpu_index
, &fpu
);
249 memcpy(sregs
.interrupt_bitmap
, env
->kvm_interrupt_bitmap
, sizeof(sregs
.interrupt_bitmap
));
251 if ((env
->eflags
& VM_MASK
)) {
252 set_v8086_seg(&sregs
.cs
, &env
->segs
[R_CS
]);
253 set_v8086_seg(&sregs
.ds
, &env
->segs
[R_DS
]);
254 set_v8086_seg(&sregs
.es
, &env
->segs
[R_ES
]);
255 set_v8086_seg(&sregs
.fs
, &env
->segs
[R_FS
]);
256 set_v8086_seg(&sregs
.gs
, &env
->segs
[R_GS
]);
257 set_v8086_seg(&sregs
.ss
, &env
->segs
[R_SS
]);
259 set_seg(&sregs
.cs
, &env
->segs
[R_CS
]);
260 set_seg(&sregs
.ds
, &env
->segs
[R_DS
]);
261 set_seg(&sregs
.es
, &env
->segs
[R_ES
]);
262 set_seg(&sregs
.fs
, &env
->segs
[R_FS
]);
263 set_seg(&sregs
.gs
, &env
->segs
[R_GS
]);
264 set_seg(&sregs
.ss
, &env
->segs
[R_SS
]);
266 if (env
->cr
[0] & CR0_PE_MASK
) {
267 /* force ss cpl to cs cpl */
268 sregs
.ss
.selector
= (sregs
.ss
.selector
& ~3) |
269 (sregs
.cs
.selector
& 3);
270 sregs
.ss
.dpl
= sregs
.ss
.selector
& 3;
273 if (!(env
->cr
[0] & CR0_PG_MASK
)) {
274 fix_realmode_dataseg(&sregs
.cs
);
275 fix_realmode_dataseg(&sregs
.ds
);
276 fix_realmode_dataseg(&sregs
.es
);
277 fix_realmode_dataseg(&sregs
.fs
);
278 fix_realmode_dataseg(&sregs
.gs
);
279 fix_realmode_dataseg(&sregs
.ss
);
283 set_seg(&sregs
.tr
, &env
->tr
);
284 set_seg(&sregs
.ldt
, &env
->ldt
);
286 sregs
.idt
.limit
= env
->idt
.limit
;
287 sregs
.idt
.base
= env
->idt
.base
;
288 sregs
.gdt
.limit
= env
->gdt
.limit
;
289 sregs
.gdt
.base
= env
->gdt
.base
;
291 sregs
.cr0
= env
->cr
[0];
292 sregs
.cr2
= env
->cr
[2];
293 sregs
.cr3
= env
->cr
[3];
294 sregs
.cr4
= env
->cr
[4];
296 sregs
.apic_base
= cpu_get_apic_base(env
);
297 sregs
.efer
= env
->efer
;
298 sregs
.cr8
= cpu_get_apic_tpr(env
);
300 kvm_set_sregs(kvm_context
, env
->cpu_index
, &sregs
);
304 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_CS
, env
->sysenter_cs
);
305 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_ESP
, env
->sysenter_esp
);
306 set_msr_entry(&msrs
[n
++], MSR_IA32_SYSENTER_EIP
, env
->sysenter_eip
);
307 if (kvm_has_msr_star
)
308 set_msr_entry(&msrs
[n
++], MSR_STAR
, env
->star
);
309 set_msr_entry(&msrs
[n
++], MSR_IA32_TSC
, env
->tsc
);
311 if (lm_capable_kernel
) {
312 set_msr_entry(&msrs
[n
++], MSR_CSTAR
, env
->cstar
);
313 set_msr_entry(&msrs
[n
++], MSR_KERNELGSBASE
, env
->kernelgsbase
);
314 set_msr_entry(&msrs
[n
++], MSR_FMASK
, env
->fmask
);
315 set_msr_entry(&msrs
[n
++], MSR_LSTAR
, env
->lstar
);
319 rc
= kvm_set_msrs(kvm_context
, env
->cpu_index
, msrs
, n
);
321 perror("kvm_set_msrs FAILED");
325 static void save_regs(CPUState
*env
)
327 struct kvm_regs regs
;
329 struct kvm_sregs sregs
;
330 struct kvm_msr_entry msrs
[MSR_COUNT
];
334 kvm_get_regs(kvm_context
, env
->cpu_index
, ®s
);
336 env
->regs
[R_EAX
] = regs
.rax
;
337 env
->regs
[R_EBX
] = regs
.rbx
;
338 env
->regs
[R_ECX
] = regs
.rcx
;
339 env
->regs
[R_EDX
] = regs
.rdx
;
340 env
->regs
[R_ESI
] = regs
.rsi
;
341 env
->regs
[R_EDI
] = regs
.rdi
;
342 env
->regs
[R_ESP
] = regs
.rsp
;
343 env
->regs
[R_EBP
] = regs
.rbp
;
345 env
->regs
[8] = regs
.r8
;
346 env
->regs
[9] = regs
.r9
;
347 env
->regs
[10] = regs
.r10
;
348 env
->regs
[11] = regs
.r11
;
349 env
->regs
[12] = regs
.r12
;
350 env
->regs
[13] = regs
.r13
;
351 env
->regs
[14] = regs
.r14
;
352 env
->regs
[15] = regs
.r15
;
355 env
->eflags
= regs
.rflags
;
358 kvm_get_fpu(kvm_context
, env
->cpu_index
, &fpu
);
359 env
->fpstt
= (fpu
.fsw
>> 11) & 7;
362 for (i
= 0; i
< 8; ++i
)
363 env
->fptags
[i
] = !((fpu
.ftwx
>> i
) & 1);
364 memcpy(env
->fpregs
, fpu
.fpr
, sizeof env
->fpregs
);
365 memcpy(env
->xmm_regs
, fpu
.xmm
, sizeof env
->xmm_regs
);
366 env
->mxcsr
= fpu
.mxcsr
;
368 kvm_get_sregs(kvm_context
, env
->cpu_index
, &sregs
);
370 memcpy(env
->kvm_interrupt_bitmap
, sregs
.interrupt_bitmap
, sizeof(env
->kvm_interrupt_bitmap
));
372 get_seg(&env
->segs
[R_CS
], &sregs
.cs
);
373 get_seg(&env
->segs
[R_DS
], &sregs
.ds
);
374 get_seg(&env
->segs
[R_ES
], &sregs
.es
);
375 get_seg(&env
->segs
[R_FS
], &sregs
.fs
);
376 get_seg(&env
->segs
[R_GS
], &sregs
.gs
);
377 get_seg(&env
->segs
[R_SS
], &sregs
.ss
);
379 get_seg(&env
->tr
, &sregs
.tr
);
380 get_seg(&env
->ldt
, &sregs
.ldt
);
382 env
->idt
.limit
= sregs
.idt
.limit
;
383 env
->idt
.base
= sregs
.idt
.base
;
384 env
->gdt
.limit
= sregs
.gdt
.limit
;
385 env
->gdt
.base
= sregs
.gdt
.base
;
387 env
->cr
[0] = sregs
.cr0
;
388 env
->cr
[2] = sregs
.cr2
;
389 env
->cr
[3] = sregs
.cr3
;
390 env
->cr
[4] = sregs
.cr4
;
392 cpu_set_apic_base(env
, sregs
.apic_base
);
394 env
->efer
= sregs
.efer
;
395 //cpu_set_apic_tpr(env, sregs.cr8);
397 #define HFLAG_COPY_MASK ~( \
398 HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
399 HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
400 HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
401 HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
405 hflags
= (env
->segs
[R_CS
].flags
>> DESC_DPL_SHIFT
) & HF_CPL_MASK
;
406 hflags
|= (env
->cr
[0] & CR0_PE_MASK
) << (HF_PE_SHIFT
- CR0_PE_SHIFT
);
407 hflags
|= (env
->cr
[0] << (HF_MP_SHIFT
- CR0_MP_SHIFT
)) &
408 (HF_MP_MASK
| HF_EM_MASK
| HF_TS_MASK
);
409 hflags
|= (env
->eflags
& (HF_TF_MASK
| HF_VM_MASK
| HF_IOPL_MASK
));
410 hflags
|= (env
->cr
[4] & CR4_OSFXSR_MASK
) <<
411 (HF_OSFXSR_SHIFT
- CR4_OSFXSR_SHIFT
);
413 if (env
->efer
& MSR_EFER_LMA
) {
414 hflags
|= HF_LMA_MASK
;
417 if ((hflags
& HF_LMA_MASK
) && (env
->segs
[R_CS
].flags
& DESC_L_MASK
)) {
418 hflags
|= HF_CS32_MASK
| HF_SS32_MASK
| HF_CS64_MASK
;
420 hflags
|= (env
->segs
[R_CS
].flags
& DESC_B_MASK
) >>
421 (DESC_B_SHIFT
- HF_CS32_SHIFT
);
422 hflags
|= (env
->segs
[R_SS
].flags
& DESC_B_MASK
) >>
423 (DESC_B_SHIFT
- HF_SS32_SHIFT
);
424 if (!(env
->cr
[0] & CR0_PE_MASK
) ||
425 (env
->eflags
& VM_MASK
) ||
426 !(hflags
& HF_CS32_MASK
)) {
427 hflags
|= HF_ADDSEG_MASK
;
429 hflags
|= ((env
->segs
[R_DS
].base
|
430 env
->segs
[R_ES
].base
|
431 env
->segs
[R_SS
].base
) != 0) <<
435 env
->hflags
= (env
->hflags
& HFLAG_COPY_MASK
) | hflags
;
436 env
->cc_src
= env
->eflags
& (CC_O
| CC_S
| CC_Z
| CC_A
| CC_P
| CC_C
);
437 env
->df
= 1 - (2 * ((env
->eflags
>> 10) & 1));
438 env
->cc_op
= CC_OP_EFLAGS
;
439 env
->eflags
&= ~(DF_MASK
| CC_O
| CC_S
| CC_Z
| CC_A
| CC_P
| CC_C
);
443 msrs
[n
++].index
= MSR_IA32_SYSENTER_CS
;
444 msrs
[n
++].index
= MSR_IA32_SYSENTER_ESP
;
445 msrs
[n
++].index
= MSR_IA32_SYSENTER_EIP
;
446 if (kvm_has_msr_star
)
447 msrs
[n
++].index
= MSR_STAR
;
448 msrs
[n
++].index
= MSR_IA32_TSC
;
450 if (lm_capable_kernel
) {
451 msrs
[n
++].index
= MSR_CSTAR
;
452 msrs
[n
++].index
= MSR_KERNELGSBASE
;
453 msrs
[n
++].index
= MSR_FMASK
;
454 msrs
[n
++].index
= MSR_LSTAR
;
457 rc
= kvm_get_msrs(kvm_context
, env
->cpu_index
, msrs
, n
);
459 perror("kvm_get_msrs FAILED");
462 n
= rc
; /* actual number of MSRs */
463 for (i
=0 ; i
<n
; i
++) {
464 if (get_msr_entry(&msrs
[i
], env
))
473 static int try_push_interrupts(void *opaque
)
475 CPUState
*env
= cpu_single_env
;
478 if (env
->ready_for_interrupt_injection
&&
479 (env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
480 (env
->eflags
& IF_MASK
)) {
481 env
->interrupt_request
&= ~CPU_INTERRUPT_HARD
;
482 irq
= cpu_get_pic_interrupt(env
);
484 r
= kvm_inject_irq(kvm_context
, env
->cpu_index
, irq
);
486 printf("cpu %d fail inject %x\n", env
->cpu_index
, irq
);
490 return (env
->interrupt_request
& CPU_INTERRUPT_HARD
) != 0;
493 static void post_kvm_run(void *opaque
, int vcpu
)
495 CPUState
*env
= vcpu_env
;
497 pthread_mutex_lock(&qemu_mutex
);
498 cpu_single_env
= env
;
499 env
->eflags
= kvm_get_interrupt_flag(kvm_context
, vcpu
)
500 ? env
->eflags
| IF_MASK
: env
->eflags
& ~IF_MASK
;
501 env
->ready_for_interrupt_injection
502 = kvm_is_ready_for_interrupt_injection(kvm_context
, vcpu
);
504 cpu_set_apic_tpr(env
, kvm_get_cr8(kvm_context
, vcpu
));
505 cpu_set_apic_base(env
, kvm_get_apic_base(kvm_context
, vcpu
));
508 static int pre_kvm_run(void *opaque
, int vcpu
)
510 CPUState
*env
= cpu_single_env
;
512 if (env
->cpu_index
== 0 && wait_hack
) {
517 pthread_mutex_unlock(&qemu_mutex
);
518 for (i
= 0; i
< 10; ++i
)
520 pthread_mutex_lock(&qemu_mutex
);
523 if (!kvm_irqchip_in_kernel(kvm_context
))
524 kvm_set_cr8(kvm_context
, vcpu
, cpu_get_apic_tpr(env
));
525 if (env
->interrupt_request
& CPU_INTERRUPT_EXIT
)
527 pthread_mutex_unlock(&qemu_mutex
);
531 void kvm_load_registers(CPUState
*env
)
537 void kvm_save_registers(CPUState
*env
)
543 int kvm_cpu_exec(CPUState
*env
)
547 r
= kvm_run(kvm_context
, env
->cpu_index
);
549 printf("kvm_run returned %d\n", r
);
556 extern int vm_running
;
558 static int has_work(CPUState
*env
)
562 if (!(env
->hflags
& HF_HALTED_MASK
))
564 if (env
->interrupt_request
& (CPU_INTERRUPT_HARD
| CPU_INTERRUPT_EXIT
))
569 static int kvm_eat_signal(CPUState
*env
, int timeout
)
576 ts
.tv_sec
= timeout
/ 1000;
577 ts
.tv_nsec
= (timeout
% 1000) * 1000000;
578 r
= sigtimedwait(&io_sigset
, &siginfo
, &ts
);
579 if (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
) && !timeout
)
582 pthread_mutex_lock(&qemu_mutex
);
583 cpu_single_env
= vcpu_env
;
584 if (r
== -1 && !(errno
== EAGAIN
|| errno
== EINTR
)) {
585 printf("sigtimedwait: %s\n", strerror(e
));
589 sigaction(siginfo
.si_signo
, NULL
, &sa
);
590 sa
.sa_handler(siginfo
.si_signo
);
593 pthread_mutex_unlock(&qemu_mutex
);
599 static void kvm_eat_signals(CPUState
*env
, int timeout
)
603 while (kvm_eat_signal(env
, 0))
606 r
= kvm_eat_signal(env
, timeout
);
608 while (kvm_eat_signal(env
, 0))
612 * we call select() even if no signal was received, to account for file descriptors
613 * for which there is no signal handler installed.
615 pthread_mutex_lock(&qemu_mutex
);
616 cpu_single_env
= vcpu_env
;
618 pthread_mutex_unlock(&qemu_mutex
);
621 static void kvm_main_loop_wait(CPUState
*env
, int timeout
)
623 pthread_mutex_unlock(&qemu_mutex
);
624 if (env
->cpu_index
== 0)
625 kvm_eat_signals(env
, timeout
);
627 if (!kvm_irqchip_in_kernel(kvm_context
) &&
628 (timeout
|| vcpu_info
[env
->cpu_index
].stopped
)) {
634 sigaddset(&set
, SIG_IPI
);
644 sigaddset(&set
, SIG_IPI
);
645 sigtimedwait(&set
, &siginfo
, &ts
);
647 if (vcpu_info
[env
->cpu_index
].stop
) {
648 vcpu_info
[env
->cpu_index
].stop
= 0;
649 vcpu_info
[env
->cpu_index
].stopped
= 1;
650 pthread_kill(vcpu_info
[0].thread
, SIG_IPI
);
654 pthread_mutex_lock(&qemu_mutex
);
655 cpu_single_env
= env
;
656 vcpu_info
[env
->cpu_index
].signalled
= 0;
659 static int all_threads_paused(void)
663 for (i
= 1; i
< smp_cpus
; ++i
)
664 if (vcpu_info
[i
].stopped
)
669 static void pause_other_threads(void)
673 for (i
= 1; i
< smp_cpus
; ++i
) {
674 vcpu_info
[i
].stop
= 1;
675 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
677 while (!all_threads_paused())
678 kvm_eat_signals(vcpu_env
, 0);
681 static void resume_other_threads(void)
685 for (i
= 1; i
< smp_cpus
; ++i
) {
686 vcpu_info
[i
].stop
= 0;
687 vcpu_info
[i
].stopped
= 0;
688 pthread_kill(vcpu_info
[i
].thread
, SIG_IPI
);
692 static void kvm_vm_state_change_handler(void *context
, int running
)
695 resume_other_threads();
697 pause_other_threads();
700 static void update_regs_for_sipi(CPUState
*env
)
702 SegmentCache cs
= env
->segs
[R_CS
];
705 env
->segs
[R_CS
] = cs
;
708 vcpu_info
[env
->cpu_index
].sipi_needed
= 0;
709 vcpu_info
[env
->cpu_index
].init
= 0;
712 static void update_regs_for_init(CPUState
*env
)
718 static void setup_kernel_sigmask(CPUState
*env
)
722 sigprocmask(SIG_BLOCK
, NULL
, &set
);
723 sigdelset(&set
, SIG_IPI
);
724 if (env
->cpu_index
== 0)
725 sigandset(&set
, &set
, &io_negsigset
);
727 kvm_set_signal_mask(kvm_context
, env
->cpu_index
, &set
);
730 static int kvm_main_loop_cpu(CPUState
*env
)
732 struct vcpu_info
*info
= &vcpu_info
[env
->cpu_index
];
734 setup_kernel_sigmask(env
);
735 pthread_mutex_lock(&qemu_mutex
);
736 cpu_single_env
= env
;
738 while (!has_work(env
))
739 kvm_main_loop_wait(env
, 10);
740 if (env
->interrupt_request
& CPU_INTERRUPT_HARD
)
741 env
->hflags
&= ~HF_HALTED_MASK
;
742 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->sipi_needed
)
743 update_regs_for_sipi(env
);
744 if (!kvm_irqchip_in_kernel(kvm_context
) && info
->init
)
745 update_regs_for_init(env
);
746 if (!(env
->hflags
& HF_HALTED_MASK
) && !info
->init
)
748 env
->interrupt_request
&= ~CPU_INTERRUPT_EXIT
;
749 kvm_main_loop_wait(env
, 0);
750 if (qemu_shutdown_requested())
752 else if (qemu_powerdown_requested())
753 qemu_system_powerdown();
754 else if (qemu_reset_requested()) {
755 env
->interrupt_request
= 0;
760 pthread_mutex_unlock(&qemu_mutex
);
764 static void *ap_main_loop(void *_env
)
766 CPUState
*env
= _env
;
770 sigfillset(&signals
);
771 //sigdelset(&signals, SIG_IPI);
772 sigprocmask(SIG_BLOCK
, &signals
, NULL
);
773 kvm_create_vcpu(kvm_context
, env
->cpu_index
);
774 kvm_qemu_init_env(env
);
775 if (kvm_irqchip_in_kernel(kvm_context
))
776 env
->hflags
&= ~HF_HALTED_MASK
;
777 kvm_main_loop_cpu(env
);
781 static void kvm_add_signal(int signum
)
783 sigaddset(&io_sigset
, signum
);
784 sigdelset(&io_negsigset
, signum
);
785 sigprocmask(SIG_BLOCK
, &io_sigset
, NULL
);
788 int kvm_init_ap(void)
790 CPUState
*env
= first_cpu
->next_cpu
;
793 qemu_add_vm_change_state_handler(kvm_vm_state_change_handler
, NULL
);
794 sigemptyset(&io_sigset
);
795 sigfillset(&io_negsigset
);
796 kvm_add_signal(SIGIO
);
797 kvm_add_signal(SIGALRM
);
798 kvm_add_signal(SIGUSR2
);
799 if (!kvm_irqchip_in_kernel(kvm_context
))
800 kvm_add_signal(SIG_IPI
);
802 vcpu_env
= first_cpu
;
803 signal(SIG_IPI
, sig_ipi_handler
);
804 for (i
= 1; i
< smp_cpus
; ++i
) {
805 pthread_create(&vcpu_info
[i
].thread
, NULL
, ap_main_loop
, env
);
811 int kvm_main_loop(void)
813 vcpu_info
[0].thread
= pthread_self();
814 return kvm_main_loop_cpu(first_cpu
);
817 static int kvm_debug(void *opaque
, int vcpu
)
819 CPUState
*env
= cpu_single_env
;
821 env
->exception_index
= EXCP_DEBUG
;
825 static int kvm_inb(void *opaque
, uint16_t addr
, uint8_t *data
)
827 *data
= cpu_inb(0, addr
);
831 static int kvm_inw(void *opaque
, uint16_t addr
, uint16_t *data
)
833 *data
= cpu_inw(0, addr
);
837 static int kvm_inl(void *opaque
, uint16_t addr
, uint32_t *data
)
839 *data
= cpu_inl(0, addr
);
843 #define PM_IO_BASE 0xb000
845 static int kvm_outb(void *opaque
, uint16_t addr
, uint8_t data
)
850 cpu_outb(0, 0xb3, 0);
857 x
= cpu_inw(0, PM_IO_BASE
+ 4);
859 cpu_outw(0, PM_IO_BASE
+ 4, x
);
866 x
= cpu_inw(0, PM_IO_BASE
+ 4);
868 cpu_outw(0, PM_IO_BASE
+ 4, x
);
876 cpu_outb(0, addr
, data
);
880 static int kvm_outw(void *opaque
, uint16_t addr
, uint16_t data
)
882 cpu_outw(0, addr
, data
);
886 static int kvm_outl(void *opaque
, uint16_t addr
, uint32_t data
)
888 cpu_outl(0, addr
, data
);
892 static int kvm_readb(void *opaque
, uint64_t addr
, uint8_t *data
)
894 *data
= ldub_phys(addr
);
898 static int kvm_readw(void *opaque
, uint64_t addr
, uint16_t *data
)
900 *data
= lduw_phys(addr
);
904 static int kvm_readl(void *opaque
, uint64_t addr
, uint32_t *data
)
906 /* hack: Red Hat 7.1 generates some weird accesses. */
907 if (addr
> 0xa0000 - 4 && addr
< 0xa0000) {
912 *data
= ldl_phys(addr
);
916 static int kvm_readq(void *opaque
, uint64_t addr
, uint64_t *data
)
918 *data
= ldq_phys(addr
);
922 static int kvm_writeb(void *opaque
, uint64_t addr
, uint8_t data
)
924 stb_phys(addr
, data
);
928 static int kvm_writew(void *opaque
, uint64_t addr
, uint16_t data
)
930 stw_phys(addr
, data
);
934 static int kvm_writel(void *opaque
, uint64_t addr
, uint32_t data
)
936 stl_phys(addr
, data
);
940 static int kvm_writeq(void *opaque
, uint64_t addr
, uint64_t data
)
942 stq_phys(addr
, data
);
946 static int kvm_io_window(void *opaque
)
952 static int kvm_halt(void *opaque
, int vcpu
)
954 CPUState
*env
= cpu_single_env
;
956 if (!((env
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
957 (env
->eflags
& IF_MASK
))) {
958 env
->hflags
|= HF_HALTED_MASK
;
959 env
->exception_index
= EXCP_HLT
;
965 static int kvm_shutdown(void *opaque
, int vcpu
)
967 qemu_system_reset_request();
971 static struct kvm_callbacks qemu_kvm_ops
= {
983 .writeb
= kvm_writeb
,
984 .writew
= kvm_writew
,
985 .writel
= kvm_writel
,
986 .writeq
= kvm_writeq
,
988 .shutdown
= kvm_shutdown
,
989 .io_window
= kvm_io_window
,
990 .try_push_interrupts
= try_push_interrupts
,
991 .post_kvm_run
= post_kvm_run
,
992 .pre_kvm_run
= pre_kvm_run
,
997 /* Try to initialize kvm */
998 kvm_context
= kvm_init(&qemu_kvm_ops
, cpu_single_env
);
1006 int kvm_qemu_create_context(void)
1011 kvm_disable_irqchip_creation(kvm_context
);
1013 if (kvm_create(kvm_context
, phys_ram_size
, (void**)&phys_ram_base
) < 0) {
1017 if (kvm_shadow_memory
)
1018 kvm_set_shadow_pages(kvm_context
, kvm_shadow_memory
);
1019 kvm_msr_list
= kvm_get_msr_list(kvm_context
);
1020 if (!kvm_msr_list
) {
1024 for (i
= 0; i
< kvm_msr_list
->nmsrs
; ++i
)
1025 if (kvm_msr_list
->indices
[i
] == MSR_STAR
)
1026 kvm_has_msr_star
= 1;
1030 void kvm_qemu_destroy(void)
1032 kvm_finalize(kvm_context
);
1035 static void host_cpuid(uint32_t function
, uint32_t *eax
, uint32_t *ebx
,
1036 uint32_t *ecx
, uint32_t *edx
)
1043 "sub $128, %%rsp \n\t" /* skip red zone */
1044 "push %0; push %%rsi \n\t"
1045 "push %%rax; push %%rbx; push %%rcx; push %%rdx \n\t"
1046 "mov 8*5(%%rsp), %%rsi \n\t"
1047 "mov (%%rsi), %%eax \n\t"
1049 "mov %%eax, (%%rsi) \n\t"
1050 "mov %%ebx, 4(%%rsi) \n\t"
1051 "mov %%ecx, 8(%%rsi) \n\t"
1052 "mov %%edx, 12(%%rsi) \n\t"
1053 "pop %%rdx; pop %%rcx; pop %%rbx; pop %%rax \n\t"
1054 "pop %%rsi; pop %0 \n\t"
1057 "push %0; push %%esi \n\t"
1058 "push %%eax; push %%ebx; push %%ecx; push %%edx \n\t"
1059 "mov 4*5(%%esp), %%esi \n\t"
1060 "mov (%%esi), %%eax \n\t"
1062 "mov %%eax, (%%esi) \n\t"
1063 "mov %%ebx, 4(%%esi) \n\t"
1064 "mov %%ecx, 8(%%esi) \n\t"
1065 "mov %%edx, 12(%%esi) \n\t"
1066 "pop %%edx; pop %%ecx; pop %%ebx; pop %%eax \n\t"
1067 "pop %%esi; pop %0 \n\t"
1069 : : "rm"(vec
) : "memory");
1080 static void do_cpuid_ent(struct kvm_cpuid_entry
*e
, uint32_t function
,
1083 env
->regs
[R_EAX
] = function
;
1084 qemu_kvm_cpuid_on_env(env
);
1085 e
->function
= function
;
1086 e
->eax
= env
->regs
[R_EAX
];
1087 e
->ebx
= env
->regs
[R_EBX
];
1088 e
->ecx
= env
->regs
[R_ECX
];
1089 e
->edx
= env
->regs
[R_EDX
];
1090 if (function
== 0x80000001) {
1091 uint32_t h_eax
, h_edx
;
1092 struct utsname utsname
;
1094 host_cpuid(function
, &h_eax
, NULL
, NULL
, &h_edx
);
1096 lm_capable_kernel
= strcmp(utsname
.machine
, "x86_64") == 0;
1099 if ((h_edx
& 0x20000000) == 0 || !lm_capable_kernel
)
1100 e
->edx
&= ~0x20000000u
;
1102 if ((h_edx
& 0x00000800) == 0)
1103 e
->edx
&= ~0x00000800u
;
1105 if ((h_edx
& 0x00100000) == 0)
1106 e
->edx
&= ~0x00100000u
;
1108 // sysenter isn't supported in compatibility mode on AMD, and syscall
1109 // isn't supported in compatibility mode on Intel. So advertise the
1110 // actual cpu, and say goodbye to migration between different vendors
1111 // if you use compatibility mode.
1112 if (function
== 0) {
1115 host_cpuid(0, NULL
, &bcd
[0], &bcd
[1], &bcd
[2]);
1122 int kvm_qemu_init_env(CPUState
*cenv
)
1124 struct kvm_cpuid_entry cpuid_ent
[100];
1125 #ifdef KVM_CPUID_SIGNATURE
1126 struct kvm_cpuid_entry
*pv_ent
;
1127 uint32_t signature
[3];
1135 #ifdef KVM_CPUID_SIGNATURE
1136 /* Paravirtualization CPUIDs */
1137 memcpy(signature
, "KVMKVMKVM", 12);
1138 pv_ent
= &cpuid_ent
[cpuid_nent
++];
1139 memset(pv_ent
, 0, sizeof(*pv_ent
));
1140 pv_ent
->function
= KVM_CPUID_SIGNATURE
;
1142 pv_ent
->ebx
= signature
[0];
1143 pv_ent
->ecx
= signature
[1];
1144 pv_ent
->edx
= signature
[2];
1146 pv_ent
= &cpuid_ent
[cpuid_nent
++];
1147 memset(pv_ent
, 0, sizeof(*pv_ent
));
1148 pv_ent
->function
= KVM_CPUID_FEATURES
;
1152 copy
.regs
[R_EAX
] = 0;
1153 qemu_kvm_cpuid_on_env(©
);
1154 limit
= copy
.regs
[R_EAX
];
1156 for (i
= 0; i
<= limit
; ++i
)
1157 do_cpuid_ent(&cpuid_ent
[cpuid_nent
++], i
, ©
);
1159 copy
.regs
[R_EAX
] = 0x80000000;
1160 qemu_kvm_cpuid_on_env(©
);
1161 limit
= copy
.regs
[R_EAX
];
1163 for (i
= 0x80000000; i
<= limit
; ++i
)
1164 do_cpuid_ent(&cpuid_ent
[cpuid_nent
++], i
, ©
);
1166 kvm_setup_cpuid(kvm_context
, cenv
->cpu_index
, cpuid_nent
, cpuid_ent
);
1171 int kvm_update_debugger(CPUState
*env
)
1173 struct kvm_debug_guest dbg
;
1177 if (env
->nb_breakpoints
|| env
->singlestep_enabled
) {
1179 for (i
= 0; i
< 4 && i
< env
->nb_breakpoints
; ++i
) {
1180 dbg
.breakpoints
[i
].enabled
= 1;
1181 dbg
.breakpoints
[i
].address
= env
->breakpoints
[i
];
1183 dbg
.singlestep
= env
->singlestep_enabled
;
1185 return kvm_guest_debug(kvm_context
, env
->cpu_index
, &dbg
);
1190 * dirty pages logging
1192 /* FIXME: use unsigned long pointer instead of unsigned char */
1193 unsigned char *kvm_dirty_bitmap
= NULL
;
1194 int kvm_physical_memory_set_dirty_tracking(int enable
)
1202 if (!kvm_dirty_bitmap
) {
1203 unsigned bitmap_size
= BITMAP_SIZE(phys_ram_size
);
1204 kvm_dirty_bitmap
= qemu_malloc(bitmap_size
);
1205 if (kvm_dirty_bitmap
== NULL
) {
1206 perror("Failed to allocate dirty pages bitmap");
1210 r
= kvm_dirty_pages_log_enable_all(kvm_context
);
1215 if (kvm_dirty_bitmap
) {
1216 r
= kvm_dirty_pages_log_reset(kvm_context
);
1217 qemu_free(kvm_dirty_bitmap
);
1218 kvm_dirty_bitmap
= NULL
;
1224 /* get kvm's dirty pages bitmap and update qemu's */
1225 int kvm_get_dirty_pages_log_slot(int slot
,
1226 unsigned char *bitmap
,
1227 unsigned int offset
,
1231 unsigned int i
, j
, n
=0;
1233 unsigned page_number
, addr
, addr1
;
1235 memset(bitmap
, 0, len
);
1236 r
= kvm_get_dirty_pages(kvm_context
, slot
, bitmap
);
1241 * bitmap-traveling is faster than memory-traveling (for addr...)
1242 * especially when most of the memory is not dirty.
1244 for (i
=0; i
<len
; i
++) {
1249 page_number
= i
* 8 + j
;
1250 addr1
= page_number
* TARGET_PAGE_SIZE
;
1251 addr
= offset
+ addr1
;
1252 cpu_physical_memory_set_dirty(addr
);
1260 * get kvm's dirty pages bitmap and update qemu's
1261 * we only care about physical ram, which resides in slots 0 and 3
1263 int kvm_update_dirty_pages_log(void)
1267 len
= BITMAP_SIZE(0xa0000);
1268 r
= kvm_get_dirty_pages_log_slot(3, kvm_dirty_bitmap
, 0 , len
);
1269 len
= BITMAP_SIZE(phys_ram_size
- 0xc0000);
1270 r
= r
|| kvm_get_dirty_pages_log_slot(0, kvm_dirty_bitmap
, 0xc0000, len
);
1274 int kvm_get_phys_ram_page_bitmap(unsigned char *bitmap
)
1276 int r
=0, len
, offset
;
1278 len
= BITMAP_SIZE(phys_ram_size
);
1279 memset(bitmap
, 0, len
);
1281 r
= kvm_get_mem_map(kvm_context
, 3, bitmap
);
1285 offset
= BITMAP_SIZE(0xc0000);
1286 r
= kvm_get_mem_map(kvm_context
, 0, bitmap
+ offset
);
1292 #ifdef KVM_CAP_IRQCHIP
1294 int kvm_set_irq(int irq
, int level
)
1296 return kvm_set_irq_level(kvm_context
, irq
, level
);