4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
47 #include "qemu/thread.h"
48 #include "sysemu/cpus.h"
49 #include "sysemu/qtest.h"
50 #include "qemu/main-loop.h"
51 #include "qemu/option.h"
52 #include "qemu/bitmap.h"
53 #include "qemu/seqlock.h"
54 #include "qemu/guest-random.h"
57 #include "sysemu/replay.h"
58 #include "sysemu/runstate.h"
59 #include "hw/boards.h"
64 #include <sys/prctl.h>
67 #define PR_MCE_KILL 33
70 #ifndef PR_MCE_KILL_SET
71 #define PR_MCE_KILL_SET 1
74 #ifndef PR_MCE_KILL_EARLY
75 #define PR_MCE_KILL_EARLY 1
78 #endif /* CONFIG_LINUX */
83 /* vcpu throttling controls */
84 static QEMUTimer
*throttle_timer
;
85 static unsigned int throttle_percentage
;
87 #define CPU_THROTTLE_PCT_MIN 1
88 #define CPU_THROTTLE_PCT_MAX 99
89 #define CPU_THROTTLE_TIMESLICE_NS 10000000
91 bool cpu_is_stopped(CPUState
*cpu
)
93 return cpu
->stopped
|| !runstate_is_running();
96 static bool cpu_thread_is_idle(CPUState
*cpu
)
98 if (cpu
->stop
|| cpu
->queued_work_first
) {
101 if (cpu_is_stopped(cpu
)) {
104 if (!cpu
->halted
|| cpu_has_work(cpu
) ||
105 kvm_halt_in_kernel()) {
111 static bool all_cpu_threads_idle(void)
116 if (!cpu_thread_is_idle(cpu
)) {
123 /***********************************************************/
124 /* guest cycle counter */
126 /* Protected by TimersState seqlock */
128 static bool icount_sleep
= true;
129 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
130 #define MAX_ICOUNT_SHIFT 10
132 typedef struct TimersState
{
133 /* Protected by BQL. */
134 int64_t cpu_ticks_prev
;
135 int64_t cpu_ticks_offset
;
137 /* Protect fields that can be respectively read outside the
138 * BQL, and written from multiple threads.
140 QemuSeqLock vm_clock_seqlock
;
141 QemuSpin vm_clock_lock
;
143 int16_t cpu_ticks_enabled
;
145 /* Conversion factor from emulated instructions to virtual clock ticks. */
146 int16_t icount_time_shift
;
148 /* Compensate for varying guest execution speed. */
149 int64_t qemu_icount_bias
;
151 int64_t vm_clock_warp_start
;
152 int64_t cpu_clock_offset
;
154 /* Only written by TCG thread */
157 /* for adjusting icount */
158 QEMUTimer
*icount_rt_timer
;
159 QEMUTimer
*icount_vm_timer
;
160 QEMUTimer
*icount_warp_timer
;
163 static TimersState timers_state
;
167 * We default to false if we know other options have been enabled
168 * which are currently incompatible with MTTCG. Otherwise when each
169 * guest (target) has been updated to support:
170 * - atomic instructions
171 * - memory ordering primitives (barriers)
172 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
174 * Once a guest architecture has been converted to the new primitives
175 * there are two remaining limitations to check.
177 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
178 * - The host must have a stronger memory order than the guest
180 * It may be possible in future to support strong guests on weak hosts
181 * but that will require tagging all load/stores in a guest with their
182 * implicit memory order requirements which would likely slow things
186 static bool check_tcg_memory_orders_compatible(void)
188 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
189 return (TCG_GUEST_DEFAULT_MO
& ~TCG_TARGET_DEFAULT_MO
) == 0;
195 static bool default_mttcg_enabled(void)
197 if (use_icount
|| TCG_OVERSIZED_GUEST
) {
200 #ifdef TARGET_SUPPORTS_MTTCG
201 return check_tcg_memory_orders_compatible();
208 void qemu_tcg_configure(QemuOpts
*opts
, Error
**errp
)
210 const char *t
= qemu_opt_get(opts
, "thread");
212 if (strcmp(t
, "multi") == 0) {
213 if (TCG_OVERSIZED_GUEST
) {
214 error_setg(errp
, "No MTTCG when guest word size > hosts");
215 } else if (use_icount
) {
216 error_setg(errp
, "No MTTCG when icount is enabled");
218 #ifndef TARGET_SUPPORTS_MTTCG
219 warn_report("Guest not yet converted to MTTCG - "
220 "you may get unexpected results");
222 if (!check_tcg_memory_orders_compatible()) {
223 warn_report("Guest expects a stronger memory ordering "
224 "than the host provides");
225 error_printf("This may cause strange/hard to debug errors\n");
227 mttcg_enabled
= true;
229 } else if (strcmp(t
, "single") == 0) {
230 mttcg_enabled
= false;
232 error_setg(errp
, "Invalid 'thread' setting %s", t
);
235 mttcg_enabled
= default_mttcg_enabled();
239 /* The current number of executed instructions is based on what we
240 * originally budgeted minus the current state of the decrementing
241 * icount counters in extra/u16.low.
243 static int64_t cpu_get_icount_executed(CPUState
*cpu
)
245 return (cpu
->icount_budget
-
246 (cpu_neg(cpu
)->icount_decr
.u16
.low
+ cpu
->icount_extra
));
250 * Update the global shared timers_state.qemu_icount to take into
251 * account executed instructions. This is done by the TCG vCPU
252 * thread so the main-loop can see time has moved forward.
254 static void cpu_update_icount_locked(CPUState
*cpu
)
256 int64_t executed
= cpu_get_icount_executed(cpu
);
257 cpu
->icount_budget
-= executed
;
259 atomic_set_i64(&timers_state
.qemu_icount
,
260 timers_state
.qemu_icount
+ executed
);
264 * Update the global shared timers_state.qemu_icount to take into
265 * account executed instructions. This is done by the TCG vCPU
266 * thread so the main-loop can see time has moved forward.
268 void cpu_update_icount(CPUState
*cpu
)
270 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
271 &timers_state
.vm_clock_lock
);
272 cpu_update_icount_locked(cpu
);
273 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
274 &timers_state
.vm_clock_lock
);
277 static int64_t cpu_get_icount_raw_locked(void)
279 CPUState
*cpu
= current_cpu
;
281 if (cpu
&& cpu
->running
) {
282 if (!cpu
->can_do_io
) {
283 error_report("Bad icount read");
286 /* Take into account what has run */
287 cpu_update_icount_locked(cpu
);
289 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
290 return atomic_read_i64(&timers_state
.qemu_icount
);
293 static int64_t cpu_get_icount_locked(void)
295 int64_t icount
= cpu_get_icount_raw_locked();
296 return atomic_read_i64(&timers_state
.qemu_icount_bias
) +
297 cpu_icount_to_ns(icount
);
300 int64_t cpu_get_icount_raw(void)
306 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
307 icount
= cpu_get_icount_raw_locked();
308 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
313 /* Return the virtual CPU time, based on the instruction counter. */
314 int64_t cpu_get_icount(void)
320 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
321 icount
= cpu_get_icount_locked();
322 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
327 int64_t cpu_icount_to_ns(int64_t icount
)
329 return icount
<< atomic_read(&timers_state
.icount_time_shift
);
332 static int64_t cpu_get_ticks_locked(void)
334 int64_t ticks
= timers_state
.cpu_ticks_offset
;
335 if (timers_state
.cpu_ticks_enabled
) {
336 ticks
+= cpu_get_host_ticks();
339 if (timers_state
.cpu_ticks_prev
> ticks
) {
340 /* Non increasing ticks may happen if the host uses software suspend. */
341 timers_state
.cpu_ticks_offset
+= timers_state
.cpu_ticks_prev
- ticks
;
342 ticks
= timers_state
.cpu_ticks_prev
;
345 timers_state
.cpu_ticks_prev
= ticks
;
349 /* return the time elapsed in VM between vm_start and vm_stop. Unless
350 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
353 int64_t cpu_get_ticks(void)
358 return cpu_get_icount();
361 qemu_spin_lock(&timers_state
.vm_clock_lock
);
362 ticks
= cpu_get_ticks_locked();
363 qemu_spin_unlock(&timers_state
.vm_clock_lock
);
367 static int64_t cpu_get_clock_locked(void)
371 time
= timers_state
.cpu_clock_offset
;
372 if (timers_state
.cpu_ticks_enabled
) {
379 /* Return the monotonic time elapsed in VM, i.e.,
380 * the time between vm_start and vm_stop
382 int64_t cpu_get_clock(void)
388 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
389 ti
= cpu_get_clock_locked();
390 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
395 /* enable cpu_get_ticks()
396 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
398 void cpu_enable_ticks(void)
400 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
401 &timers_state
.vm_clock_lock
);
402 if (!timers_state
.cpu_ticks_enabled
) {
403 timers_state
.cpu_ticks_offset
-= cpu_get_host_ticks();
404 timers_state
.cpu_clock_offset
-= get_clock();
405 timers_state
.cpu_ticks_enabled
= 1;
407 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
408 &timers_state
.vm_clock_lock
);
411 /* disable cpu_get_ticks() : the clock is stopped. You must not call
412 * cpu_get_ticks() after that.
413 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
415 void cpu_disable_ticks(void)
417 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
418 &timers_state
.vm_clock_lock
);
419 if (timers_state
.cpu_ticks_enabled
) {
420 timers_state
.cpu_ticks_offset
+= cpu_get_host_ticks();
421 timers_state
.cpu_clock_offset
= cpu_get_clock_locked();
422 timers_state
.cpu_ticks_enabled
= 0;
424 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
425 &timers_state
.vm_clock_lock
);
428 /* Correlation between real and virtual time is always going to be
429 fairly approximate, so ignore small variation.
430 When the guest is idle real and virtual time will be aligned in
432 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
434 static void icount_adjust(void)
440 /* Protected by TimersState mutex. */
441 static int64_t last_delta
;
443 /* If the VM is not running, then do nothing. */
444 if (!runstate_is_running()) {
448 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
449 &timers_state
.vm_clock_lock
);
450 cur_time
= cpu_get_clock_locked();
451 cur_icount
= cpu_get_icount_locked();
453 delta
= cur_icount
- cur_time
;
454 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
456 && last_delta
+ ICOUNT_WOBBLE
< delta
* 2
457 && timers_state
.icount_time_shift
> 0) {
458 /* The guest is getting too far ahead. Slow time down. */
459 atomic_set(&timers_state
.icount_time_shift
,
460 timers_state
.icount_time_shift
- 1);
463 && last_delta
- ICOUNT_WOBBLE
> delta
* 2
464 && timers_state
.icount_time_shift
< MAX_ICOUNT_SHIFT
) {
465 /* The guest is getting too far behind. Speed time up. */
466 atomic_set(&timers_state
.icount_time_shift
,
467 timers_state
.icount_time_shift
+ 1);
470 atomic_set_i64(&timers_state
.qemu_icount_bias
,
471 cur_icount
- (timers_state
.qemu_icount
472 << timers_state
.icount_time_shift
));
473 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
474 &timers_state
.vm_clock_lock
);
477 static void icount_adjust_rt(void *opaque
)
479 timer_mod(timers_state
.icount_rt_timer
,
480 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
484 static void icount_adjust_vm(void *opaque
)
486 timer_mod(timers_state
.icount_vm_timer
,
487 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
488 NANOSECONDS_PER_SECOND
/ 10);
492 static int64_t qemu_icount_round(int64_t count
)
494 int shift
= atomic_read(&timers_state
.icount_time_shift
);
495 return (count
+ (1 << shift
) - 1) >> shift
;
498 static void icount_warp_rt(void)
503 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
504 * changes from -1 to another value, so the race here is okay.
507 seq
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
508 warp_start
= timers_state
.vm_clock_warp_start
;
509 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, seq
));
511 if (warp_start
== -1) {
515 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
516 &timers_state
.vm_clock_lock
);
517 if (runstate_is_running()) {
518 int64_t clock
= REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT
,
519 cpu_get_clock_locked());
522 warp_delta
= clock
- timers_state
.vm_clock_warp_start
;
523 if (use_icount
== 2) {
525 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
526 * far ahead of real time.
528 int64_t cur_icount
= cpu_get_icount_locked();
529 int64_t delta
= clock
- cur_icount
;
530 warp_delta
= MIN(warp_delta
, delta
);
532 atomic_set_i64(&timers_state
.qemu_icount_bias
,
533 timers_state
.qemu_icount_bias
+ warp_delta
);
535 timers_state
.vm_clock_warp_start
= -1;
536 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
537 &timers_state
.vm_clock_lock
);
539 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL
)) {
540 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
544 static void icount_timer_cb(void *opaque
)
546 /* No need for a checkpoint because the timer already synchronizes
547 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
552 void qtest_clock_warp(int64_t dest
)
554 int64_t clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
555 AioContext
*aio_context
;
556 assert(qtest_enabled());
557 aio_context
= qemu_get_aio_context();
558 while (clock
< dest
) {
559 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
,
560 QEMU_TIMER_ATTR_ALL
);
561 int64_t warp
= qemu_soonest_timeout(dest
- clock
, deadline
);
563 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
564 &timers_state
.vm_clock_lock
);
565 atomic_set_i64(&timers_state
.qemu_icount_bias
,
566 timers_state
.qemu_icount_bias
+ warp
);
567 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
568 &timers_state
.vm_clock_lock
);
570 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
571 timerlist_run_timers(aio_context
->tlg
.tl
[QEMU_CLOCK_VIRTUAL
]);
572 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
574 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
577 void qemu_start_warp_timer(void)
586 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
587 * do not fire, so computing the deadline does not make sense.
589 if (!runstate_is_running()) {
593 if (replay_mode
!= REPLAY_MODE_PLAY
) {
594 if (!all_cpu_threads_idle()) {
598 if (qtest_enabled()) {
599 /* When testing, qtest commands advance icount. */
603 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
);
605 /* warp clock deterministically in record/replay mode */
606 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
)) {
607 /* vCPU is sleeping and warp can't be started.
608 It is probably a race condition: notification sent
609 to vCPU was processed in advance and vCPU went to sleep.
610 Therefore we have to wake it up for doing something. */
611 if (replay_has_checkpoint()) {
612 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
618 /* We want to use the earliest deadline from ALL vm_clocks */
619 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
);
620 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
,
621 ~QEMU_TIMER_ATTR_EXTERNAL
);
623 static bool notified
;
624 if (!icount_sleep
&& !notified
) {
625 warn_report("icount sleep disabled and no active timers");
633 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
634 * sleep. Otherwise, the CPU might be waiting for a future timer
635 * interrupt to wake it up, but the interrupt never comes because
636 * the vCPU isn't running any insns and thus doesn't advance the
637 * QEMU_CLOCK_VIRTUAL.
641 * We never let VCPUs sleep in no sleep icount mode.
642 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
643 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
644 * It is useful when we want a deterministic execution time,
645 * isolated from host latencies.
647 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
648 &timers_state
.vm_clock_lock
);
649 atomic_set_i64(&timers_state
.qemu_icount_bias
,
650 timers_state
.qemu_icount_bias
+ deadline
);
651 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
652 &timers_state
.vm_clock_lock
);
653 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
656 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
657 * "real" time, (related to the time left until the next event) has
658 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
659 * This avoids that the warps are visible externally; for example,
660 * you will not be sending network packets continuously instead of
663 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
664 &timers_state
.vm_clock_lock
);
665 if (timers_state
.vm_clock_warp_start
== -1
666 || timers_state
.vm_clock_warp_start
> clock
) {
667 timers_state
.vm_clock_warp_start
= clock
;
669 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
670 &timers_state
.vm_clock_lock
);
671 timer_mod_anticipate(timers_state
.icount_warp_timer
,
674 } else if (deadline
== 0) {
675 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
679 static void qemu_account_warp_timer(void)
681 if (!use_icount
|| !icount_sleep
) {
685 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
686 * do not fire, so computing the deadline does not make sense.
688 if (!runstate_is_running()) {
692 /* warp clock deterministically in record/replay mode */
693 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT
)) {
697 timer_del(timers_state
.icount_warp_timer
);
701 static bool icount_state_needed(void *opaque
)
706 static bool warp_timer_state_needed(void *opaque
)
708 TimersState
*s
= opaque
;
709 return s
->icount_warp_timer
!= NULL
;
712 static bool adjust_timers_state_needed(void *opaque
)
714 TimersState
*s
= opaque
;
715 return s
->icount_rt_timer
!= NULL
;
719 * Subsection for warp timer migration is optional, because the timer may not be created
721 static const VMStateDescription icount_vmstate_warp_timer
= {
722 .name
= "timer/icount/warp_timer",
724 .minimum_version_id
= 1,
725 .needed
= warp_timer_state_needed
,
726 .fields
= (VMStateField
[]) {
727 VMSTATE_INT64(vm_clock_warp_start
, TimersState
),
728 VMSTATE_TIMER_PTR(icount_warp_timer
, TimersState
),
729 VMSTATE_END_OF_LIST()
733 static const VMStateDescription icount_vmstate_adjust_timers
= {
734 .name
= "timer/icount/timers",
736 .minimum_version_id
= 1,
737 .needed
= adjust_timers_state_needed
,
738 .fields
= (VMStateField
[]) {
739 VMSTATE_TIMER_PTR(icount_rt_timer
, TimersState
),
740 VMSTATE_TIMER_PTR(icount_vm_timer
, TimersState
),
741 VMSTATE_END_OF_LIST()
746 * This is a subsection for icount migration.
748 static const VMStateDescription icount_vmstate_timers
= {
749 .name
= "timer/icount",
751 .minimum_version_id
= 1,
752 .needed
= icount_state_needed
,
753 .fields
= (VMStateField
[]) {
754 VMSTATE_INT64(qemu_icount_bias
, TimersState
),
755 VMSTATE_INT64(qemu_icount
, TimersState
),
756 VMSTATE_END_OF_LIST()
758 .subsections
= (const VMStateDescription
*[]) {
759 &icount_vmstate_warp_timer
,
760 &icount_vmstate_adjust_timers
,
765 static const VMStateDescription vmstate_timers
= {
768 .minimum_version_id
= 1,
769 .fields
= (VMStateField
[]) {
770 VMSTATE_INT64(cpu_ticks_offset
, TimersState
),
772 VMSTATE_INT64_V(cpu_clock_offset
, TimersState
, 2),
773 VMSTATE_END_OF_LIST()
775 .subsections
= (const VMStateDescription
*[]) {
776 &icount_vmstate_timers
,
781 static void cpu_throttle_thread(CPUState
*cpu
, run_on_cpu_data opaque
)
784 double throttle_ratio
;
787 if (!cpu_throttle_get_percentage()) {
791 pct
= (double)cpu_throttle_get_percentage()/100;
792 throttle_ratio
= pct
/ (1 - pct
);
793 sleeptime_ns
= (long)(throttle_ratio
* CPU_THROTTLE_TIMESLICE_NS
);
795 qemu_mutex_unlock_iothread();
796 g_usleep(sleeptime_ns
/ 1000); /* Convert ns to us for usleep call */
797 qemu_mutex_lock_iothread();
798 atomic_set(&cpu
->throttle_thread_scheduled
, 0);
801 static void cpu_throttle_timer_tick(void *opaque
)
806 /* Stop the timer if needed */
807 if (!cpu_throttle_get_percentage()) {
811 if (!atomic_xchg(&cpu
->throttle_thread_scheduled
, 1)) {
812 async_run_on_cpu(cpu
, cpu_throttle_thread
,
817 pct
= (double)cpu_throttle_get_percentage()/100;
818 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
819 CPU_THROTTLE_TIMESLICE_NS
/ (1-pct
));
822 void cpu_throttle_set(int new_throttle_pct
)
824 /* Ensure throttle percentage is within valid range */
825 new_throttle_pct
= MIN(new_throttle_pct
, CPU_THROTTLE_PCT_MAX
);
826 new_throttle_pct
= MAX(new_throttle_pct
, CPU_THROTTLE_PCT_MIN
);
828 atomic_set(&throttle_percentage
, new_throttle_pct
);
830 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
831 CPU_THROTTLE_TIMESLICE_NS
);
834 void cpu_throttle_stop(void)
836 atomic_set(&throttle_percentage
, 0);
/* Throttling counts as active whenever the stored percentage is non-zero. */
bool cpu_throttle_active(void)
{
    if (cpu_throttle_get_percentage() == 0) {
        return false;
    }
    return true;
}
844 int cpu_throttle_get_percentage(void)
846 return atomic_read(&throttle_percentage
);
849 void cpu_ticks_init(void)
851 seqlock_init(&timers_state
.vm_clock_seqlock
);
852 qemu_spin_init(&timers_state
.vm_clock_lock
);
853 vmstate_register(NULL
, 0, &vmstate_timers
, &timers_state
);
854 throttle_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
855 cpu_throttle_timer_tick
, NULL
);
858 void configure_icount(QemuOpts
*opts
, Error
**errp
)
861 char *rem_str
= NULL
;
863 option
= qemu_opt_get(opts
, "shift");
865 if (qemu_opt_get(opts
, "align") != NULL
) {
866 error_setg(errp
, "Please specify shift option when using align");
871 icount_sleep
= qemu_opt_get_bool(opts
, "sleep", true);
873 timers_state
.icount_warp_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
874 icount_timer_cb
, NULL
);
877 icount_align_option
= qemu_opt_get_bool(opts
, "align", false);
879 if (icount_align_option
&& !icount_sleep
) {
880 error_setg(errp
, "align=on and sleep=off are incompatible");
882 if (strcmp(option
, "auto") != 0) {
884 timers_state
.icount_time_shift
= strtol(option
, &rem_str
, 0);
885 if (errno
!= 0 || *rem_str
!= '\0' || !strlen(option
)) {
886 error_setg(errp
, "icount: Invalid shift value");
890 } else if (icount_align_option
) {
891 error_setg(errp
, "shift=auto and align=on are incompatible");
892 } else if (!icount_sleep
) {
893 error_setg(errp
, "shift=auto and sleep=off are incompatible");
898 /* 125MIPS seems a reasonable initial guess at the guest speed.
899 It will be corrected fairly quickly anyway. */
900 timers_state
.icount_time_shift
= 3;
902 /* Have both realtime and virtual time triggers for speed adjustment.
903 The realtime trigger catches emulated time passing too slowly,
904 the virtual time trigger catches emulated time passing too fast.
905 Realtime triggers occur even when idle, so use them less frequently
907 timers_state
.vm_clock_warp_start
= -1;
908 timers_state
.icount_rt_timer
= timer_new_ms(QEMU_CLOCK_VIRTUAL_RT
,
909 icount_adjust_rt
, NULL
);
910 timer_mod(timers_state
.icount_rt_timer
,
911 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
912 timers_state
.icount_vm_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
913 icount_adjust_vm
, NULL
);
914 timer_mod(timers_state
.icount_vm_timer
,
915 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
916 NANOSECONDS_PER_SECOND
/ 10);
919 /***********************************************************/
920 /* TCG vCPU kick timer
922 * The kick timer is responsible for moving single threaded vCPU
923 * emulation on to the next vCPU. If more than one vCPU is running a
924 * timer event will force a cpu->exit so the next vCPU can get
927 * The timer is removed if all vCPUs are idle and restarted again once
928 * idleness is complete.
931 static QEMUTimer
*tcg_kick_vcpu_timer
;
932 static CPUState
*tcg_current_rr_cpu
;
934 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
936 static inline int64_t qemu_tcg_next_kick(void)
938 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) + TCG_KICK_PERIOD
;
941 /* Kick the currently round-robin scheduled vCPU */
942 static void qemu_cpu_kick_rr_cpu(void)
946 cpu
= atomic_mb_read(&tcg_current_rr_cpu
);
950 } while (cpu
!= atomic_mb_read(&tcg_current_rr_cpu
));
953 static void do_nothing(CPUState
*cpu
, run_on_cpu_data unused
)
957 void qemu_timer_notify_cb(void *opaque
, QEMUClockType type
)
959 if (!use_icount
|| type
!= QEMU_CLOCK_VIRTUAL
) {
964 if (qemu_in_vcpu_thread()) {
965 /* A CPU is currently running; kick it back out to the
966 * tcg_cpu_exec() loop so it will recalculate its
967 * icount deadline immediately.
969 qemu_cpu_kick(current_cpu
);
970 } else if (first_cpu
) {
971 /* qemu_cpu_kick is not enough to kick a halted CPU out of
972 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
973 * causes cpu_thread_is_idle to return false. This way,
974 * handle_icount_deadline can run.
975 * If we have no CPUs at all for some reason, we don't
976 * need to do anything.
978 async_run_on_cpu(first_cpu
, do_nothing
, RUN_ON_CPU_NULL
);
982 static void kick_tcg_thread(void *opaque
)
984 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
985 qemu_cpu_kick_rr_cpu();
988 static void start_tcg_kick_timer(void)
990 assert(!mttcg_enabled
);
991 if (!tcg_kick_vcpu_timer
&& CPU_NEXT(first_cpu
)) {
992 tcg_kick_vcpu_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
993 kick_tcg_thread
, NULL
);
995 if (tcg_kick_vcpu_timer
&& !timer_pending(tcg_kick_vcpu_timer
)) {
996 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
1000 static void stop_tcg_kick_timer(void)
1002 assert(!mttcg_enabled
);
1003 if (tcg_kick_vcpu_timer
&& timer_pending(tcg_kick_vcpu_timer
)) {
1004 timer_del(tcg_kick_vcpu_timer
);
1008 /***********************************************************/
1009 void hw_error(const char *fmt
, ...)
1015 fprintf(stderr
, "qemu: hardware error: ");
1016 vfprintf(stderr
, fmt
, ap
);
1017 fprintf(stderr
, "\n");
1019 fprintf(stderr
, "CPU #%d:\n", cpu
->cpu_index
);
1020 cpu_dump_state(cpu
, stderr
, CPU_DUMP_FPU
);
1026 void cpu_synchronize_all_states(void)
1031 cpu_synchronize_state(cpu
);
1032 /* TODO: move to cpu_synchronize_state() */
1033 if (hvf_enabled()) {
1034 hvf_cpu_synchronize_state(cpu
);
1039 void cpu_synchronize_all_post_reset(void)
1044 cpu_synchronize_post_reset(cpu
);
1045 /* TODO: move to cpu_synchronize_post_reset() */
1046 if (hvf_enabled()) {
1047 hvf_cpu_synchronize_post_reset(cpu
);
1052 void cpu_synchronize_all_post_init(void)
1057 cpu_synchronize_post_init(cpu
);
1058 /* TODO: move to cpu_synchronize_post_init() */
1059 if (hvf_enabled()) {
1060 hvf_cpu_synchronize_post_init(cpu
);
1065 void cpu_synchronize_all_pre_loadvm(void)
1070 cpu_synchronize_pre_loadvm(cpu
);
1074 static int do_vm_stop(RunState state
, bool send_stop
)
1078 if (runstate_is_running()) {
1079 cpu_disable_ticks();
1081 runstate_set(state
);
1082 vm_state_notify(0, state
);
1084 qapi_event_send_stop();
1089 replay_disable_events();
1090 ret
= bdrv_flush_all();
1095 /* Special vm_stop() variant for terminating the process. Historically clients
1096 * did not expect a QMP STOP event and so we need to retain compatibility.
1098 int vm_shutdown(void)
1100 return do_vm_stop(RUN_STATE_SHUTDOWN
, false);
1103 static bool cpu_can_run(CPUState
*cpu
)
1108 if (cpu_is_stopped(cpu
)) {
1114 static void cpu_handle_guest_debug(CPUState
*cpu
)
1116 gdb_set_stop_cpu(cpu
);
1117 qemu_system_debug_request();
1118 cpu
->stopped
= true;
1122 static void sigbus_reraise(void)
1125 struct sigaction action
;
1127 memset(&action
, 0, sizeof(action
));
1128 action
.sa_handler
= SIG_DFL
;
1129 if (!sigaction(SIGBUS
, &action
, NULL
)) {
1132 sigaddset(&set
, SIGBUS
);
1133 pthread_sigmask(SIG_UNBLOCK
, &set
, NULL
);
1135 perror("Failed to re-raise SIGBUS!\n");
1139 static void sigbus_handler(int n
, siginfo_t
*siginfo
, void *ctx
)
1141 if (siginfo
->si_code
!= BUS_MCEERR_AO
&& siginfo
->si_code
!= BUS_MCEERR_AR
) {
1146 /* Called asynchronously in VCPU thread. */
1147 if (kvm_on_sigbus_vcpu(current_cpu
, siginfo
->si_code
, siginfo
->si_addr
)) {
1151 /* Called synchronously (via signalfd) in main thread. */
1152 if (kvm_on_sigbus(siginfo
->si_code
, siginfo
->si_addr
)) {
1158 static void qemu_init_sigbus(void)
1160 struct sigaction action
;
1162 memset(&action
, 0, sizeof(action
));
1163 action
.sa_flags
= SA_SIGINFO
;
1164 action
.sa_sigaction
= sigbus_handler
;
1165 sigaction(SIGBUS
, &action
, NULL
);
1167 prctl(PR_MCE_KILL
, PR_MCE_KILL_SET
, PR_MCE_KILL_EARLY
, 0, 0);
1169 #else /* !CONFIG_LINUX */
static void qemu_init_sigbus(void)
{
    /* !CONFIG_LINUX build: no SIGBUS setup is performed. */
}
1173 #endif /* !CONFIG_LINUX */
1175 static QemuMutex qemu_global_mutex
;
1177 static QemuThread io_thread
;
1180 static QemuCond qemu_cpu_cond
;
1182 static QemuCond qemu_pause_cond
;
1184 void qemu_init_cpu_loop(void)
1187 qemu_cond_init(&qemu_cpu_cond
);
1188 qemu_cond_init(&qemu_pause_cond
);
1189 qemu_mutex_init(&qemu_global_mutex
);
1191 qemu_thread_get_self(&io_thread
);
1194 void run_on_cpu(CPUState
*cpu
, run_on_cpu_func func
, run_on_cpu_data data
)
1196 do_run_on_cpu(cpu
, func
, data
, &qemu_global_mutex
);
1199 static void qemu_kvm_destroy_vcpu(CPUState
*cpu
)
1201 if (kvm_destroy_vcpu(cpu
) < 0) {
1202 error_report("kvm_destroy_vcpu failed");
1207 static void qemu_tcg_destroy_vcpu(CPUState
*cpu
)
1211 static void qemu_cpu_stop(CPUState
*cpu
, bool exit
)
1213 g_assert(qemu_cpu_is_self(cpu
));
1215 cpu
->stopped
= true;
1219 qemu_cond_broadcast(&qemu_pause_cond
);
1222 static void qemu_wait_io_event_common(CPUState
*cpu
)
1224 atomic_mb_set(&cpu
->thread_kicked
, false);
1226 qemu_cpu_stop(cpu
, false);
1228 process_queued_cpu_work(cpu
);
1231 static void qemu_tcg_rr_wait_io_event(void)
1235 while (all_cpu_threads_idle()) {
1236 stop_tcg_kick_timer();
1237 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1240 start_tcg_kick_timer();
1243 qemu_wait_io_event_common(cpu
);
1247 static void qemu_wait_io_event(CPUState
*cpu
)
1249 while (cpu_thread_is_idle(cpu
)) {
1250 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1254 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1255 if (!tcg_enabled()) {
1259 qemu_wait_io_event_common(cpu
);
1262 static void *qemu_kvm_cpu_thread_fn(void *arg
)
1264 CPUState
*cpu
= arg
;
1267 rcu_register_thread();
1269 qemu_mutex_lock_iothread();
1270 qemu_thread_get_self(cpu
->thread
);
1271 cpu
->thread_id
= qemu_get_thread_id();
1275 r
= kvm_init_vcpu(cpu
);
1277 error_report("kvm_init_vcpu failed: %s", strerror(-r
));
1281 kvm_init_cpu_signals(cpu
);
1283 /* signal CPU creation */
1284 cpu
->created
= true;
1285 qemu_cond_signal(&qemu_cpu_cond
);
1286 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1289 if (cpu_can_run(cpu
)) {
1290 r
= kvm_cpu_exec(cpu
);
1291 if (r
== EXCP_DEBUG
) {
1292 cpu_handle_guest_debug(cpu
);
1295 qemu_wait_io_event(cpu
);
1296 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1298 qemu_kvm_destroy_vcpu(cpu
);
1299 cpu
->created
= false;
1300 qemu_cond_signal(&qemu_cpu_cond
);
1301 qemu_mutex_unlock_iothread();
1302 rcu_unregister_thread();
1306 static void *qemu_dummy_cpu_thread_fn(void *arg
)
1309 error_report("qtest is not supported under Windows");
1312 CPUState
*cpu
= arg
;
1316 rcu_register_thread();
1318 qemu_mutex_lock_iothread();
1319 qemu_thread_get_self(cpu
->thread
);
1320 cpu
->thread_id
= qemu_get_thread_id();
1324 sigemptyset(&waitset
);
1325 sigaddset(&waitset
, SIG_IPI
);
1327 /* signal CPU creation */
1328 cpu
->created
= true;
1329 qemu_cond_signal(&qemu_cpu_cond
);
1330 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1333 qemu_mutex_unlock_iothread();
1336 r
= sigwait(&waitset
, &sig
);
1337 } while (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
));
1342 qemu_mutex_lock_iothread();
1343 qemu_wait_io_event(cpu
);
1344 } while (!cpu
->unplug
);
1346 qemu_mutex_unlock_iothread();
1347 rcu_unregister_thread();
1352 static int64_t tcg_get_icount_limit(void)
1356 if (replay_mode
!= REPLAY_MODE_PLAY
) {
1358 * Include all the timers, because they may need attention.
1359 * Too long CPU execution may create unnecessary delay in UI.
1361 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
,
1362 QEMU_TIMER_ATTR_ALL
);
1364 /* Maintain prior (possibly buggy) behaviour where if no deadline
1365 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1366 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1369 if ((deadline
< 0) || (deadline
> INT32_MAX
)) {
1370 deadline
= INT32_MAX
;
1373 return qemu_icount_round(deadline
);
1375 return replay_get_instructions();
1379 static void handle_icount_deadline(void)
1381 assert(qemu_in_vcpu_thread());
1383 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
,
1384 QEMU_TIMER_ATTR_ALL
);
1386 if (deadline
== 0) {
1387 /* Wake up other AioContexts. */
1388 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
1389 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
1394 static void prepare_icount_for_run(CPUState
*cpu
)
1399 /* These should always be cleared by process_icount_data after
1400 * each vCPU execution. However u16.high can be raised
1401 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1403 g_assert(cpu_neg(cpu
)->icount_decr
.u16
.low
== 0);
1404 g_assert(cpu
->icount_extra
== 0);
1406 cpu
->icount_budget
= tcg_get_icount_limit();
1407 insns_left
= MIN(0xffff, cpu
->icount_budget
);
1408 cpu_neg(cpu
)->icount_decr
.u16
.low
= insns_left
;
1409 cpu
->icount_extra
= cpu
->icount_budget
- insns_left
;
1411 replay_mutex_lock();
1415 static void process_icount_data(CPUState
*cpu
)
1418 /* Account for executed instructions */
1419 cpu_update_icount(cpu
);
1421 /* Reset the counters */
1422 cpu_neg(cpu
)->icount_decr
.u16
.low
= 0;
1423 cpu
->icount_extra
= 0;
1424 cpu
->icount_budget
= 0;
1426 replay_account_executed_instructions();
1428 replay_mutex_unlock();
1433 static int tcg_cpu_exec(CPUState
*cpu
)
1436 #ifdef CONFIG_PROFILER
1440 assert(tcg_enabled());
1441 #ifdef CONFIG_PROFILER
1442 ti
= profile_getclock();
1444 cpu_exec_start(cpu
);
1445 ret
= cpu_exec(cpu
);
1447 #ifdef CONFIG_PROFILER
1448 atomic_set(&tcg_ctx
->prof
.cpu_exec_time
,
1449 tcg_ctx
->prof
.cpu_exec_time
+ profile_getclock() - ti
);
1454 /* Destroy any remaining vCPUs which have been unplugged and have
1457 static void deal_with_unplugged_cpus(void)
1462 if (cpu
->unplug
&& !cpu_can_run(cpu
)) {
1463 qemu_tcg_destroy_vcpu(cpu
);
1464 cpu
->created
= false;
1465 qemu_cond_signal(&qemu_cpu_cond
);
1471 /* Single-threaded TCG
1473 * In the single-threaded case each vCPU is simulated in turn. If
1474 * there is more than a single vCPU we create a simple timer to kick
1475 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1476 * This is done explicitly rather than relying on side-effects
1480 static void *qemu_tcg_rr_cpu_thread_fn(void *arg
)
1482 CPUState
*cpu
= arg
;
1484 assert(tcg_enabled());
1485 rcu_register_thread();
1486 tcg_register_thread();
1488 qemu_mutex_lock_iothread();
1489 qemu_thread_get_self(cpu
->thread
);
1491 cpu
->thread_id
= qemu_get_thread_id();
1492 cpu
->created
= true;
1494 qemu_cond_signal(&qemu_cpu_cond
);
1495 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1497 /* wait for initial kick-off after machine start */
1498 while (first_cpu
->stopped
) {
1499 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1501 /* process any pending work */
1504 qemu_wait_io_event_common(cpu
);
1508 start_tcg_kick_timer();
1512 /* process any pending work */
1513 cpu
->exit_request
= 1;
1516 qemu_mutex_unlock_iothread();
1517 replay_mutex_lock();
1518 qemu_mutex_lock_iothread();
1519 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1520 qemu_account_warp_timer();
1522 /* Run the timers here. This is much more efficient than
1523 * waking up the I/O thread and waiting for completion.
1525 handle_icount_deadline();
1527 replay_mutex_unlock();
1533 while (cpu
&& !cpu
->queued_work_first
&& !cpu
->exit_request
) {
1535 atomic_mb_set(&tcg_current_rr_cpu
, cpu
);
1538 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
,
1539 (cpu
->singlestep_enabled
& SSTEP_NOTIMER
) == 0);
1541 if (cpu_can_run(cpu
)) {
1544 qemu_mutex_unlock_iothread();
1545 prepare_icount_for_run(cpu
);
1547 r
= tcg_cpu_exec(cpu
);
1549 process_icount_data(cpu
);
1550 qemu_mutex_lock_iothread();
1552 if (r
== EXCP_DEBUG
) {
1553 cpu_handle_guest_debug(cpu
);
1555 } else if (r
== EXCP_ATOMIC
) {
1556 qemu_mutex_unlock_iothread();
1557 cpu_exec_step_atomic(cpu
);
1558 qemu_mutex_lock_iothread();
1561 } else if (cpu
->stop
) {
1563 cpu
= CPU_NEXT(cpu
);
1568 cpu
= CPU_NEXT(cpu
);
1569 } /* while (cpu && !cpu->exit_request).. */
1571 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1572 atomic_set(&tcg_current_rr_cpu
, NULL
);
1574 if (cpu
&& cpu
->exit_request
) {
1575 atomic_mb_set(&cpu
->exit_request
, 0);
1578 if (use_icount
&& all_cpu_threads_idle()) {
1580 * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1581 * in the main_loop, wake it up in order to start the warp timer.
1583 qemu_notify_event();
1586 qemu_tcg_rr_wait_io_event();
1587 deal_with_unplugged_cpus();
1590 rcu_unregister_thread();
1594 static void *qemu_hax_cpu_thread_fn(void *arg
)
1596 CPUState
*cpu
= arg
;
1599 rcu_register_thread();
1600 qemu_mutex_lock_iothread();
1601 qemu_thread_get_self(cpu
->thread
);
1603 cpu
->thread_id
= qemu_get_thread_id();
1604 cpu
->created
= true;
1608 qemu_cond_signal(&qemu_cpu_cond
);
1609 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1612 if (cpu_can_run(cpu
)) {
1613 r
= hax_smp_cpu_exec(cpu
);
1614 if (r
== EXCP_DEBUG
) {
1615 cpu_handle_guest_debug(cpu
);
1619 qemu_wait_io_event(cpu
);
1620 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1621 rcu_unregister_thread();
1625 /* The HVF-specific vCPU thread function. This one should only run when the host
1626 * CPU supports the VMX "unrestricted guest" feature. */
1627 static void *qemu_hvf_cpu_thread_fn(void *arg
)
1629 CPUState
*cpu
= arg
;
1633 assert(hvf_enabled());
1635 rcu_register_thread();
1637 qemu_mutex_lock_iothread();
1638 qemu_thread_get_self(cpu
->thread
);
1640 cpu
->thread_id
= qemu_get_thread_id();
1646 /* signal CPU creation */
1647 cpu
->created
= true;
1648 qemu_cond_signal(&qemu_cpu_cond
);
1649 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1652 if (cpu_can_run(cpu
)) {
1653 r
= hvf_vcpu_exec(cpu
);
1654 if (r
== EXCP_DEBUG
) {
1655 cpu_handle_guest_debug(cpu
);
1658 qemu_wait_io_event(cpu
);
1659 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1661 hvf_vcpu_destroy(cpu
);
1662 cpu
->created
= false;
1663 qemu_cond_signal(&qemu_cpu_cond
);
1664 qemu_mutex_unlock_iothread();
1665 rcu_unregister_thread();
1669 static void *qemu_whpx_cpu_thread_fn(void *arg
)
1671 CPUState
*cpu
= arg
;
1674 rcu_register_thread();
1676 qemu_mutex_lock_iothread();
1677 qemu_thread_get_self(cpu
->thread
);
1678 cpu
->thread_id
= qemu_get_thread_id();
1681 r
= whpx_init_vcpu(cpu
);
1683 fprintf(stderr
, "whpx_init_vcpu failed: %s\n", strerror(-r
));
1687 /* signal CPU creation */
1688 cpu
->created
= true;
1689 qemu_cond_signal(&qemu_cpu_cond
);
1690 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1693 if (cpu_can_run(cpu
)) {
1694 r
= whpx_vcpu_exec(cpu
);
1695 if (r
== EXCP_DEBUG
) {
1696 cpu_handle_guest_debug(cpu
);
1699 while (cpu_thread_is_idle(cpu
)) {
1700 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1702 qemu_wait_io_event_common(cpu
);
1703 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1705 whpx_destroy_vcpu(cpu
);
1706 cpu
->created
= false;
1707 qemu_cond_signal(&qemu_cpu_cond
);
1708 qemu_mutex_unlock_iothread();
1709 rcu_unregister_thread();
1714 static void CALLBACK
dummy_apc_func(ULONG_PTR unused
)
1719 /* Multi-threaded TCG
1721 * In the multi-threaded case each vCPU has its own thread. The TLS
1722 * variable current_cpu can be used deep in the code to find the
1723 * current CPUState for a given thread.
1726 static void *qemu_tcg_cpu_thread_fn(void *arg
)
1728 CPUState
*cpu
= arg
;
1730 assert(tcg_enabled());
1731 g_assert(!use_icount
);
1733 rcu_register_thread();
1734 tcg_register_thread();
1736 qemu_mutex_lock_iothread();
1737 qemu_thread_get_self(cpu
->thread
);
1739 cpu
->thread_id
= qemu_get_thread_id();
1740 cpu
->created
= true;
1743 qemu_cond_signal(&qemu_cpu_cond
);
1744 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1746 /* process any pending work */
1747 cpu
->exit_request
= 1;
1750 if (cpu_can_run(cpu
)) {
1752 qemu_mutex_unlock_iothread();
1753 r
= tcg_cpu_exec(cpu
);
1754 qemu_mutex_lock_iothread();
1757 cpu_handle_guest_debug(cpu
);
1760 /* during start-up the vCPU is reset and the thread is
1761 * kicked several times. If we don't ensure we go back
1762 * to sleep in the halted state we won't cleanly
1763 * start-up when the vCPU is enabled.
1765 * cpu->halted should ensure we sleep in wait_io_event
1767 g_assert(cpu
->halted
);
1770 qemu_mutex_unlock_iothread();
1771 cpu_exec_step_atomic(cpu
);
1772 qemu_mutex_lock_iothread();
1774 /* Ignore everything else? */
1779 atomic_mb_set(&cpu
->exit_request
, 0);
1780 qemu_wait_io_event(cpu
);
1781 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1783 qemu_tcg_destroy_vcpu(cpu
);
1784 cpu
->created
= false;
1785 qemu_cond_signal(&qemu_cpu_cond
);
1786 qemu_mutex_unlock_iothread();
1787 rcu_unregister_thread();
1791 static void qemu_cpu_kick_thread(CPUState
*cpu
)
1796 if (cpu
->thread_kicked
) {
1799 cpu
->thread_kicked
= true;
1800 err
= pthread_kill(cpu
->thread
->thread
, SIG_IPI
);
1801 if (err
&& err
!= ESRCH
) {
1802 fprintf(stderr
, "qemu:%s: %s", __func__
, strerror(err
));
1806 if (!qemu_cpu_is_self(cpu
)) {
1807 if (whpx_enabled()) {
1808 whpx_vcpu_kick(cpu
);
1809 } else if (!QueueUserAPC(dummy_apc_func
, cpu
->hThread
, 0)) {
1810 fprintf(stderr
, "%s: QueueUserAPC failed with error %lu\n",
1811 __func__
, GetLastError());
1818 void qemu_cpu_kick(CPUState
*cpu
)
1820 qemu_cond_broadcast(cpu
->halt_cond
);
1821 if (tcg_enabled()) {
1823 /* NOP unless doing single-thread RR */
1824 qemu_cpu_kick_rr_cpu();
1826 if (hax_enabled()) {
1828 * FIXME: race condition with the exit_request check in
1831 cpu
->exit_request
= 1;
1833 qemu_cpu_kick_thread(cpu
);
1837 void qemu_cpu_kick_self(void)
1839 assert(current_cpu
);
1840 qemu_cpu_kick_thread(current_cpu
);
1843 bool qemu_cpu_is_self(CPUState
*cpu
)
1845 return qemu_thread_is_self(cpu
->thread
);
1848 bool qemu_in_vcpu_thread(void)
1850 return current_cpu
&& qemu_cpu_is_self(current_cpu
);
/*
 * Per-thread flag recording whether this thread currently holds
 * qemu_global_mutex.  Static storage starts out zeroed, i.e. false.
 */
static __thread bool iothread_locked;
1855 bool qemu_mutex_iothread_locked(void)
1857 return iothread_locked
;
1861 * The BQL is taken from so many places that it is worth profiling the
1862 * callers directly, instead of funneling them all through a single function.
1864 void qemu_mutex_lock_iothread_impl(const char *file
, int line
)
1866 QemuMutexLockFunc bql_lock
= atomic_read(&qemu_bql_mutex_lock_func
);
1868 g_assert(!qemu_mutex_iothread_locked());
1869 bql_lock(&qemu_global_mutex
, file
, line
);
1870 iothread_locked
= true;
1873 void qemu_mutex_unlock_iothread(void)
1875 g_assert(qemu_mutex_iothread_locked());
1876 iothread_locked
= false;
1877 qemu_mutex_unlock(&qemu_global_mutex
);
1880 static bool all_vcpus_paused(void)
1885 if (!cpu
->stopped
) {
1893 void pause_all_vcpus(void)
1897 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, false);
1899 if (qemu_cpu_is_self(cpu
)) {
1900 qemu_cpu_stop(cpu
, true);
1907 /* We need to drop the replay_lock so any vCPU threads woken up
1908 * can finish their replay tasks
1910 replay_mutex_unlock();
1912 while (!all_vcpus_paused()) {
1913 qemu_cond_wait(&qemu_pause_cond
, &qemu_global_mutex
);
1919 qemu_mutex_unlock_iothread();
1920 replay_mutex_lock();
1921 qemu_mutex_lock_iothread();
1924 void cpu_resume(CPUState
*cpu
)
1927 cpu
->stopped
= false;
1931 void resume_all_vcpus(void)
1935 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, true);
1941 void cpu_remove_sync(CPUState
*cpu
)
1946 qemu_mutex_unlock_iothread();
1947 qemu_thread_join(cpu
->thread
);
1948 qemu_mutex_lock_iothread();
1951 /* For temporary buffers for forming a name */
1952 #define VCPU_THREAD_NAME_SIZE 16
1954 static void qemu_tcg_init_vcpu(CPUState
*cpu
)
1956 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1957 static QemuCond
*single_tcg_halt_cond
;
1958 static QemuThread
*single_tcg_cpu_thread
;
1959 static int tcg_region_inited
;
1961 assert(tcg_enabled());
1963 * Initialize TCG regions--once. Now is a good time, because:
1964 * (1) TCG's init context, prologue and target globals have been set up.
1965 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1966 * -accel flag is processed, so the check doesn't work then).
1968 if (!tcg_region_inited
) {
1969 tcg_region_inited
= 1;
1973 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread
) {
1974 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1975 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1976 qemu_cond_init(cpu
->halt_cond
);
1978 if (qemu_tcg_mttcg_enabled()) {
1979 /* create a thread per vCPU with TCG (MTTCG) */
1980 parallel_cpus
= true;
1981 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/TCG",
1984 qemu_thread_create(cpu
->thread
, thread_name
, qemu_tcg_cpu_thread_fn
,
1985 cpu
, QEMU_THREAD_JOINABLE
);
1988 /* share a single thread for all cpus with TCG */
1989 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "ALL CPUs/TCG");
1990 qemu_thread_create(cpu
->thread
, thread_name
,
1991 qemu_tcg_rr_cpu_thread_fn
,
1992 cpu
, QEMU_THREAD_JOINABLE
);
1994 single_tcg_halt_cond
= cpu
->halt_cond
;
1995 single_tcg_cpu_thread
= cpu
->thread
;
1998 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
2001 /* For non-MTTCG cases we share the thread */
2002 cpu
->thread
= single_tcg_cpu_thread
;
2003 cpu
->halt_cond
= single_tcg_halt_cond
;
2004 cpu
->thread_id
= first_cpu
->thread_id
;
2006 cpu
->created
= true;
2010 static void qemu_hax_start_vcpu(CPUState
*cpu
)
2012 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2014 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2015 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2016 qemu_cond_init(cpu
->halt_cond
);
2018 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HAX",
2020 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hax_cpu_thread_fn
,
2021 cpu
, QEMU_THREAD_JOINABLE
);
2023 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
2027 static void qemu_kvm_start_vcpu(CPUState
*cpu
)
2029 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2031 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2032 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2033 qemu_cond_init(cpu
->halt_cond
);
2034 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/KVM",
2036 qemu_thread_create(cpu
->thread
, thread_name
, qemu_kvm_cpu_thread_fn
,
2037 cpu
, QEMU_THREAD_JOINABLE
);
2040 static void qemu_hvf_start_vcpu(CPUState
*cpu
)
2042 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2044 /* HVF currently does not support TCG, and only runs in
2045 * unrestricted-guest mode. */
2046 assert(hvf_enabled());
2048 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2049 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2050 qemu_cond_init(cpu
->halt_cond
);
2052 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HVF",
2054 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hvf_cpu_thread_fn
,
2055 cpu
, QEMU_THREAD_JOINABLE
);
2058 static void qemu_whpx_start_vcpu(CPUState
*cpu
)
2060 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2062 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2063 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2064 qemu_cond_init(cpu
->halt_cond
);
2065 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/WHPX",
2067 qemu_thread_create(cpu
->thread
, thread_name
, qemu_whpx_cpu_thread_fn
,
2068 cpu
, QEMU_THREAD_JOINABLE
);
2070 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
2074 static void qemu_dummy_start_vcpu(CPUState
*cpu
)
2076 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2078 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2079 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2080 qemu_cond_init(cpu
->halt_cond
);
2081 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/DUMMY",
2083 qemu_thread_create(cpu
->thread
, thread_name
, qemu_dummy_cpu_thread_fn
, cpu
,
2084 QEMU_THREAD_JOINABLE
);
2087 void qemu_init_vcpu(CPUState
*cpu
)
2089 MachineState
*ms
= MACHINE(qdev_get_machine());
2091 cpu
->nr_cores
= ms
->smp
.cores
;
2092 cpu
->nr_threads
= ms
->smp
.threads
;
2093 cpu
->stopped
= true;
2094 cpu
->random_seed
= qemu_guest_random_seed_thread_part1();
2097 /* If the target cpu hasn't set up any address spaces itself,
2098 * give it the default one.
2101 cpu_address_space_init(cpu
, 0, "cpu-memory", cpu
->memory
);
2104 if (kvm_enabled()) {
2105 qemu_kvm_start_vcpu(cpu
);
2106 } else if (hax_enabled()) {
2107 qemu_hax_start_vcpu(cpu
);
2108 } else if (hvf_enabled()) {
2109 qemu_hvf_start_vcpu(cpu
);
2110 } else if (tcg_enabled()) {
2111 qemu_tcg_init_vcpu(cpu
);
2112 } else if (whpx_enabled()) {
2113 qemu_whpx_start_vcpu(cpu
);
2115 qemu_dummy_start_vcpu(cpu
);
2118 while (!cpu
->created
) {
2119 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
2123 void cpu_stop_current(void)
2126 current_cpu
->stop
= true;
2127 cpu_exit(current_cpu
);
2131 int vm_stop(RunState state
)
2133 if (qemu_in_vcpu_thread()) {
2134 qemu_system_vmstop_request_prepare();
2135 qemu_system_vmstop_request(state
);
2137 * FIXME: should not return to device code in case
2138 * vm_stop() has been requested.
2144 return do_vm_stop(state
, true);
2148 * Prepare for (re)starting the VM.
2149 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2150 * running or in case of an error condition), 0 otherwise.
2152 int vm_prepare_start(void)
2156 qemu_vmstop_requested(&requested
);
2157 if (runstate_is_running() && requested
== RUN_STATE__MAX
) {
2161 /* Ensure that a STOP/RESUME pair of events is emitted if a
2162 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2163 * example, according to documentation is always followed by
2166 if (runstate_is_running()) {
2167 qapi_event_send_stop();
2168 qapi_event_send_resume();
2172 /* We are sending this now, but the CPUs will only be resumed shortly afterwards */
2173 qapi_event_send_resume();
2175 replay_enable_events();
2177 runstate_set(RUN_STATE_RUNNING
);
2178 vm_state_notify(1, RUN_STATE_RUNNING
);
2184 if (!vm_prepare_start()) {
2189 /* Does a state transition even if the VM is already stopped;
2190 the current state is forgotten forever. */
2191 int vm_stop_force_state(RunState state
)
2193 if (runstate_is_running()) {
2194 return vm_stop(state
);
2196 runstate_set(state
);
2199 /* Make sure to return an error if the flush in a previous vm_stop()
2201 return bdrv_flush_all();
2205 void list_cpus(const char *optarg
)
2207 /* XXX: implement xxx_cpu_list for targets that still miss it */
2208 #if defined(cpu_list)
2213 void qmp_memsave(int64_t addr
, int64_t size
, const char *filename
,
2214 bool has_cpu
, int64_t cpu_index
, Error
**errp
)
2220 int64_t orig_addr
= addr
, orig_size
= size
;
2226 cpu
= qemu_get_cpu(cpu_index
);
2228 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cpu-index",
2233 f
= fopen(filename
, "wb");
2235 error_setg_file_open(errp
, errno
, filename
);
2243 if (cpu_memory_rw_debug(cpu
, addr
, buf
, l
, 0) != 0) {
2244 error_setg(errp
, "Invalid addr 0x%016" PRIx64
"/size %" PRId64
2245 " specified", orig_addr
, orig_size
);
2248 if (fwrite(buf
, 1, l
, f
) != l
) {
2249 error_setg(errp
, QERR_IO_ERROR
);
2260 void qmp_pmemsave(int64_t addr
, int64_t size
, const char *filename
,
2267 f
= fopen(filename
, "wb");
2269 error_setg_file_open(errp
, errno
, filename
);
2277 cpu_physical_memory_read(addr
, buf
, l
);
2278 if (fwrite(buf
, 1, l
, f
) != l
) {
2279 error_setg(errp
, QERR_IO_ERROR
);
2290 void qmp_inject_nmi(Error
**errp
)
2292 nmi_monitor_handle(monitor_get_cpu_index(), errp
);
2295 void dump_drift_info(void)
2301 qemu_printf("Host - Guest clock %"PRIi64
" ms\n",
2302 (cpu_get_clock() - cpu_get_icount())/SCALE_MS
);
2303 if (icount_align_option
) {
2304 qemu_printf("Max guest delay %"PRIi64
" ms\n",
2305 -max_delay
/ SCALE_MS
);
2306 qemu_printf("Max guest advance %"PRIi64
" ms\n",
2307 max_advance
/ SCALE_MS
);
2309 qemu_printf("Max guest delay NA\n");
2310 qemu_printf("Max guest advance NA\n");