4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "sysemu/sysemu.h"
35 #include "sysemu/block-backend.h"
36 #include "exec/gdbstub.h"
37 #include "sysemu/dma.h"
38 #include "sysemu/hw_accel.h"
39 #include "sysemu/kvm.h"
40 #include "sysemu/hax.h"
41 #include "sysemu/hvf.h"
42 #include "sysemu/whpx.h"
43 #include "exec/exec-all.h"
45 #include "qemu/thread.h"
46 #include "sysemu/cpus.h"
47 #include "sysemu/qtest.h"
48 #include "qemu/main-loop.h"
49 #include "qemu/option.h"
50 #include "qemu/bitmap.h"
51 #include "qemu/seqlock.h"
54 #include "sysemu/replay.h"
55 #include "hw/boards.h"
59 #include <sys/prctl.h>
62 #define PR_MCE_KILL 33
65 #ifndef PR_MCE_KILL_SET
66 #define PR_MCE_KILL_SET 1
69 #ifndef PR_MCE_KILL_EARLY
70 #define PR_MCE_KILL_EARLY 1
73 #endif /* CONFIG_LINUX */
78 /* vcpu throttling controls */
79 static QEMUTimer
*throttle_timer
;
80 static unsigned int throttle_percentage
;
82 #define CPU_THROTTLE_PCT_MIN 1
83 #define CPU_THROTTLE_PCT_MAX 99
84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86 bool cpu_is_stopped(CPUState
*cpu
)
88 return cpu
->stopped
|| !runstate_is_running();
91 static bool cpu_thread_is_idle(CPUState
*cpu
)
93 if (cpu
->stop
|| cpu
->queued_work_first
) {
96 if (cpu_is_stopped(cpu
)) {
99 if (!cpu
->halted
|| cpu_has_work(cpu
) ||
100 kvm_halt_in_kernel()) {
106 static bool all_cpu_threads_idle(void)
111 if (!cpu_thread_is_idle(cpu
)) {
118 /***********************************************************/
119 /* guest cycle counter */
121 /* Protected by TimersState seqlock */
123 static bool icount_sleep
= true;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
127 typedef struct TimersState
{
128 /* Protected by BQL. */
129 int64_t cpu_ticks_prev
;
130 int64_t cpu_ticks_offset
;
132 /* cpu_clock_offset can be read out of BQL, so protect it with
135 QemuSeqLock vm_clock_seqlock
;
136 int64_t cpu_clock_offset
;
137 int32_t cpu_ticks_enabled
;
139 /* Conversion factor from emulated instructions to virtual clock ticks. */
140 int icount_time_shift
;
141 /* Compensate for varying guest execution speed. */
142 int64_t qemu_icount_bias
;
143 /* Only written by TCG thread */
145 /* for adjusting icount */
146 int64_t vm_clock_warp_start
;
147 QEMUTimer
*icount_rt_timer
;
148 QEMUTimer
*icount_vm_timer
;
149 QEMUTimer
*icount_warp_timer
;
152 static TimersState timers_state
;
156 * We default to false if we know other options have been enabled
157 * which are currently incompatible with MTTCG. Otherwise when each
158 * guest (target) has been updated to support:
159 * - atomic instructions
160 * - memory ordering primitives (barriers)
161 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
163 * Once a guest architecture has been converted to the new primitives
164 * there are two remaining limitations to check.
166 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
167 * - The host must have a stronger memory order than the guest
169 * It may be possible in future to support strong guests on weak hosts
170 * but that will require tagging all load/stores in a guest with their
171 * implicit memory order requirements which would likely slow things
175 static bool check_tcg_memory_orders_compatible(void)
177 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
178 return (TCG_GUEST_DEFAULT_MO
& ~TCG_TARGET_DEFAULT_MO
) == 0;
184 static bool default_mttcg_enabled(void)
186 if (use_icount
|| TCG_OVERSIZED_GUEST
) {
189 #ifdef TARGET_SUPPORTS_MTTCG
190 return check_tcg_memory_orders_compatible();
197 void qemu_tcg_configure(QemuOpts
*opts
, Error
**errp
)
199 const char *t
= qemu_opt_get(opts
, "thread");
201 if (strcmp(t
, "multi") == 0) {
202 if (TCG_OVERSIZED_GUEST
) {
203 error_setg(errp
, "No MTTCG when guest word size > hosts");
204 } else if (use_icount
) {
205 error_setg(errp
, "No MTTCG when icount is enabled");
207 #ifndef TARGET_SUPPORTS_MTTCG
208 error_report("Guest not yet converted to MTTCG - "
209 "you may get unexpected results");
211 if (!check_tcg_memory_orders_compatible()) {
212 error_report("Guest expects a stronger memory ordering "
213 "than the host provides");
214 error_printf("This may cause strange/hard to debug errors\n");
216 mttcg_enabled
= true;
218 } else if (strcmp(t
, "single") == 0) {
219 mttcg_enabled
= false;
221 error_setg(errp
, "Invalid 'thread' setting %s", t
);
224 mttcg_enabled
= default_mttcg_enabled();
228 /* The current number of executed instructions is based on what we
229 * originally budgeted minus the current state of the decrementing
230 * icount counters in extra/u16.low.
232 static int64_t cpu_get_icount_executed(CPUState
*cpu
)
234 return cpu
->icount_budget
- (cpu
->icount_decr
.u16
.low
+ cpu
->icount_extra
);
238 * Update the global shared timer_state.qemu_icount to take into
239 * account executed instructions. This is done by the TCG vCPU
240 * thread so the main-loop can see time has moved forward.
242 void cpu_update_icount(CPUState
*cpu
)
244 int64_t executed
= cpu_get_icount_executed(cpu
);
245 cpu
->icount_budget
-= executed
;
247 #ifdef CONFIG_ATOMIC64
248 atomic_set__nocheck(&timers_state
.qemu_icount
,
249 timers_state
.qemu_icount
+ executed
);
250 #else /* FIXME: we need 64bit atomics to do this safely */
251 timers_state
.qemu_icount
+= executed
;
255 static int64_t cpu_get_icount_raw_locked(void)
257 CPUState
*cpu
= current_cpu
;
259 if (cpu
&& cpu
->running
) {
260 if (!cpu
->can_do_io
) {
261 error_report("Bad icount read");
264 /* Take into account what has run */
265 cpu_update_icount(cpu
);
267 /* The read is protected by the seqlock, so __nocheck is okay. */
268 return atomic_read__nocheck(&timers_state
.qemu_icount
);
271 static int64_t cpu_get_icount_locked(void)
273 int64_t icount
= cpu_get_icount_raw_locked();
274 return atomic_read__nocheck(&timers_state
.qemu_icount_bias
) + cpu_icount_to_ns(icount
);
277 int64_t cpu_get_icount_raw(void)
283 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
284 icount
= cpu_get_icount_raw_locked();
285 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
290 /* Return the virtual CPU time, based on the instruction counter. */
291 int64_t cpu_get_icount(void)
297 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
298 icount
= cpu_get_icount_locked();
299 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
304 int64_t cpu_icount_to_ns(int64_t icount
)
306 return icount
<< atomic_read(&timers_state
.icount_time_shift
);
309 /* return the time elapsed in VM between vm_start and vm_stop. Unless
310 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
313 * Caller must hold the BQL
315 int64_t cpu_get_ticks(void)
320 return cpu_get_icount();
323 ticks
= timers_state
.cpu_ticks_offset
;
324 if (timers_state
.cpu_ticks_enabled
) {
325 ticks
+= cpu_get_host_ticks();
328 if (timers_state
.cpu_ticks_prev
> ticks
) {
329 /* Note: non increasing ticks may happen if the host uses
331 timers_state
.cpu_ticks_offset
+= timers_state
.cpu_ticks_prev
- ticks
;
332 ticks
= timers_state
.cpu_ticks_prev
;
335 timers_state
.cpu_ticks_prev
= ticks
;
339 static int64_t cpu_get_clock_locked(void)
343 time
= timers_state
.cpu_clock_offset
;
344 if (timers_state
.cpu_ticks_enabled
) {
351 /* Return the monotonic time elapsed in VM, i.e.,
352 * the time between vm_start and vm_stop
354 int64_t cpu_get_clock(void)
360 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
361 ti
= cpu_get_clock_locked();
362 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
367 /* enable cpu_get_ticks()
368 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
370 void cpu_enable_ticks(void)
372 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
373 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
374 if (!timers_state
.cpu_ticks_enabled
) {
375 timers_state
.cpu_ticks_offset
-= cpu_get_host_ticks();
376 timers_state
.cpu_clock_offset
-= get_clock();
377 timers_state
.cpu_ticks_enabled
= 1;
379 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
382 /* disable cpu_get_ticks() : the clock is stopped. You must not call
383 * cpu_get_ticks() after that.
384 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
386 void cpu_disable_ticks(void)
388 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
389 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
390 if (timers_state
.cpu_ticks_enabled
) {
391 timers_state
.cpu_ticks_offset
+= cpu_get_host_ticks();
392 timers_state
.cpu_clock_offset
= cpu_get_clock_locked();
393 timers_state
.cpu_ticks_enabled
= 0;
395 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
398 /* Correlation between real and virtual time is always going to be
399 fairly approximate, so ignore small variation.
400 When the guest is idle real and virtual time will be aligned in
402 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
404 static void icount_adjust(void)
410 /* Protected by TimersState mutex. */
411 static int64_t last_delta
;
413 /* If the VM is not running, then do nothing. */
414 if (!runstate_is_running()) {
418 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
419 cur_time
= cpu_get_clock_locked();
420 cur_icount
= cpu_get_icount_locked();
422 delta
= cur_icount
- cur_time
;
423 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
425 && last_delta
+ ICOUNT_WOBBLE
< delta
* 2
426 && timers_state
.icount_time_shift
> 0) {
427 /* The guest is getting too far ahead. Slow time down. */
428 atomic_set(&timers_state
.icount_time_shift
,
429 timers_state
.icount_time_shift
- 1);
432 && last_delta
- ICOUNT_WOBBLE
> delta
* 2
433 && timers_state
.icount_time_shift
< MAX_ICOUNT_SHIFT
) {
434 /* The guest is getting too far behind. Speed time up. */
435 atomic_set(&timers_state
.icount_time_shift
,
436 timers_state
.icount_time_shift
+ 1);
439 atomic_set__nocheck(&timers_state
.qemu_icount_bias
,
440 cur_icount
- (timers_state
.qemu_icount
441 << timers_state
.icount_time_shift
));
442 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
445 static void icount_adjust_rt(void *opaque
)
447 timer_mod(timers_state
.icount_rt_timer
,
448 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
452 static void icount_adjust_vm(void *opaque
)
454 timer_mod(timers_state
.icount_vm_timer
,
455 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
456 NANOSECONDS_PER_SECOND
/ 10);
460 static int64_t qemu_icount_round(int64_t count
)
462 int shift
= atomic_read(&timers_state
.icount_time_shift
);
463 return (count
+ (1 << shift
) - 1) >> shift
;
466 static void icount_warp_rt(void)
471 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
472 * changes from -1 to another value, so the race here is okay.
475 seq
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
476 warp_start
= timers_state
.vm_clock_warp_start
;
477 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, seq
));
479 if (warp_start
== -1) {
483 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
484 if (runstate_is_running()) {
485 int64_t clock
= REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT
,
486 cpu_get_clock_locked());
489 warp_delta
= clock
- timers_state
.vm_clock_warp_start
;
490 if (use_icount
== 2) {
492 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
493 * far ahead of real time.
495 int64_t cur_icount
= cpu_get_icount_locked();
496 int64_t delta
= clock
- cur_icount
;
497 warp_delta
= MIN(warp_delta
, delta
);
499 atomic_set__nocheck(&timers_state
.qemu_icount_bias
,
500 timers_state
.qemu_icount_bias
+ warp_delta
);
502 timers_state
.vm_clock_warp_start
= -1;
503 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
505 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL
)) {
506 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
510 static void icount_timer_cb(void *opaque
)
512 /* No need for a checkpoint because the timer already synchronizes
513 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
518 void qtest_clock_warp(int64_t dest
)
520 int64_t clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
521 AioContext
*aio_context
;
522 assert(qtest_enabled());
523 aio_context
= qemu_get_aio_context();
524 while (clock
< dest
) {
525 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
526 int64_t warp
= qemu_soonest_timeout(dest
- clock
, deadline
);
528 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
529 atomic_set__nocheck(&timers_state
.qemu_icount_bias
,
530 timers_state
.qemu_icount_bias
+ warp
);
531 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
533 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
534 timerlist_run_timers(aio_context
->tlg
.tl
[QEMU_CLOCK_VIRTUAL
]);
535 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
537 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
540 void qemu_start_warp_timer(void)
549 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
550 * do not fire, so computing the deadline does not make sense.
552 if (!runstate_is_running()) {
556 /* warp clock deterministically in record/replay mode */
557 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
)) {
561 if (!all_cpu_threads_idle()) {
565 if (qtest_enabled()) {
566 /* When testing, qtest commands advance icount. */
570 /* We want to use the earliest deadline from ALL vm_clocks */
571 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
);
572 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
574 static bool notified
;
575 if (!icount_sleep
&& !notified
) {
576 warn_report("icount sleep disabled and no active timers");
584 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
585 * sleep. Otherwise, the CPU might be waiting for a future timer
586 * interrupt to wake it up, but the interrupt never comes because
587 * the vCPU isn't running any insns and thus doesn't advance the
588 * QEMU_CLOCK_VIRTUAL.
592 * We never let VCPUs sleep in no sleep icount mode.
593 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
594 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
595 * It is useful when we want a deterministic execution time,
596 * isolated from host latencies.
598 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
599 atomic_set__nocheck(&timers_state
.qemu_icount_bias
,
600 timers_state
.qemu_icount_bias
+ deadline
);
601 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
602 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
605 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
606 * "real" time, (related to the time left until the next event) has
607 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
608 * This avoids that the warps are visible externally; for example,
609 * you will not be sending network packets continuously instead of
612 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
613 if (timers_state
.vm_clock_warp_start
== -1
614 || timers_state
.vm_clock_warp_start
> clock
) {
615 timers_state
.vm_clock_warp_start
= clock
;
617 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
618 timer_mod_anticipate(timers_state
.icount_warp_timer
,
621 } else if (deadline
== 0) {
622 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
626 static void qemu_account_warp_timer(void)
628 if (!use_icount
|| !icount_sleep
) {
632 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
633 * do not fire, so computing the deadline does not make sense.
635 if (!runstate_is_running()) {
639 /* warp clock deterministically in record/replay mode */
640 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT
)) {
644 timer_del(timers_state
.icount_warp_timer
);
648 static bool icount_state_needed(void *opaque
)
653 static bool warp_timer_state_needed(void *opaque
)
655 TimersState
*s
= opaque
;
656 return s
->icount_warp_timer
!= NULL
;
659 static bool adjust_timers_state_needed(void *opaque
)
661 TimersState
*s
= opaque
;
662 return s
->icount_rt_timer
!= NULL
;
666 * Subsection for warp timer migration is optional, because may not be created
668 static const VMStateDescription icount_vmstate_warp_timer
= {
669 .name
= "timer/icount/warp_timer",
671 .minimum_version_id
= 1,
672 .needed
= warp_timer_state_needed
,
673 .fields
= (VMStateField
[]) {
674 VMSTATE_INT64(vm_clock_warp_start
, TimersState
),
675 VMSTATE_TIMER_PTR(icount_warp_timer
, TimersState
),
676 VMSTATE_END_OF_LIST()
680 static const VMStateDescription icount_vmstate_adjust_timers
= {
681 .name
= "timer/icount/timers",
683 .minimum_version_id
= 1,
684 .needed
= adjust_timers_state_needed
,
685 .fields
= (VMStateField
[]) {
686 VMSTATE_TIMER_PTR(icount_rt_timer
, TimersState
),
687 VMSTATE_TIMER_PTR(icount_vm_timer
, TimersState
),
688 VMSTATE_END_OF_LIST()
693 * This is a subsection for icount migration.
695 static const VMStateDescription icount_vmstate_timers
= {
696 .name
= "timer/icount",
698 .minimum_version_id
= 1,
699 .needed
= icount_state_needed
,
700 .fields
= (VMStateField
[]) {
701 VMSTATE_INT64(qemu_icount_bias
, TimersState
),
702 VMSTATE_INT64(qemu_icount
, TimersState
),
703 VMSTATE_END_OF_LIST()
705 .subsections
= (const VMStateDescription
*[]) {
706 &icount_vmstate_warp_timer
,
707 &icount_vmstate_adjust_timers
,
712 static const VMStateDescription vmstate_timers
= {
715 .minimum_version_id
= 1,
716 .fields
= (VMStateField
[]) {
717 VMSTATE_INT64(cpu_ticks_offset
, TimersState
),
719 VMSTATE_INT64_V(cpu_clock_offset
, TimersState
, 2),
720 VMSTATE_END_OF_LIST()
722 .subsections
= (const VMStateDescription
*[]) {
723 &icount_vmstate_timers
,
728 static void cpu_throttle_thread(CPUState
*cpu
, run_on_cpu_data opaque
)
731 double throttle_ratio
;
734 if (!cpu_throttle_get_percentage()) {
738 pct
= (double)cpu_throttle_get_percentage()/100;
739 throttle_ratio
= pct
/ (1 - pct
);
740 sleeptime_ns
= (long)(throttle_ratio
* CPU_THROTTLE_TIMESLICE_NS
);
742 qemu_mutex_unlock_iothread();
743 g_usleep(sleeptime_ns
/ 1000); /* Convert ns to us for usleep call */
744 qemu_mutex_lock_iothread();
745 atomic_set(&cpu
->throttle_thread_scheduled
, 0);
748 static void cpu_throttle_timer_tick(void *opaque
)
753 /* Stop the timer if needed */
754 if (!cpu_throttle_get_percentage()) {
758 if (!atomic_xchg(&cpu
->throttle_thread_scheduled
, 1)) {
759 async_run_on_cpu(cpu
, cpu_throttle_thread
,
764 pct
= (double)cpu_throttle_get_percentage()/100;
765 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
766 CPU_THROTTLE_TIMESLICE_NS
/ (1-pct
));
769 void cpu_throttle_set(int new_throttle_pct
)
771 /* Ensure throttle percentage is within valid range */
772 new_throttle_pct
= MIN(new_throttle_pct
, CPU_THROTTLE_PCT_MAX
);
773 new_throttle_pct
= MAX(new_throttle_pct
, CPU_THROTTLE_PCT_MIN
);
775 atomic_set(&throttle_percentage
, new_throttle_pct
);
777 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
778 CPU_THROTTLE_TIMESLICE_NS
);
781 void cpu_throttle_stop(void)
783 atomic_set(&throttle_percentage
, 0);
786 bool cpu_throttle_active(void)
788 return (cpu_throttle_get_percentage() != 0);
791 int cpu_throttle_get_percentage(void)
793 return atomic_read(&throttle_percentage
);
796 void cpu_ticks_init(void)
798 seqlock_init(&timers_state
.vm_clock_seqlock
);
799 vmstate_register(NULL
, 0, &vmstate_timers
, &timers_state
);
800 throttle_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
801 cpu_throttle_timer_tick
, NULL
);
804 void configure_icount(QemuOpts
*opts
, Error
**errp
)
807 char *rem_str
= NULL
;
809 option
= qemu_opt_get(opts
, "shift");
811 if (qemu_opt_get(opts
, "align") != NULL
) {
812 error_setg(errp
, "Please specify shift option when using align");
817 icount_sleep
= qemu_opt_get_bool(opts
, "sleep", true);
819 timers_state
.icount_warp_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
820 icount_timer_cb
, NULL
);
823 icount_align_option
= qemu_opt_get_bool(opts
, "align", false);
825 if (icount_align_option
&& !icount_sleep
) {
826 error_setg(errp
, "align=on and sleep=off are incompatible");
828 if (strcmp(option
, "auto") != 0) {
830 timers_state
.icount_time_shift
= strtol(option
, &rem_str
, 0);
831 if (errno
!= 0 || *rem_str
!= '\0' || !strlen(option
)) {
832 error_setg(errp
, "icount: Invalid shift value");
836 } else if (icount_align_option
) {
837 error_setg(errp
, "shift=auto and align=on are incompatible");
838 } else if (!icount_sleep
) {
839 error_setg(errp
, "shift=auto and sleep=off are incompatible");
844 /* 125MIPS seems a reasonable initial guess at the guest speed.
845 It will be corrected fairly quickly anyway. */
846 timers_state
.icount_time_shift
= 3;
848 /* Have both realtime and virtual time triggers for speed adjustment.
849 The realtime trigger catches emulated time passing too slowly,
850 the virtual time trigger catches emulated time passing too fast.
851 Realtime triggers occur even when idle, so use them less frequently
853 timers_state
.vm_clock_warp_start
= -1;
854 timers_state
.icount_rt_timer
= timer_new_ms(QEMU_CLOCK_VIRTUAL_RT
,
855 icount_adjust_rt
, NULL
);
856 timer_mod(timers_state
.icount_rt_timer
,
857 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
858 timers_state
.icount_vm_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
859 icount_adjust_vm
, NULL
);
860 timer_mod(timers_state
.icount_vm_timer
,
861 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
862 NANOSECONDS_PER_SECOND
/ 10);
865 /***********************************************************/
866 /* TCG vCPU kick timer
868 * The kick timer is responsible for moving single threaded vCPU
869 * emulation on to the next vCPU. If more than one vCPU is running a
870 * timer event with force a cpu->exit so the next vCPU can get
873 * The timer is removed if all vCPUs are idle and restarted again once
874 * idleness is complete.
877 static QEMUTimer
*tcg_kick_vcpu_timer
;
878 static CPUState
*tcg_current_rr_cpu
;
880 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
882 static inline int64_t qemu_tcg_next_kick(void)
884 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) + TCG_KICK_PERIOD
;
887 /* Kick the currently round-robin scheduled vCPU */
888 static void qemu_cpu_kick_rr_cpu(void)
892 cpu
= atomic_mb_read(&tcg_current_rr_cpu
);
896 } while (cpu
!= atomic_mb_read(&tcg_current_rr_cpu
));
899 static void do_nothing(CPUState
*cpu
, run_on_cpu_data unused
)
903 void qemu_timer_notify_cb(void *opaque
, QEMUClockType type
)
905 if (!use_icount
|| type
!= QEMU_CLOCK_VIRTUAL
) {
910 if (qemu_in_vcpu_thread()) {
911 /* A CPU is currently running; kick it back out to the
912 * tcg_cpu_exec() loop so it will recalculate its
913 * icount deadline immediately.
915 qemu_cpu_kick(current_cpu
);
916 } else if (first_cpu
) {
917 /* qemu_cpu_kick is not enough to kick a halted CPU out of
918 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
919 * causes cpu_thread_is_idle to return false. This way,
920 * handle_icount_deadline can run.
921 * If we have no CPUs at all for some reason, we don't
922 * need to do anything.
924 async_run_on_cpu(first_cpu
, do_nothing
, RUN_ON_CPU_NULL
);
928 static void kick_tcg_thread(void *opaque
)
930 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
931 qemu_cpu_kick_rr_cpu();
934 static void start_tcg_kick_timer(void)
936 assert(!mttcg_enabled
);
937 if (!tcg_kick_vcpu_timer
&& CPU_NEXT(first_cpu
)) {
938 tcg_kick_vcpu_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
939 kick_tcg_thread
, NULL
);
940 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
944 static void stop_tcg_kick_timer(void)
946 assert(!mttcg_enabled
);
947 if (tcg_kick_vcpu_timer
) {
948 timer_del(tcg_kick_vcpu_timer
);
949 tcg_kick_vcpu_timer
= NULL
;
953 /***********************************************************/
954 void hw_error(const char *fmt
, ...)
960 fprintf(stderr
, "qemu: hardware error: ");
961 vfprintf(stderr
, fmt
, ap
);
962 fprintf(stderr
, "\n");
964 fprintf(stderr
, "CPU #%d:\n", cpu
->cpu_index
);
965 cpu_dump_state(cpu
, stderr
, fprintf
, CPU_DUMP_FPU
);
971 void cpu_synchronize_all_states(void)
976 cpu_synchronize_state(cpu
);
977 /* TODO: move to cpu_synchronize_state() */
979 hvf_cpu_synchronize_state(cpu
);
984 void cpu_synchronize_all_post_reset(void)
989 cpu_synchronize_post_reset(cpu
);
990 /* TODO: move to cpu_synchronize_post_reset() */
992 hvf_cpu_synchronize_post_reset(cpu
);
997 void cpu_synchronize_all_post_init(void)
1002 cpu_synchronize_post_init(cpu
);
1003 /* TODO: move to cpu_synchronize_post_init() */
1004 if (hvf_enabled()) {
1005 hvf_cpu_synchronize_post_init(cpu
);
1010 void cpu_synchronize_all_pre_loadvm(void)
1015 cpu_synchronize_pre_loadvm(cpu
);
1019 static int do_vm_stop(RunState state
, bool send_stop
)
1023 if (runstate_is_running()) {
1024 cpu_disable_ticks();
1026 runstate_set(state
);
1027 vm_state_notify(0, state
);
1029 qapi_event_send_stop(&error_abort
);
1034 replay_disable_events();
1035 ret
= bdrv_flush_all();
1040 /* Special vm_stop() variant for terminating the process. Historically clients
1041 * did not expect a QMP STOP event and so we need to retain compatibility.
1043 int vm_shutdown(void)
1045 return do_vm_stop(RUN_STATE_SHUTDOWN
, false);
1048 static bool cpu_can_run(CPUState
*cpu
)
1053 if (cpu_is_stopped(cpu
)) {
1059 static void cpu_handle_guest_debug(CPUState
*cpu
)
1061 gdb_set_stop_cpu(cpu
);
1062 qemu_system_debug_request();
1063 cpu
->stopped
= true;
1067 static void sigbus_reraise(void)
1070 struct sigaction action
;
1072 memset(&action
, 0, sizeof(action
));
1073 action
.sa_handler
= SIG_DFL
;
1074 if (!sigaction(SIGBUS
, &action
, NULL
)) {
1077 sigaddset(&set
, SIGBUS
);
1078 pthread_sigmask(SIG_UNBLOCK
, &set
, NULL
);
1080 perror("Failed to re-raise SIGBUS!\n");
1084 static void sigbus_handler(int n
, siginfo_t
*siginfo
, void *ctx
)
1086 if (siginfo
->si_code
!= BUS_MCEERR_AO
&& siginfo
->si_code
!= BUS_MCEERR_AR
) {
1091 /* Called asynchronously in VCPU thread. */
1092 if (kvm_on_sigbus_vcpu(current_cpu
, siginfo
->si_code
, siginfo
->si_addr
)) {
1096 /* Called synchronously (via signalfd) in main thread. */
1097 if (kvm_on_sigbus(siginfo
->si_code
, siginfo
->si_addr
)) {
1103 static void qemu_init_sigbus(void)
1105 struct sigaction action
;
1107 memset(&action
, 0, sizeof(action
));
1108 action
.sa_flags
= SA_SIGINFO
;
1109 action
.sa_sigaction
= sigbus_handler
;
1110 sigaction(SIGBUS
, &action
, NULL
);
1112 prctl(PR_MCE_KILL
, PR_MCE_KILL_SET
, PR_MCE_KILL_EARLY
, 0, 0);
1114 #else /* !CONFIG_LINUX */
1115 static void qemu_init_sigbus(void)
1118 #endif /* !CONFIG_LINUX */
1120 static QemuMutex qemu_global_mutex
;
1122 static QemuThread io_thread
;
1125 static QemuCond qemu_cpu_cond
;
1127 static QemuCond qemu_pause_cond
;
1129 void qemu_init_cpu_loop(void)
1132 qemu_cond_init(&qemu_cpu_cond
);
1133 qemu_cond_init(&qemu_pause_cond
);
1134 qemu_mutex_init(&qemu_global_mutex
);
1136 qemu_thread_get_self(&io_thread
);
1139 void run_on_cpu(CPUState
*cpu
, run_on_cpu_func func
, run_on_cpu_data data
)
1141 do_run_on_cpu(cpu
, func
, data
, &qemu_global_mutex
);
1144 static void qemu_kvm_destroy_vcpu(CPUState
*cpu
)
1146 if (kvm_destroy_vcpu(cpu
) < 0) {
1147 error_report("kvm_destroy_vcpu failed");
1152 static void qemu_tcg_destroy_vcpu(CPUState
*cpu
)
1156 static void qemu_cpu_stop(CPUState
*cpu
, bool exit
)
1158 g_assert(qemu_cpu_is_self(cpu
));
1160 cpu
->stopped
= true;
1164 qemu_cond_broadcast(&qemu_pause_cond
);
1167 static void qemu_wait_io_event_common(CPUState
*cpu
)
1169 atomic_mb_set(&cpu
->thread_kicked
, false);
1171 qemu_cpu_stop(cpu
, false);
1173 process_queued_cpu_work(cpu
);
1176 static void qemu_tcg_rr_wait_io_event(CPUState
*cpu
)
1178 while (all_cpu_threads_idle()) {
1179 stop_tcg_kick_timer();
1180 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1183 start_tcg_kick_timer();
1185 qemu_wait_io_event_common(cpu
);
1188 static void qemu_wait_io_event(CPUState
*cpu
)
1190 while (cpu_thread_is_idle(cpu
)) {
1191 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1195 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1196 if (!tcg_enabled()) {
1200 qemu_wait_io_event_common(cpu
);
1203 static void *qemu_kvm_cpu_thread_fn(void *arg
)
1205 CPUState
*cpu
= arg
;
1208 rcu_register_thread();
1210 qemu_mutex_lock_iothread();
1211 qemu_thread_get_self(cpu
->thread
);
1212 cpu
->thread_id
= qemu_get_thread_id();
1216 r
= kvm_init_vcpu(cpu
);
1218 error_report("kvm_init_vcpu failed: %s", strerror(-r
));
1222 kvm_init_cpu_signals(cpu
);
1224 /* signal CPU creation */
1225 cpu
->created
= true;
1226 qemu_cond_signal(&qemu_cpu_cond
);
1229 if (cpu_can_run(cpu
)) {
1230 r
= kvm_cpu_exec(cpu
);
1231 if (r
== EXCP_DEBUG
) {
1232 cpu_handle_guest_debug(cpu
);
1235 qemu_wait_io_event(cpu
);
1236 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1238 qemu_kvm_destroy_vcpu(cpu
);
1239 cpu
->created
= false;
1240 qemu_cond_signal(&qemu_cpu_cond
);
1241 qemu_mutex_unlock_iothread();
1242 rcu_unregister_thread();
1246 static void *qemu_dummy_cpu_thread_fn(void *arg
)
1249 error_report("qtest is not supported under Windows");
1252 CPUState
*cpu
= arg
;
1256 rcu_register_thread();
1258 qemu_mutex_lock_iothread();
1259 qemu_thread_get_self(cpu
->thread
);
1260 cpu
->thread_id
= qemu_get_thread_id();
1264 sigemptyset(&waitset
);
1265 sigaddset(&waitset
, SIG_IPI
);
1267 /* signal CPU creation */
1268 cpu
->created
= true;
1269 qemu_cond_signal(&qemu_cpu_cond
);
1272 qemu_mutex_unlock_iothread();
1275 r
= sigwait(&waitset
, &sig
);
1276 } while (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
));
1281 qemu_mutex_lock_iothread();
1282 qemu_wait_io_event(cpu
);
1283 } while (!cpu
->unplug
);
1285 rcu_unregister_thread();
1290 static int64_t tcg_get_icount_limit(void)
1294 if (replay_mode
!= REPLAY_MODE_PLAY
) {
1295 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1297 /* Maintain prior (possibly buggy) behaviour where if no deadline
1298 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1299 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1302 if ((deadline
< 0) || (deadline
> INT32_MAX
)) {
1303 deadline
= INT32_MAX
;
1306 return qemu_icount_round(deadline
);
1308 return replay_get_instructions();
1312 static void handle_icount_deadline(void)
1314 assert(qemu_in_vcpu_thread());
1317 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1319 if (deadline
== 0) {
1320 /* Wake up other AioContexts. */
1321 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
1322 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
1327 static void prepare_icount_for_run(CPUState
*cpu
)
1332 /* These should always be cleared by process_icount_data after
1333 * each vCPU execution. However u16.high can be raised
1334 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1336 g_assert(cpu
->icount_decr
.u16
.low
== 0);
1337 g_assert(cpu
->icount_extra
== 0);
1339 cpu
->icount_budget
= tcg_get_icount_limit();
1340 insns_left
= MIN(0xffff, cpu
->icount_budget
);
1341 cpu
->icount_decr
.u16
.low
= insns_left
;
1342 cpu
->icount_extra
= cpu
->icount_budget
- insns_left
;
1344 replay_mutex_lock();
1348 static void process_icount_data(CPUState
*cpu
)
1351 /* Account for executed instructions */
1352 cpu_update_icount(cpu
);
1354 /* Reset the counters */
1355 cpu
->icount_decr
.u16
.low
= 0;
1356 cpu
->icount_extra
= 0;
1357 cpu
->icount_budget
= 0;
1359 replay_account_executed_instructions();
1361 replay_mutex_unlock();
1366 static int tcg_cpu_exec(CPUState
*cpu
)
1369 #ifdef CONFIG_PROFILER
1373 assert(tcg_enabled());
1374 #ifdef CONFIG_PROFILER
1375 ti
= profile_getclock();
1377 cpu_exec_start(cpu
);
1378 ret
= cpu_exec(cpu
);
1380 #ifdef CONFIG_PROFILER
1381 tcg_time
+= profile_getclock() - ti
;
1386 /* Destroy any remaining vCPUs which have been unplugged and have
1389 static void deal_with_unplugged_cpus(void)
1394 if (cpu
->unplug
&& !cpu_can_run(cpu
)) {
1395 qemu_tcg_destroy_vcpu(cpu
);
1396 cpu
->created
= false;
1397 qemu_cond_signal(&qemu_cpu_cond
);
1403 /* Single-threaded TCG
1405 * In the single-threaded case each vCPU is simulated in turn. If
1406 * there is more than a single vCPU we create a simple timer to kick
1407 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1408 * This is done explicitly rather than relying on side-effects
1412 static void *qemu_tcg_rr_cpu_thread_fn(void *arg
)
1414 CPUState
*cpu
= arg
;
1416 assert(tcg_enabled());
1417 rcu_register_thread();
1418 tcg_register_thread();
1420 qemu_mutex_lock_iothread();
1421 qemu_thread_get_self(cpu
->thread
);
1423 cpu
->thread_id
= qemu_get_thread_id();
1424 cpu
->created
= true;
1426 qemu_cond_signal(&qemu_cpu_cond
);
1428 /* wait for initial kick-off after machine start */
1429 while (first_cpu
->stopped
) {
1430 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1432 /* process any pending work */
1435 qemu_wait_io_event_common(cpu
);
1439 start_tcg_kick_timer();
1443 /* process any pending work */
1444 cpu
->exit_request
= 1;
1447 qemu_mutex_unlock_iothread();
1448 replay_mutex_lock();
1449 qemu_mutex_lock_iothread();
1450 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1451 qemu_account_warp_timer();
1453 /* Run the timers here. This is much more efficient than
1454 * waking up the I/O thread and waiting for completion.
1456 handle_icount_deadline();
1458 replay_mutex_unlock();
1464 while (cpu
&& !cpu
->queued_work_first
&& !cpu
->exit_request
) {
1466 atomic_mb_set(&tcg_current_rr_cpu
, cpu
);
1469 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
,
1470 (cpu
->singlestep_enabled
& SSTEP_NOTIMER
) == 0);
1472 if (cpu_can_run(cpu
)) {
1475 qemu_mutex_unlock_iothread();
1476 prepare_icount_for_run(cpu
);
1478 r
= tcg_cpu_exec(cpu
);
1480 process_icount_data(cpu
);
1481 qemu_mutex_lock_iothread();
1483 if (r
== EXCP_DEBUG
) {
1484 cpu_handle_guest_debug(cpu
);
1486 } else if (r
== EXCP_ATOMIC
) {
1487 qemu_mutex_unlock_iothread();
1488 cpu_exec_step_atomic(cpu
);
1489 qemu_mutex_lock_iothread();
1492 } else if (cpu
->stop
) {
1494 cpu
= CPU_NEXT(cpu
);
1499 cpu
= CPU_NEXT(cpu
);
1500 } /* while (cpu && !cpu->exit_request).. */
1502 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1503 atomic_set(&tcg_current_rr_cpu
, NULL
);
1505 if (cpu
&& cpu
->exit_request
) {
1506 atomic_mb_set(&cpu
->exit_request
, 0);
1509 qemu_tcg_rr_wait_io_event(cpu
? cpu
: first_cpu
);
1510 deal_with_unplugged_cpus();
1513 rcu_unregister_thread();
1517 static void *qemu_hax_cpu_thread_fn(void *arg
)
1519 CPUState
*cpu
= arg
;
1522 rcu_register_thread();
1523 qemu_mutex_lock_iothread();
1524 qemu_thread_get_self(cpu
->thread
);
1526 cpu
->thread_id
= qemu_get_thread_id();
1527 cpu
->created
= true;
1532 qemu_cond_signal(&qemu_cpu_cond
);
1535 if (cpu_can_run(cpu
)) {
1536 r
= hax_smp_cpu_exec(cpu
);
1537 if (r
== EXCP_DEBUG
) {
1538 cpu_handle_guest_debug(cpu
);
1542 qemu_wait_io_event(cpu
);
1543 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1544 rcu_unregister_thread();
1548 /* The HVF-specific vCPU thread function. This one should only run when the host
1549 * CPU supports the VMX "unrestricted guest" feature. */
1550 static void *qemu_hvf_cpu_thread_fn(void *arg
)
1552 CPUState
*cpu
= arg
;
1556 assert(hvf_enabled());
1558 rcu_register_thread();
1560 qemu_mutex_lock_iothread();
1561 qemu_thread_get_self(cpu
->thread
);
1563 cpu
->thread_id
= qemu_get_thread_id();
1569 /* signal CPU creation */
1570 cpu
->created
= true;
1571 qemu_cond_signal(&qemu_cpu_cond
);
1574 if (cpu_can_run(cpu
)) {
1575 r
= hvf_vcpu_exec(cpu
);
1576 if (r
== EXCP_DEBUG
) {
1577 cpu_handle_guest_debug(cpu
);
1580 qemu_wait_io_event(cpu
);
1581 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1583 hvf_vcpu_destroy(cpu
);
1584 cpu
->created
= false;
1585 qemu_cond_signal(&qemu_cpu_cond
);
1586 qemu_mutex_unlock_iothread();
1587 rcu_unregister_thread();
1591 static void *qemu_whpx_cpu_thread_fn(void *arg
)
1593 CPUState
*cpu
= arg
;
1596 rcu_register_thread();
1598 qemu_mutex_lock_iothread();
1599 qemu_thread_get_self(cpu
->thread
);
1600 cpu
->thread_id
= qemu_get_thread_id();
1603 r
= whpx_init_vcpu(cpu
);
1605 fprintf(stderr
, "whpx_init_vcpu failed: %s\n", strerror(-r
));
1609 /* signal CPU creation */
1610 cpu
->created
= true;
1611 qemu_cond_signal(&qemu_cpu_cond
);
1614 if (cpu_can_run(cpu
)) {
1615 r
= whpx_vcpu_exec(cpu
);
1616 if (r
== EXCP_DEBUG
) {
1617 cpu_handle_guest_debug(cpu
);
1620 while (cpu_thread_is_idle(cpu
)) {
1621 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1623 qemu_wait_io_event_common(cpu
);
1624 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1626 whpx_destroy_vcpu(cpu
);
1627 cpu
->created
= false;
1628 qemu_cond_signal(&qemu_cpu_cond
);
1629 qemu_mutex_unlock_iothread();
1630 rcu_unregister_thread();
1635 static void CALLBACK
dummy_apc_func(ULONG_PTR unused
)
1640 /* Multi-threaded TCG
1642 * In the multi-threaded case each vCPU has its own thread. The TLS
1643 * variable current_cpu can be used deep in the code to find the
1644 * current CPUState for a given thread.
1647 static void *qemu_tcg_cpu_thread_fn(void *arg
)
1649 CPUState
*cpu
= arg
;
1651 assert(tcg_enabled());
1652 g_assert(!use_icount
);
1654 rcu_register_thread();
1655 tcg_register_thread();
1657 qemu_mutex_lock_iothread();
1658 qemu_thread_get_self(cpu
->thread
);
1660 cpu
->thread_id
= qemu_get_thread_id();
1661 cpu
->created
= true;
1664 qemu_cond_signal(&qemu_cpu_cond
);
1666 /* process any pending work */
1667 cpu
->exit_request
= 1;
1670 if (cpu_can_run(cpu
)) {
1672 qemu_mutex_unlock_iothread();
1673 r
= tcg_cpu_exec(cpu
);
1674 qemu_mutex_lock_iothread();
1677 cpu_handle_guest_debug(cpu
);
1680 /* during start-up the vCPU is reset and the thread is
1681 * kicked several times. If we don't ensure we go back
1682 * to sleep in the halted state we won't cleanly
1683 * start-up when the vCPU is enabled.
1685 * cpu->halted should ensure we sleep in wait_io_event
1687 g_assert(cpu
->halted
);
1690 qemu_mutex_unlock_iothread();
1691 cpu_exec_step_atomic(cpu
);
1692 qemu_mutex_lock_iothread();
1694 /* Ignore everything else? */
1699 atomic_mb_set(&cpu
->exit_request
, 0);
1700 qemu_wait_io_event(cpu
);
1701 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1703 qemu_tcg_destroy_vcpu(cpu
);
1704 cpu
->created
= false;
1705 qemu_cond_signal(&qemu_cpu_cond
);
1706 qemu_mutex_unlock_iothread();
1707 rcu_unregister_thread();
1711 static void qemu_cpu_kick_thread(CPUState
*cpu
)
1716 if (cpu
->thread_kicked
) {
1719 cpu
->thread_kicked
= true;
1720 err
= pthread_kill(cpu
->thread
->thread
, SIG_IPI
);
1722 fprintf(stderr
, "qemu:%s: %s", __func__
, strerror(err
));
1726 if (!qemu_cpu_is_self(cpu
)) {
1727 if (whpx_enabled()) {
1728 whpx_vcpu_kick(cpu
);
1729 } else if (!QueueUserAPC(dummy_apc_func
, cpu
->hThread
, 0)) {
1730 fprintf(stderr
, "%s: QueueUserAPC failed with error %lu\n",
1731 __func__
, GetLastError());
1738 void qemu_cpu_kick(CPUState
*cpu
)
1740 qemu_cond_broadcast(cpu
->halt_cond
);
1741 if (tcg_enabled()) {
1743 /* NOP unless doing single-thread RR */
1744 qemu_cpu_kick_rr_cpu();
1746 if (hax_enabled()) {
1748 * FIXME: race condition with the exit_request check in
1751 cpu
->exit_request
= 1;
1753 qemu_cpu_kick_thread(cpu
);
1757 void qemu_cpu_kick_self(void)
1759 assert(current_cpu
);
1760 qemu_cpu_kick_thread(current_cpu
);
1763 bool qemu_cpu_is_self(CPUState
*cpu
)
1765 return qemu_thread_is_self(cpu
->thread
);
1768 bool qemu_in_vcpu_thread(void)
1770 return current_cpu
&& qemu_cpu_is_self(current_cpu
);
/* Per-thread flag: true while this thread holds the BQL (qemu_global_mutex). */
static __thread bool iothread_locked = false;
1775 bool qemu_mutex_iothread_locked(void)
1777 return iothread_locked
;
1781 * The BQL is taken from so many places that it is worth profiling the
1782 * callers directly, instead of funneling them all through a single function.
1784 void qemu_mutex_lock_iothread_impl(const char *file
, int line
)
1786 QemuMutexLockFunc bql_lock
= atomic_read(&qemu_bql_mutex_lock_func
);
1788 g_assert(!qemu_mutex_iothread_locked());
1789 bql_lock(&qemu_global_mutex
, file
, line
);
1790 iothread_locked
= true;
1793 void qemu_mutex_unlock_iothread(void)
1795 g_assert(qemu_mutex_iothread_locked());
1796 iothread_locked
= false;
1797 qemu_mutex_unlock(&qemu_global_mutex
);
1800 static bool all_vcpus_paused(void)
1805 if (!cpu
->stopped
) {
1813 void pause_all_vcpus(void)
1817 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, false);
1819 if (qemu_cpu_is_self(cpu
)) {
1820 qemu_cpu_stop(cpu
, true);
1827 /* We need to drop the replay_lock so any vCPU threads woken up
1828 * can finish their replay tasks
1830 replay_mutex_unlock();
1832 while (!all_vcpus_paused()) {
1833 qemu_cond_wait(&qemu_pause_cond
, &qemu_global_mutex
);
1839 qemu_mutex_unlock_iothread();
1840 replay_mutex_lock();
1841 qemu_mutex_lock_iothread();
1844 void cpu_resume(CPUState
*cpu
)
1847 cpu
->stopped
= false;
1851 void resume_all_vcpus(void)
1855 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, true);
1861 void cpu_remove_sync(CPUState
*cpu
)
1866 qemu_mutex_unlock_iothread();
1867 qemu_thread_join(cpu
->thread
);
1868 qemu_mutex_lock_iothread();
1871 /* For temporary buffers for forming a name */
1872 #define VCPU_THREAD_NAME_SIZE 16
1874 static void qemu_tcg_init_vcpu(CPUState
*cpu
)
1876 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1877 static QemuCond
*single_tcg_halt_cond
;
1878 static QemuThread
*single_tcg_cpu_thread
;
1879 static int tcg_region_inited
;
1881 assert(tcg_enabled());
1883 * Initialize TCG regions--once. Now is a good time, because:
1884 * (1) TCG's init context, prologue and target globals have been set up.
1885 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1886 * -accel flag is processed, so the check doesn't work then).
1888 if (!tcg_region_inited
) {
1889 tcg_region_inited
= 1;
1893 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread
) {
1894 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1895 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1896 qemu_cond_init(cpu
->halt_cond
);
1898 if (qemu_tcg_mttcg_enabled()) {
1899 /* create a thread per vCPU with TCG (MTTCG) */
1900 parallel_cpus
= true;
1901 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/TCG",
1904 qemu_thread_create(cpu
->thread
, thread_name
, qemu_tcg_cpu_thread_fn
,
1905 cpu
, QEMU_THREAD_JOINABLE
);
1908 /* share a single thread for all cpus with TCG */
1909 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "ALL CPUs/TCG");
1910 qemu_thread_create(cpu
->thread
, thread_name
,
1911 qemu_tcg_rr_cpu_thread_fn
,
1912 cpu
, QEMU_THREAD_JOINABLE
);
1914 single_tcg_halt_cond
= cpu
->halt_cond
;
1915 single_tcg_cpu_thread
= cpu
->thread
;
1918 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1921 /* For non-MTTCG cases we share the thread */
1922 cpu
->thread
= single_tcg_cpu_thread
;
1923 cpu
->halt_cond
= single_tcg_halt_cond
;
1924 cpu
->thread_id
= first_cpu
->thread_id
;
1926 cpu
->created
= true;
1930 static void qemu_hax_start_vcpu(CPUState
*cpu
)
1932 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1934 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1935 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1936 qemu_cond_init(cpu
->halt_cond
);
1938 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HAX",
1940 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hax_cpu_thread_fn
,
1941 cpu
, QEMU_THREAD_JOINABLE
);
1943 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1947 static void qemu_kvm_start_vcpu(CPUState
*cpu
)
1949 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1951 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1952 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1953 qemu_cond_init(cpu
->halt_cond
);
1954 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/KVM",
1956 qemu_thread_create(cpu
->thread
, thread_name
, qemu_kvm_cpu_thread_fn
,
1957 cpu
, QEMU_THREAD_JOINABLE
);
1960 static void qemu_hvf_start_vcpu(CPUState
*cpu
)
1962 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1964 /* HVF currently does not support TCG, and only runs in
1965 * unrestricted-guest mode. */
1966 assert(hvf_enabled());
1968 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1969 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1970 qemu_cond_init(cpu
->halt_cond
);
1972 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HVF",
1974 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hvf_cpu_thread_fn
,
1975 cpu
, QEMU_THREAD_JOINABLE
);
1978 static void qemu_whpx_start_vcpu(CPUState
*cpu
)
1980 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1982 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1983 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1984 qemu_cond_init(cpu
->halt_cond
);
1985 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/WHPX",
1987 qemu_thread_create(cpu
->thread
, thread_name
, qemu_whpx_cpu_thread_fn
,
1988 cpu
, QEMU_THREAD_JOINABLE
);
1990 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1994 static void qemu_dummy_start_vcpu(CPUState
*cpu
)
1996 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1998 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1999 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2000 qemu_cond_init(cpu
->halt_cond
);
2001 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/DUMMY",
2003 qemu_thread_create(cpu
->thread
, thread_name
, qemu_dummy_cpu_thread_fn
, cpu
,
2004 QEMU_THREAD_JOINABLE
);
2007 void qemu_init_vcpu(CPUState
*cpu
)
2009 cpu
->nr_cores
= smp_cores
;
2010 cpu
->nr_threads
= smp_threads
;
2011 cpu
->stopped
= true;
2014 /* If the target cpu hasn't set up any address spaces itself,
2015 * give it the default one.
2018 cpu_address_space_init(cpu
, 0, "cpu-memory", cpu
->memory
);
2021 if (kvm_enabled()) {
2022 qemu_kvm_start_vcpu(cpu
);
2023 } else if (hax_enabled()) {
2024 qemu_hax_start_vcpu(cpu
);
2025 } else if (hvf_enabled()) {
2026 qemu_hvf_start_vcpu(cpu
);
2027 } else if (tcg_enabled()) {
2028 qemu_tcg_init_vcpu(cpu
);
2029 } else if (whpx_enabled()) {
2030 qemu_whpx_start_vcpu(cpu
);
2032 qemu_dummy_start_vcpu(cpu
);
2035 while (!cpu
->created
) {
2036 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
2040 void cpu_stop_current(void)
2043 qemu_cpu_stop(current_cpu
, true);
2047 int vm_stop(RunState state
)
2049 if (qemu_in_vcpu_thread()) {
2050 qemu_system_vmstop_request_prepare();
2051 qemu_system_vmstop_request(state
);
2053 * FIXME: should not return to device code in case
2054 * vm_stop() has been requested.
2060 return do_vm_stop(state
, true);
2064 * Prepare for (re)starting the VM.
2065 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2066 * running or in case of an error condition), 0 otherwise.
2068 int vm_prepare_start(void)
2072 qemu_vmstop_requested(&requested
);
2073 if (runstate_is_running() && requested
== RUN_STATE__MAX
) {
2077 /* Ensure that a STOP/RESUME pair of events is emitted if a
2078 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2079 * example, according to documentation is always followed by
2082 if (runstate_is_running()) {
2083 qapi_event_send_stop(&error_abort
);
2084 qapi_event_send_resume(&error_abort
);
2088 /* We are sending this now, but the CPUs will be resumed shortly later */
2089 qapi_event_send_resume(&error_abort
);
2091 replay_enable_events();
2093 runstate_set(RUN_STATE_RUNNING
);
2094 vm_state_notify(1, RUN_STATE_RUNNING
);
2100 if (!vm_prepare_start()) {
2105 /* does a state transition even if the VM is already stopped,
2106 current state is forgotten forever */
2107 int vm_stop_force_state(RunState state
)
2109 if (runstate_is_running()) {
2110 return vm_stop(state
);
2112 runstate_set(state
);
2115 /* Make sure to return an error if the flush in a previous vm_stop()
2117 return bdrv_flush_all();
2121 void list_cpus(FILE *f
, fprintf_function cpu_fprintf
, const char *optarg
)
2123 /* XXX: implement xxx_cpu_list for targets that still miss it */
2124 #if defined(cpu_list)
2125 cpu_list(f
, cpu_fprintf
);
2129 CpuInfoList
*qmp_query_cpus(Error
**errp
)
2131 MachineState
*ms
= MACHINE(qdev_get_machine());
2132 MachineClass
*mc
= MACHINE_GET_CLASS(ms
);
2133 CpuInfoList
*head
= NULL
, *cur_item
= NULL
;
2138 #if defined(TARGET_I386)
2139 X86CPU
*x86_cpu
= X86_CPU(cpu
);
2140 CPUX86State
*env
= &x86_cpu
->env
;
2141 #elif defined(TARGET_PPC)
2142 PowerPCCPU
*ppc_cpu
= POWERPC_CPU(cpu
);
2143 CPUPPCState
*env
= &ppc_cpu
->env
;
2144 #elif defined(TARGET_SPARC)
2145 SPARCCPU
*sparc_cpu
= SPARC_CPU(cpu
);
2146 CPUSPARCState
*env
= &sparc_cpu
->env
;
2147 #elif defined(TARGET_RISCV)
2148 RISCVCPU
*riscv_cpu
= RISCV_CPU(cpu
);
2149 CPURISCVState
*env
= &riscv_cpu
->env
;
2150 #elif defined(TARGET_MIPS)
2151 MIPSCPU
*mips_cpu
= MIPS_CPU(cpu
);
2152 CPUMIPSState
*env
= &mips_cpu
->env
;
2153 #elif defined(TARGET_TRICORE)
2154 TriCoreCPU
*tricore_cpu
= TRICORE_CPU(cpu
);
2155 CPUTriCoreState
*env
= &tricore_cpu
->env
;
2156 #elif defined(TARGET_S390X)
2157 S390CPU
*s390_cpu
= S390_CPU(cpu
);
2158 CPUS390XState
*env
= &s390_cpu
->env
;
2161 cpu_synchronize_state(cpu
);
2163 info
= g_malloc0(sizeof(*info
));
2164 info
->value
= g_malloc0(sizeof(*info
->value
));
2165 info
->value
->CPU
= cpu
->cpu_index
;
2166 info
->value
->current
= (cpu
== first_cpu
);
2167 info
->value
->halted
= cpu
->halted
;
2168 info
->value
->qom_path
= object_get_canonical_path(OBJECT(cpu
));
2169 info
->value
->thread_id
= cpu
->thread_id
;
2170 #if defined(TARGET_I386)
2171 info
->value
->arch
= CPU_INFO_ARCH_X86
;
2172 info
->value
->u
.x86
.pc
= env
->eip
+ env
->segs
[R_CS
].base
;
2173 #elif defined(TARGET_PPC)
2174 info
->value
->arch
= CPU_INFO_ARCH_PPC
;
2175 info
->value
->u
.ppc
.nip
= env
->nip
;
2176 #elif defined(TARGET_SPARC)
2177 info
->value
->arch
= CPU_INFO_ARCH_SPARC
;
2178 info
->value
->u
.q_sparc
.pc
= env
->pc
;
2179 info
->value
->u
.q_sparc
.npc
= env
->npc
;
2180 #elif defined(TARGET_MIPS)
2181 info
->value
->arch
= CPU_INFO_ARCH_MIPS
;
2182 info
->value
->u
.q_mips
.PC
= env
->active_tc
.PC
;
2183 #elif defined(TARGET_TRICORE)
2184 info
->value
->arch
= CPU_INFO_ARCH_TRICORE
;
2185 info
->value
->u
.tricore
.PC
= env
->PC
;
2186 #elif defined(TARGET_S390X)
2187 info
->value
->arch
= CPU_INFO_ARCH_S390
;
2188 info
->value
->u
.s390
.cpu_state
= env
->cpu_state
;
2189 #elif defined(TARGET_RISCV)
2190 info
->value
->arch
= CPU_INFO_ARCH_RISCV
;
2191 info
->value
->u
.riscv
.pc
= env
->pc
;
2193 info
->value
->arch
= CPU_INFO_ARCH_OTHER
;
2195 info
->value
->has_props
= !!mc
->cpu_index_to_instance_props
;
2196 if (info
->value
->has_props
) {
2197 CpuInstanceProperties
*props
;
2198 props
= g_malloc0(sizeof(*props
));
2199 *props
= mc
->cpu_index_to_instance_props(ms
, cpu
->cpu_index
);
2200 info
->value
->props
= props
;
2203 /* XXX: waiting for the qapi to support GSList */
2205 head
= cur_item
= info
;
2207 cur_item
->next
= info
;
2215 static CpuInfoArch
sysemu_target_to_cpuinfo_arch(SysEmuTarget target
)
2218 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2219 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2222 case SYS_EMU_TARGET_I386
:
2223 case SYS_EMU_TARGET_X86_64
:
2224 return CPU_INFO_ARCH_X86
;
2226 case SYS_EMU_TARGET_PPC
:
2227 case SYS_EMU_TARGET_PPCEMB
:
2228 case SYS_EMU_TARGET_PPC64
:
2229 return CPU_INFO_ARCH_PPC
;
2231 case SYS_EMU_TARGET_SPARC
:
2232 case SYS_EMU_TARGET_SPARC64
:
2233 return CPU_INFO_ARCH_SPARC
;
2235 case SYS_EMU_TARGET_MIPS
:
2236 case SYS_EMU_TARGET_MIPSEL
:
2237 case SYS_EMU_TARGET_MIPS64
:
2238 case SYS_EMU_TARGET_MIPS64EL
:
2239 return CPU_INFO_ARCH_MIPS
;
2241 case SYS_EMU_TARGET_TRICORE
:
2242 return CPU_INFO_ARCH_TRICORE
;
2244 case SYS_EMU_TARGET_S390X
:
2245 return CPU_INFO_ARCH_S390
;
2247 case SYS_EMU_TARGET_RISCV32
:
2248 case SYS_EMU_TARGET_RISCV64
:
2249 return CPU_INFO_ARCH_RISCV
;
2252 return CPU_INFO_ARCH_OTHER
;
2256 static void cpustate_to_cpuinfo_s390(CpuInfoS390
*info
, const CPUState
*cpu
)
2259 S390CPU
*s390_cpu
= S390_CPU(cpu
);
2260 CPUS390XState
*env
= &s390_cpu
->env
;
2262 info
->cpu_state
= env
->cpu_state
;
2269 * fast means: we NEVER interrupt vCPU threads to retrieve
2270 * information from KVM.
2272 CpuInfoFastList
*qmp_query_cpus_fast(Error
**errp
)
2274 MachineState
*ms
= MACHINE(qdev_get_machine());
2275 MachineClass
*mc
= MACHINE_GET_CLASS(ms
);
2276 CpuInfoFastList
*head
= NULL
, *cur_item
= NULL
;
2277 SysEmuTarget target
= qapi_enum_parse(&SysEmuTarget_lookup
, TARGET_NAME
,
2282 CpuInfoFastList
*info
= g_malloc0(sizeof(*info
));
2283 info
->value
= g_malloc0(sizeof(*info
->value
));
2285 info
->value
->cpu_index
= cpu
->cpu_index
;
2286 info
->value
->qom_path
= object_get_canonical_path(OBJECT(cpu
));
2287 info
->value
->thread_id
= cpu
->thread_id
;
2289 info
->value
->has_props
= !!mc
->cpu_index_to_instance_props
;
2290 if (info
->value
->has_props
) {
2291 CpuInstanceProperties
*props
;
2292 props
= g_malloc0(sizeof(*props
));
2293 *props
= mc
->cpu_index_to_instance_props(ms
, cpu
->cpu_index
);
2294 info
->value
->props
= props
;
2297 info
->value
->arch
= sysemu_target_to_cpuinfo_arch(target
);
2298 info
->value
->target
= target
;
2299 if (target
== SYS_EMU_TARGET_S390X
) {
2300 cpustate_to_cpuinfo_s390(&info
->value
->u
.s390x
, cpu
);
2304 head
= cur_item
= info
;
2306 cur_item
->next
= info
;
2314 void qmp_memsave(int64_t addr
, int64_t size
, const char *filename
,
2315 bool has_cpu
, int64_t cpu_index
, Error
**errp
)
2321 int64_t orig_addr
= addr
, orig_size
= size
;
2327 cpu
= qemu_get_cpu(cpu_index
);
2329 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cpu-index",
2334 f
= fopen(filename
, "wb");
2336 error_setg_file_open(errp
, errno
, filename
);
2344 if (cpu_memory_rw_debug(cpu
, addr
, buf
, l
, 0) != 0) {
2345 error_setg(errp
, "Invalid addr 0x%016" PRIx64
"/size %" PRId64
2346 " specified", orig_addr
, orig_size
);
2349 if (fwrite(buf
, 1, l
, f
) != l
) {
2350 error_setg(errp
, QERR_IO_ERROR
);
2361 void qmp_pmemsave(int64_t addr
, int64_t size
, const char *filename
,
2368 f
= fopen(filename
, "wb");
2370 error_setg_file_open(errp
, errno
, filename
);
2378 cpu_physical_memory_read(addr
, buf
, l
);
2379 if (fwrite(buf
, 1, l
, f
) != l
) {
2380 error_setg(errp
, QERR_IO_ERROR
);
2391 void qmp_inject_nmi(Error
**errp
)
2393 nmi_monitor_handle(monitor_get_cpu_index(), errp
);
2396 void dump_drift_info(FILE *f
, fprintf_function cpu_fprintf
)
2402 cpu_fprintf(f
, "Host - Guest clock %"PRIi64
" ms\n",
2403 (cpu_get_clock() - cpu_get_icount())/SCALE_MS
);
2404 if (icount_align_option
) {
2405 cpu_fprintf(f
, "Max guest delay %"PRIi64
" ms\n", -max_delay
/SCALE_MS
);
2406 cpu_fprintf(f
, "Max guest advance %"PRIi64
" ms\n", max_advance
/SCALE_MS
);
2408 cpu_fprintf(f
, "Max guest delay NA\n");
2409 cpu_fprintf(f
, "Max guest advance NA\n");