4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "qemu/qemu-print.h"
35 #include "sysemu/tcg.h"
36 #include "sysemu/block-backend.h"
37 #include "exec/gdbstub.h"
38 #include "sysemu/dma.h"
39 #include "sysemu/hw_accel.h"
40 #include "sysemu/kvm.h"
41 #include "sysemu/hax.h"
42 #include "sysemu/hvf.h"
43 #include "sysemu/whpx.h"
44 #include "exec/exec-all.h"
46 #include "qemu/thread.h"
47 #include "sysemu/cpus.h"
48 #include "sysemu/qtest.h"
49 #include "qemu/main-loop.h"
50 #include "qemu/option.h"
51 #include "qemu/bitmap.h"
52 #include "qemu/seqlock.h"
53 #include "qemu/guest-random.h"
56 #include "sysemu/replay.h"
60 #include <sys/prctl.h>
63 #define PR_MCE_KILL 33
66 #ifndef PR_MCE_KILL_SET
67 #define PR_MCE_KILL_SET 1
70 #ifndef PR_MCE_KILL_EARLY
71 #define PR_MCE_KILL_EARLY 1
74 #endif /* CONFIG_LINUX */
79 /* vcpu throttling controls */
/* Timer whose callback is cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
80 static QEMUTimer
*throttle_timer
;
/* Current throttle percentage; 0 means throttling is off (see cpu_throttle_stop()). */
81 static unsigned int throttle_percentage
;
/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
83 #define CPU_THROTTLE_PCT_MIN 1
84 #define CPU_THROTTLE_PCT_MAX 99
/* Base timeslice, in nanoseconds (10 ms), used when arming throttle_timer. */
85 #define CPU_THROTTLE_TIMESLICE_NS 10000000
87 bool cpu_is_stopped(CPUState
*cpu
)
89 return cpu
->stopped
|| !runstate_is_running();
/*
 * Decide whether the thread of @cpu has nothing to do.  Three tests are made
 * in order: a pending stop request or queued work; the stopped state; and
 * whether the vCPU is not halted, has work, or halting is handled in-kernel.
 * NOTE(review): the value returned on each branch is not visible in this
 * listing - confirm against the complete source.
 */
92 static bool cpu_thread_is_idle(CPUState
*cpu
)
94 if (cpu
->stop
|| cpu
->queued_work_first
) {
97 if (cpu_is_stopped(cpu
)) {
100 if (!cpu
->halted
|| cpu_has_work(cpu
) ||
101 kvm_halt_in_kernel()) {
/*
 * Apply cpu_thread_is_idle() to vCPUs.
 * NOTE(review): the loop header and return statements are not visible here;
 * presumably this is true only when every vCPU thread is idle - confirm.
 */
107 static bool all_cpu_threads_idle(void)
112 if (!cpu_thread_is_idle(cpu
)) {
119 /***********************************************************/
120 /* guest cycle counter */
122 /* Protected by TimersState seqlock */
/* Set from the "sleep" icount option in configure_icount(); defaults to true. */
124 static bool icount_sleep
= true;
125 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
/* Upper bound for icount_time_shift when icount_adjust() speeds time up. */
126 #define MAX_ICOUNT_SHIFT 10
/*
 * State shared by the tick, clock and icount helpers in this file.
 * vm_clock_seqlock/vm_clock_lock are always taken together through
 * seqlock_write_lock()/seqlock_write_unlock() by writers; readers use
 * seqlock_read_begin()/seqlock_read_retry() on vm_clock_seqlock.
 * cpu_ticks_enabled is toggled by cpu_enable_ticks()/cpu_disable_ticks().
 * NOTE(review): at least one member referenced elsewhere (qemu_icount) and the
 * closing of the typedef are not visible in this listing.
 */
128 typedef struct TimersState
{
129 /* Protected by BQL. */
130 int64_t cpu_ticks_prev
;
131 int64_t cpu_ticks_offset
;
133 /* Protect fields that can be respectively read outside the
134 * BQL, and written from multiple threads.
136 QemuSeqLock vm_clock_seqlock
;
137 QemuSpin vm_clock_lock
;
139 int16_t cpu_ticks_enabled
;
141 /* Conversion factor from emulated instructions to virtual clock ticks. */
142 int16_t icount_time_shift
;
144 /* Compensate for varying guest execution speed. */
145 int64_t qemu_icount_bias
;
/* Clock value at which a warp began; -1 is used as "no warp pending". */
147 int64_t vm_clock_warp_start
;
/* Offset adjusted by cpu_enable_ticks() and cpu_disable_ticks(). */
148 int64_t cpu_clock_offset
;
150 /* Only written by TCG thread */
153 /* for adjusting icount */
/* The three timers below are created in configure_icount(). */
154 QEMUTimer
*icount_rt_timer
;
155 QEMUTimer
*icount_vm_timer
;
156 QEMUTimer
*icount_warp_timer
;
/* The single TimersState instance used by every helper in this file. */
159 static TimersState timers_state
;
/*
 * We default to false if we know other options have been enabled
 * which are currently incompatible with MTTCG. Otherwise when each
 * guest (target) has been updated to support:
 *   - atomic instructions
 *   - memory ordering primitives (barriers)
 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 *
 * Once a guest architecture has been converted to the new primitives
 * there are two remaining limitations to check.
 *
 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 * - The host must have a stronger memory order than the guest
 *
 * It may be possible in future to support strong guests on weak hosts
 * but that will require tagging all load/stores in a guest with their
 * implicit memory order requirements which would likely slow things
 * down a lot.
 */
/*
 * When both TCG_GUEST_DEFAULT_MO and TCG_TARGET_DEFAULT_MO are defined, report
 * true iff every bit set in the guest mask is also set in the host mask.
 * NOTE(review): the branch used when either macro is undefined is not visible
 * in this listing.
 */
182 static bool check_tcg_memory_orders_compatible(void)
184 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
185 return (TCG_GUEST_DEFAULT_MO
& ~TCG_TARGET_DEFAULT_MO
) == 0;
/*
 * Compute the default for mttcg_enabled.  icount and oversized guests are
 * tested first; with TARGET_SUPPORTS_MTTCG the answer is whatever
 * check_tcg_memory_orders_compatible() returns.
 * NOTE(review): the results of the first test and of the non-MTTCG build are
 * not visible here (presumably false) - confirm.
 */
191 static bool default_mttcg_enabled(void)
193 if (use_icount
|| TCG_OVERSIZED_GUEST
) {
196 #ifdef TARGET_SUPPORTS_MTTCG
197 return check_tcg_memory_orders_compatible();
/*
 * Configure mttcg_enabled from the "thread" option in @opts.
 *   "multi"  - rejected through @errp for oversized guests or when icount is
 *              enabled; otherwise warnings are issued for unconverted guests
 *              and for a guest memory model stronger than the host's, and
 *              MTTCG is switched on.
 *   "single" - MTTCG is switched off.
 *   other    - reported through @errp as an invalid setting.
 * default_mttcg_enabled() supplies the value on the remaining path.
 * NOTE(review): the test guarding that last path (presumably "option absent")
 * is not visible in this listing.
 */
204 void qemu_tcg_configure(QemuOpts
*opts
, Error
**errp
)
206 const char *t
= qemu_opt_get(opts
, "thread");
208 if (strcmp(t
, "multi") == 0) {
209 if (TCG_OVERSIZED_GUEST
) {
210 error_setg(errp
, "No MTTCG when guest word size > hosts");
211 } else if (use_icount
) {
212 error_setg(errp
, "No MTTCG when icount is enabled");
214 #ifndef TARGET_SUPPORTS_MTTCG
215 warn_report("Guest not yet converted to MTTCG - "
216 "you may get unexpected results");
218 if (!check_tcg_memory_orders_compatible()) {
219 warn_report("Guest expects a stronger memory ordering "
220 "than the host provides");
221 error_printf("This may cause strange/hard to debug errors\n");
223 mttcg_enabled
= true;
225 } else if (strcmp(t
, "single") == 0) {
226 mttcg_enabled
= false;
228 error_setg(errp
, "Invalid 'thread' setting %s", t
);
231 mttcg_enabled
= default_mttcg_enabled();
235 /* The current number of executed instructions is based on what we
236 * originally budgeted minus the current state of the decrementing
237 * icount counters in extra/u16.low.
239 static int64_t cpu_get_icount_executed(CPUState
*cpu
)
241 return (cpu
->icount_budget
-
242 (cpu_neg(cpu
)->icount_decr
.u16
.low
+ cpu
->icount_extra
));
246 * Update the global shared timer_state.qemu_icount to take into
247 * account executed instructions. This is done by the TCG vCPU
248 * thread so the main-loop can see time has moved forward.
250 static void cpu_update_icount_locked(CPUState
*cpu
)
252 int64_t executed
= cpu_get_icount_executed(cpu
);
253 cpu
->icount_budget
-= executed
;
255 atomic_set_i64(&timers_state
.qemu_icount
,
256 timers_state
.qemu_icount
+ executed
);
260 * Update the global shared timer_state.qemu_icount to take into
261 * account executed instructions. This is done by the TCG vCPU
262 * thread so the main-loop can see time has moved forward.
264 void cpu_update_icount(CPUState
*cpu
)
266 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
267 &timers_state
.vm_clock_lock
);
268 cpu_update_icount_locked(cpu
);
269 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
270 &timers_state
.vm_clock_lock
);
/*
 * Return timers_state.qemu_icount.  When called from a vCPU that is currently
 * running, the instructions it has executed are folded in first via
 * cpu_update_icount_locked(); a read while can_do_io is clear is reported as
 * "Bad icount read".
 * NOTE(review): what follows the error report is not visible in this listing.
 */
273 static int64_t cpu_get_icount_raw_locked(void)
275 CPUState
*cpu
= current_cpu
;
277 if (cpu
&& cpu
->running
) {
278 if (!cpu
->can_do_io
) {
279 error_report("Bad icount read");
282 /* Take into account what has run */
283 cpu_update_icount_locked(cpu
);
285 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
286 return atomic_read_i64(&timers_state
.qemu_icount
);
289 static int64_t cpu_get_icount_locked(void)
291 int64_t icount
= cpu_get_icount_raw_locked();
292 return atomic_read_i64(&timers_state
.qemu_icount_bias
) +
293 cpu_icount_to_ns(icount
);
/*
 * Reader for the raw instruction count: samples cpu_get_icount_raw_locked()
 * inside a seqlock read section and repeats while seqlock_read_retry()
 * reports that a writer interfered.
 * NOTE(review): declarations, loop opening and return are not visible here.
 */
296 int64_t cpu_get_icount_raw(void)
302 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
303 icount
= cpu_get_icount_raw_locked();
304 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
309 /* Return the virtual CPU time, based on the instruction counter. */
/*
 * Samples cpu_get_icount_locked() inside a seqlock read section and repeats
 * while seqlock_read_retry() reports a concurrent write.
 * NOTE(review): declarations, loop opening and return are not visible here.
 */
310 int64_t cpu_get_icount(void)
316 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
317 icount
= cpu_get_icount_locked();
318 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
323 int64_t cpu_icount_to_ns(int64_t icount
)
325 return icount
<< atomic_read(&timers_state
.icount_time_shift
);
/*
 * Compute the guest-visible tick count: cpu_ticks_offset, plus the host tick
 * counter while ticks are enabled.  If the result would go backwards relative
 * to cpu_ticks_prev, the offset is bumped so the value stays at the previous
 * reading.  cpu_ticks_prev is updated with the result.
 * NOTE(review): the return statement is not visible in this listing
 * (presumably returns ticks).
 */
328 static int64_t cpu_get_ticks_locked(void)
330 int64_t ticks
= timers_state
.cpu_ticks_offset
;
331 if (timers_state
.cpu_ticks_enabled
) {
332 ticks
+= cpu_get_host_ticks();
335 if (timers_state
.cpu_ticks_prev
> ticks
) {
336 /* Non increasing ticks may happen if the host uses software suspend. */
337 timers_state
.cpu_ticks_offset
+= timers_state
.cpu_ticks_prev
- ticks
;
338 ticks
= timers_state
.cpu_ticks_prev
;
341 timers_state
.cpu_ticks_prev
= ticks
;
/* Return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 */
/*
 * Public tick reader.  One path returns cpu_get_icount(); the other calls
 * cpu_get_ticks_locked() while holding vm_clock_lock.
 * NOTE(review): the condition choosing the icount path (presumably
 * use_icount) and the final return are not visible in this listing.
 */
349 int64_t cpu_get_ticks(void)
354 return cpu_get_icount();
357 qemu_spin_lock(&timers_state
.vm_clock_lock
);
358 ticks
= cpu_get_ticks_locked();
359 qemu_spin_unlock(&timers_state
.vm_clock_lock
);
/*
 * Start from cpu_clock_offset and, while ticks are enabled, adjust it.
 * NOTE(review): the adjustment and the return statement are not visible in
 * this listing; cpu_enable_ticks() subtracts get_clock() from the offset, so
 * presumably get_clock() is added back here - confirm.
 */
363 static int64_t cpu_get_clock_locked(void)
367 time
= timers_state
.cpu_clock_offset
;
368 if (timers_state
.cpu_ticks_enabled
) {
/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
/*
 * Samples cpu_get_clock_locked() inside a seqlock read section and repeats
 * while seqlock_read_retry() reports a concurrent write.
 * NOTE(review): declarations, loop opening and return are not visible here.
 */
378 int64_t cpu_get_clock(void)
384 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
385 ti
= cpu_get_clock_locked();
386 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
391 /* enable cpu_get_ticks()
392 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
394 void cpu_enable_ticks(void)
396 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
397 &timers_state
.vm_clock_lock
);
398 if (!timers_state
.cpu_ticks_enabled
) {
399 timers_state
.cpu_ticks_offset
-= cpu_get_host_ticks();
400 timers_state
.cpu_clock_offset
-= get_clock();
401 timers_state
.cpu_ticks_enabled
= 1;
403 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
404 &timers_state
.vm_clock_lock
);
407 /* disable cpu_get_ticks() : the clock is stopped. You must not call
408 * cpu_get_ticks() after that.
409 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
411 void cpu_disable_ticks(void)
413 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
414 &timers_state
.vm_clock_lock
);
415 if (timers_state
.cpu_ticks_enabled
) {
416 timers_state
.cpu_ticks_offset
+= cpu_get_host_ticks();
417 timers_state
.cpu_clock_offset
= cpu_get_clock_locked();
418 timers_state
.cpu_ticks_enabled
= 0;
420 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
421 &timers_state
.vm_clock_lock
);
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
428 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
/*
 * Adaptive icount tuning.  Under the timers_state write lock, compare
 * virtual time (cpu_get_icount_locked()) with the VM clock
 * (cpu_get_clock_locked()).  When the guest is too far ahead the shift is
 * decremented (not below 0); when too far behind it is incremented (not above
 * MAX_ICOUNT_SHIFT).  qemu_icount_bias is then recomputed so that the current
 * virtual time is unchanged under the new shift.  Does nothing visible while
 * the VM is not running.
 * NOTE(review): local declarations, the leading "delta" comparisons of both
 * conditions and the update of last_delta are not visible in this listing.
 */
430 static void icount_adjust(void)
436 /* Protected by TimersState mutex. */
437 static int64_t last_delta
;
439 /* If the VM is not running, then do nothing. */
440 if (!runstate_is_running()) {
444 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
445 &timers_state
.vm_clock_lock
);
446 cur_time
= cpu_get_clock_locked();
447 cur_icount
= cpu_get_icount_locked();
449 delta
= cur_icount
- cur_time
;
450 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
452 && last_delta
+ ICOUNT_WOBBLE
< delta
* 2
453 && timers_state
.icount_time_shift
> 0) {
454 /* The guest is getting too far ahead. Slow time down. */
455 atomic_set(&timers_state
.icount_time_shift
,
456 timers_state
.icount_time_shift
- 1);
459 && last_delta
- ICOUNT_WOBBLE
> delta
* 2
460 && timers_state
.icount_time_shift
< MAX_ICOUNT_SHIFT
) {
461 /* The guest is getting too far behind. Speed time up. */
462 atomic_set(&timers_state
.icount_time_shift
,
463 timers_state
.icount_time_shift
+ 1);
/* Keep the reported virtual time continuous across the shift change. */
466 atomic_set_i64(&timers_state
.qemu_icount_bias
,
467 cur_icount
- (timers_state
.qemu_icount
468 << timers_state
.icount_time_shift
));
469 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
470 &timers_state
.vm_clock_lock
);
/*
 * Callback of icount_rt_timer (see configure_icount()): re-arms the timer
 * 1000 ms ahead on QEMU_CLOCK_VIRTUAL_RT.  @opaque is unused in the visible
 * code.
 * NOTE(review): one statement after the re-arm is not visible in this listing
 * (presumably the call to icount_adjust()).
 */
473 static void icount_adjust_rt(void *opaque
)
475 timer_mod(timers_state
.icount_rt_timer
,
476 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
/*
 * Callback of icount_vm_timer (see configure_icount()): re-arms the timer a
 * tenth of a second ahead on QEMU_CLOCK_VIRTUAL.  @opaque is unused in the
 * visible code.
 * NOTE(review): one statement after the re-arm is not visible in this listing
 * (presumably the call to icount_adjust()).
 */
480 static void icount_adjust_vm(void *opaque
)
482 timer_mod(timers_state
.icount_vm_timer
,
483 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
484 NANOSECONDS_PER_SECOND
/ 10);
488 static int64_t qemu_icount_round(int64_t count
)
490 int shift
= atomic_read(&timers_state
.icount_time_shift
);
491 return (count
+ (1 << shift
) - 1) >> shift
;
/*
 * Account a pending clock warp.  vm_clock_warp_start is first sampled under
 * the seqlock read protocol and compared with -1.  Under the write lock, while
 * the VM is running, the elapsed time since the warp started is added to
 * qemu_icount_bias; in adaptive mode (use_icount == 2) the amount is capped so
 * that virtual time does not run ahead of the clock.  vm_clock_warp_start is
 * reset to -1, and QEMU_CLOCK_VIRTUAL is notified if it has expired timers.
 * NOTE(review): declarations and the statement taken when no warp is pending
 * (presumably an early return) are not visible in this listing.
 */
494 static void icount_warp_rt(void)
499 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
500 * changes from -1 to another value, so the race here is okay.
503 seq
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
504 warp_start
= timers_state
.vm_clock_warp_start
;
505 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, seq
));
507 if (warp_start
== -1) {
511 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
512 &timers_state
.vm_clock_lock
);
513 if (runstate_is_running()) {
514 int64_t clock
= REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT
,
515 cpu_get_clock_locked());
518 warp_delta
= clock
- timers_state
.vm_clock_warp_start
;
519 if (use_icount
== 2) {
521 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
522 * far ahead of real time.
524 int64_t cur_icount
= cpu_get_icount_locked();
525 int64_t delta
= clock
- cur_icount
;
526 warp_delta
= MIN(warp_delta
, delta
);
528 atomic_set_i64(&timers_state
.qemu_icount_bias
,
529 timers_state
.qemu_icount_bias
+ warp_delta
);
531 timers_state
.vm_clock_warp_start
= -1;
532 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
533 &timers_state
.vm_clock_lock
);
535 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL
)) {
536 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
/*
 * Callback of icount_warp_timer (see configure_icount()).
 * NOTE(review): the statement(s) of the body are not visible in this listing
 * (presumably a call to icount_warp_rt()).
 */
540 static void icount_timer_cb(void *opaque
)
542 /* No need for a checkpoint because the timer already synchronizes
543 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
548 void qtest_clock_warp(int64_t dest
)
550 int64_t clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
551 AioContext
*aio_context
;
552 assert(qtest_enabled());
553 aio_context
= qemu_get_aio_context();
554 while (clock
< dest
) {
555 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
556 int64_t warp
= qemu_soonest_timeout(dest
- clock
, deadline
);
558 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
559 &timers_state
.vm_clock_lock
);
560 atomic_set_i64(&timers_state
.qemu_icount_bias
,
561 timers_state
.qemu_icount_bias
+ warp
);
562 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
563 &timers_state
.vm_clock_lock
);
565 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
566 timerlist_run_timers(aio_context
->tlg
.tl
[QEMU_CLOCK_VIRTUAL
]);
567 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
569 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
/*
 * Start a clock warp when the vCPUs have nothing to run, so that
 * QEMU_CLOCK_VIRTUAL still reaches its next deadline.  Two strategies are
 * visible below: in no-sleep mode the deadline is added to qemu_icount_bias
 * immediately and the clock is notified; otherwise vm_clock_warp_start is
 * recorded and icount_warp_timer is armed.  A deadline of 0 just notifies the
 * clock.  In record/replay mode the warp is gated by
 * CHECKPOINT_CLOCK_WARP_START.
 * NOTE(review): declarations, early returns and several branch conditions are
 * not visible in this listing - confirm the control flow against the full
 * source.
 */
572 void qemu_start_warp_timer(void)
581 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
582 * do not fire, so computing the deadline does not make sense.
584 if (!runstate_is_running()) {
588 if (replay_mode
!= REPLAY_MODE_PLAY
) {
589 if (!all_cpu_threads_idle()) {
593 if (qtest_enabled()) {
594 /* When testing, qtest commands advance icount. */
598 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
);
600 /* warp clock deterministically in record/replay mode */
601 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
)) {
602 /* vCPU is sleeping and warp can't be started.
603 It is probably a race condition: notification sent
604 to vCPU was processed in advance and vCPU went to sleep.
605 Therefore we have to wake it up for doing something. */
606 if (replay_has_checkpoint()) {
607 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
613 /* We want to use the earliest deadline from ALL vm_clocks */
614 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
);
615 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
617 static bool notified
;
618 if (!icount_sleep
&& !notified
) {
619 warn_report("icount sleep disabled and no active timers");
627 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
628 * sleep. Otherwise, the CPU might be waiting for a future timer
629 * interrupt to wake it up, but the interrupt never comes because
630 * the vCPU isn't running any insns and thus doesn't advance the
631 * QEMU_CLOCK_VIRTUAL.
635 * We never let VCPUs sleep in no sleep icount mode.
636 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
637 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
638 * It is useful when we want a deterministic execution time,
639 * isolated from host latencies.
641 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
642 &timers_state
.vm_clock_lock
);
643 atomic_set_i64(&timers_state
.qemu_icount_bias
,
644 timers_state
.qemu_icount_bias
+ deadline
);
645 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
646 &timers_state
.vm_clock_lock
);
647 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
650 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
651 * "real" time, (related to the time left until the next event) has
652 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
653 * This avoids that the warps are visible externally; for example,
654 * you will not be sending network packets continuously instead of
657 seqlock_write_lock(&timers_state
.vm_clock_seqlock
,
658 &timers_state
.vm_clock_lock
);
659 if (timers_state
.vm_clock_warp_start
== -1
660 || timers_state
.vm_clock_warp_start
> clock
) {
661 timers_state
.vm_clock_warp_start
= clock
;
663 seqlock_write_unlock(&timers_state
.vm_clock_seqlock
,
664 &timers_state
.vm_clock_lock
);
665 timer_mod_anticipate(timers_state
.icount_warp_timer
,
668 } else if (deadline
== 0) {
669 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
/*
 * Finish a pending warp before vCPUs run again.  Three guards are applied:
 * icount with sleep enabled, a running VM, and the
 * CHECKPOINT_CLOCK_WARP_ACCOUNT replay checkpoint.  icount_warp_timer is then
 * deleted.
 * NOTE(review): the early returns and one statement after timer_del()
 * (presumably icount_warp_rt()) are not visible in this listing.
 */
673 static void qemu_account_warp_timer(void)
675 if (!use_icount
|| !icount_sleep
) {
679 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
680 * do not fire, so computing the deadline does not make sense.
682 if (!runstate_is_running()) {
686 /* warp clock deterministically in record/replay mode */
687 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT
)) {
691 timer_del(timers_state
.icount_warp_timer
);
/*
 * .needed callback of icount_vmstate_timers.
 * NOTE(review): the body is not visible in this listing (presumably reports
 * whether icount is in use) - confirm.
 */
695 static bool icount_state_needed(void *opaque
)
700 static bool warp_timer_state_needed(void *opaque
)
702 TimersState
*s
= opaque
;
703 return s
->icount_warp_timer
!= NULL
;
706 static bool adjust_timers_state_needed(void *opaque
)
708 TimersState
*s
= opaque
;
709 return s
->icount_rt_timer
!= NULL
;
713 * Subsection for warp timer migration is optional, because may not be created
/*
 * Migration subsection "timer/icount/warp_timer": carries vm_clock_warp_start
 * and icount_warp_timer, and is sent only when warp_timer_state_needed() says
 * so.
 * NOTE(review): the version_id field and the closing braces are not visible
 * in this listing.
 */
715 static const VMStateDescription icount_vmstate_warp_timer
= {
716 .name
= "timer/icount/warp_timer",
718 .minimum_version_id
= 1,
719 .needed
= warp_timer_state_needed
,
720 .fields
= (VMStateField
[]) {
721 VMSTATE_INT64(vm_clock_warp_start
, TimersState
),
722 VMSTATE_TIMER_PTR(icount_warp_timer
, TimersState
),
723 VMSTATE_END_OF_LIST()
/*
 * Migration subsection "timer/icount/timers": carries the two adaptive-mode
 * adjustment timers, and is sent only when adjust_timers_state_needed() says
 * so.
 * NOTE(review): the version_id field and the closing braces are not visible
 * in this listing.
 */
727 static const VMStateDescription icount_vmstate_adjust_timers
= {
728 .name
= "timer/icount/timers",
730 .minimum_version_id
= 1,
731 .needed
= adjust_timers_state_needed
,
732 .fields
= (VMStateField
[]) {
733 VMSTATE_TIMER_PTR(icount_rt_timer
, TimersState
),
734 VMSTATE_TIMER_PTR(icount_vm_timer
, TimersState
),
735 VMSTATE_END_OF_LIST()
740 * This is a subsection for icount migration.
/*
 * Migration subsection "timer/icount": carries qemu_icount_bias and
 * qemu_icount, gated by icount_state_needed(), with the warp-timer and
 * adjust-timers descriptions as nested subsections.
 * NOTE(review): the version_id field, list terminators and closing braces are
 * not visible in this listing.
 */
742 static const VMStateDescription icount_vmstate_timers
= {
743 .name
= "timer/icount",
745 .minimum_version_id
= 1,
746 .needed
= icount_state_needed
,
747 .fields
= (VMStateField
[]) {
748 VMSTATE_INT64(qemu_icount_bias
, TimersState
),
749 VMSTATE_INT64(qemu_icount
, TimersState
),
750 VMSTATE_END_OF_LIST()
752 .subsections
= (const VMStateDescription
*[]) {
753 &icount_vmstate_warp_timer
,
754 &icount_vmstate_adjust_timers
,
/*
 * Top-level migration description for timers_state (registered in
 * cpu_ticks_init()).  cpu_clock_offset is a version-2 field;
 * icount_vmstate_timers is attached as a subsection.
 * NOTE(review): the name/version fields, one field entry between the two
 * shown, and the closing braces are not visible in this listing.
 */
759 static const VMStateDescription vmstate_timers
= {
762 .minimum_version_id
= 1,
763 .fields
= (VMStateField
[]) {
764 VMSTATE_INT64(cpu_ticks_offset
, TimersState
),
766 VMSTATE_INT64_V(cpu_clock_offset
, TimersState
, 2),
767 VMSTATE_END_OF_LIST()
769 .subsections
= (const VMStateDescription
*[]) {
770 &icount_vmstate_timers
,
/*
 * Work item queued on a vCPU by cpu_throttle_timer_tick(): sleeps with the
 * iothread lock dropped, then clears throttle_thread_scheduled so the next
 * tick may queue it again.  With throttle fraction pct, the sleep lasts
 * pct / (1 - pct) timeslices, i.e. the vCPU sleeps pct of each
 * run-plus-sleep period.
 * NOTE(review): declarations of pct/sleeptime_ns and the statement taken when
 * throttling is off (presumably an early return) are not visible here.
 */
775 static void cpu_throttle_thread(CPUState
*cpu
, run_on_cpu_data opaque
)
778 double throttle_ratio
;
781 if (!cpu_throttle_get_percentage()) {
785 pct
= (double)cpu_throttle_get_percentage()/100;
786 throttle_ratio
= pct
/ (1 - pct
);
787 sleeptime_ns
= (long)(throttle_ratio
* CPU_THROTTLE_TIMESLICE_NS
);
789 qemu_mutex_unlock_iothread();
790 g_usleep(sleeptime_ns
/ 1000); /* Convert ns to us for usleep call */
791 qemu_mutex_lock_iothread();
792 atomic_set(&cpu
->throttle_thread_scheduled
, 0);
/*
 * Callback of throttle_timer.  Queues cpu_throttle_thread() on a vCPU unless
 * one is already scheduled there (atomic_xchg on throttle_thread_scheduled),
 * then re-arms the timer CPU_THROTTLE_TIMESLICE_NS / (1 - pct) ahead, i.e.
 * one full run-plus-sleep period.
 * NOTE(review): the vCPU loop header, the remaining async_run_on_cpu()
 * argument and the statement taken when throttling is off are not visible in
 * this listing.
 */
795 static void cpu_throttle_timer_tick(void *opaque
)
800 /* Stop the timer if needed */
801 if (!cpu_throttle_get_percentage()) {
805 if (!atomic_xchg(&cpu
->throttle_thread_scheduled
, 1)) {
806 async_run_on_cpu(cpu
, cpu_throttle_thread
,
811 pct
= (double)cpu_throttle_get_percentage()/100;
812 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
813 CPU_THROTTLE_TIMESLICE_NS
/ (1-pct
));
816 void cpu_throttle_set(int new_throttle_pct
)
818 /* Ensure throttle percentage is within valid range */
819 new_throttle_pct
= MIN(new_throttle_pct
, CPU_THROTTLE_PCT_MAX
);
820 new_throttle_pct
= MAX(new_throttle_pct
, CPU_THROTTLE_PCT_MIN
);
822 atomic_set(&throttle_percentage
, new_throttle_pct
);
824 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
825 CPU_THROTTLE_TIMESLICE_NS
);
828 void cpu_throttle_stop(void)
830 atomic_set(&throttle_percentage
, 0);
/* Report whether throttling is on, i.e. the stored percentage is non-zero. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
838 int cpu_throttle_get_percentage(void)
840 return atomic_read(&throttle_percentage
);
843 void cpu_ticks_init(void)
845 seqlock_init(&timers_state
.vm_clock_seqlock
);
846 qemu_spin_init(&timers_state
.vm_clock_lock
);
847 vmstate_register(NULL
, 0, &vmstate_timers
, &timers_state
);
848 throttle_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
849 cpu_throttle_timer_tick
, NULL
);
/*
 * Configure instruction counting from @opts ("shift", "align", "sleep").
 * Incompatible combinations are reported through @errp.  A numeric "shift" is
 * parsed with strtol() into icount_time_shift; "auto" selects adaptive mode,
 * which starts from a shift of 3 and creates/arms the realtime and virtual
 * adjustment timers.  The warp timer is created here as well.
 * NOTE(review): several statements (handling of a missing "shift" option,
 * the assignments to use_icount, early returns) are not visible in this
 * listing.
 * NOTE(review): the parsed shift is stored into an int16_t and only checked
 * for errno / trailing characters / empty input; no range check against
 * 0..MAX_ICOUNT_SHIFT is visible, so negative or large values would reach
 * the shift expressions in cpu_icount_to_ns()/qemu_icount_round().
 */
852 void configure_icount(QemuOpts
*opts
, Error
**errp
)
855 char *rem_str
= NULL
;
857 option
= qemu_opt_get(opts
, "shift");
859 if (qemu_opt_get(opts
, "align") != NULL
) {
860 error_setg(errp
, "Please specify shift option when using align");
865 icount_sleep
= qemu_opt_get_bool(opts
, "sleep", true);
867 timers_state
.icount_warp_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
868 icount_timer_cb
, NULL
);
871 icount_align_option
= qemu_opt_get_bool(opts
, "align", false);
873 if (icount_align_option
&& !icount_sleep
) {
874 error_setg(errp
, "align=on and sleep=off are incompatible");
876 if (strcmp(option
, "auto") != 0) {
878 timers_state
.icount_time_shift
= strtol(option
, &rem_str
, 0);
879 if (errno
!= 0 || *rem_str
!= '\0' || !strlen(option
)) {
880 error_setg(errp
, "icount: Invalid shift value");
884 } else if (icount_align_option
) {
885 error_setg(errp
, "shift=auto and align=on are incompatible");
886 } else if (!icount_sleep
) {
887 error_setg(errp
, "shift=auto and sleep=off are incompatible");
892 /* 125MIPS seems a reasonable initial guess at the guest speed.
893 It will be corrected fairly quickly anyway. */
894 timers_state
.icount_time_shift
= 3;
896 /* Have both realtime and virtual time triggers for speed adjustment.
897 The realtime trigger catches emulated time passing too slowly,
898 the virtual time trigger catches emulated time passing too fast.
899 Realtime triggers occur even when idle, so use them less frequently
901 timers_state
.vm_clock_warp_start
= -1;
902 timers_state
.icount_rt_timer
= timer_new_ms(QEMU_CLOCK_VIRTUAL_RT
,
903 icount_adjust_rt
, NULL
);
904 timer_mod(timers_state
.icount_rt_timer
,
905 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
906 timers_state
.icount_vm_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
907 icount_adjust_vm
, NULL
);
908 timer_mod(timers_state
.icount_vm_timer
,
909 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
910 NANOSECONDS_PER_SECOND
/ 10);
913 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
/* Created lazily by start_tcg_kick_timer(); its callback is kick_tcg_thread(). */
925 static QEMUTimer
*tcg_kick_vcpu_timer
;
/* vCPU kicked by qemu_cpu_kick_rr_cpu(); read there with atomic_mb_read(). */
926 static CPUState
*tcg_current_rr_cpu
;
/* Interval between kicks: one tenth of a second, in nanoseconds. */
928 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
930 static inline int64_t qemu_tcg_next_kick(void)
932 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) + TCG_KICK_PERIOD
;
935 /* Kick the currently round-robin scheduled vCPU */
/*
 * tcg_current_rr_cpu is re-read after each pass and the loop repeats until
 * the value is stable, so a vCPU switch racing with the kick is not missed.
 * NOTE(review): the loop opening and the kick itself are not visible in this
 * listing.
 */
936 static void qemu_cpu_kick_rr_cpu(void)
940 cpu
= atomic_mb_read(&tcg_current_rr_cpu
);
944 } while (cpu
!= atomic_mb_read(&tcg_current_rr_cpu
));
947 static void do_nothing(CPUState
*cpu
, run_on_cpu_data unused
)
/*
 * Clock notification hook.  Only the combination "icount in use and @type is
 * QEMU_CLOCK_VIRTUAL" gets special treatment: from a vCPU thread the current
 * vCPU is kicked; from any other thread an empty work item is queued on the
 * first vCPU, if there is one.  @opaque is unused in the visible code.
 * NOTE(review): the statements taken for other clocks / without icount are
 * not visible in this listing.
 */
951 void qemu_timer_notify_cb(void *opaque
, QEMUClockType type
)
953 if (!use_icount
|| type
!= QEMU_CLOCK_VIRTUAL
) {
958 if (qemu_in_vcpu_thread()) {
959 /* A CPU is currently running; kick it back out to the
960 * tcg_cpu_exec() loop so it will recalculate its
961 * icount deadline immediately.
963 qemu_cpu_kick(current_cpu
);
964 } else if (first_cpu
) {
965 /* qemu_cpu_kick is not enough to kick a halted CPU out of
966 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
967 * causes cpu_thread_is_idle to return false. This way,
968 * handle_icount_deadline can run.
969 * If we have no CPUs at all for some reason, we don't
970 * need to do anything.
972 async_run_on_cpu(first_cpu
, do_nothing
, RUN_ON_CPU_NULL
);
976 static void kick_tcg_thread(void *opaque
)
978 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
979 qemu_cpu_kick_rr_cpu();
982 static void start_tcg_kick_timer(void)
984 assert(!mttcg_enabled
);
985 if (!tcg_kick_vcpu_timer
&& CPU_NEXT(first_cpu
)) {
986 tcg_kick_vcpu_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
987 kick_tcg_thread
, NULL
);
989 if (tcg_kick_vcpu_timer
&& !timer_pending(tcg_kick_vcpu_timer
)) {
990 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
994 static void stop_tcg_kick_timer(void)
996 assert(!mttcg_enabled
);
997 if (tcg_kick_vcpu_timer
&& timer_pending(tcg_kick_vcpu_timer
)) {
998 timer_del(tcg_kick_vcpu_timer
);
1002 /***********************************************************/
/*
 * Report a hardware error: print "qemu: hardware error: " followed by the
 * printf-style message built from @fmt to stderr, then dump the state
 * (including FPU registers) of vCPUs, each preceded by its index.
 * NOTE(review): the va_list handling, the vCPU loop header and whatever ends
 * the function (presumably an abort) are not visible in this listing.
 */
1003 void hw_error(const char *fmt
, ...)
1009 fprintf(stderr
, "qemu: hardware error: ");
1010 vfprintf(stderr
, fmt
, ap
);
1011 fprintf(stderr
, "\n");
1013 fprintf(stderr
, "CPU #%d:\n", cpu
->cpu_index
);
1014 cpu_dump_state(cpu
, stderr
, CPU_DUMP_FPU
);
/*
 * Call cpu_synchronize_state() on vCPUs, plus the HVF-specific variant when
 * HVF is enabled.
 * NOTE(review): the loop header is not visible in this listing (presumably
 * iterates over every vCPU).
 */
1020 void cpu_synchronize_all_states(void)
1025 cpu_synchronize_state(cpu
);
1026 /* TODO: move to cpu_synchronize_state() */
1027 if (hvf_enabled()) {
1028 hvf_cpu_synchronize_state(cpu
);
/*
 * Call cpu_synchronize_post_reset() on vCPUs, plus the HVF-specific variant
 * when HVF is enabled.
 * NOTE(review): the loop header is not visible in this listing (presumably
 * iterates over every vCPU).
 */
1033 void cpu_synchronize_all_post_reset(void)
1038 cpu_synchronize_post_reset(cpu
);
1039 /* TODO: move to cpu_synchronize_post_reset() */
1040 if (hvf_enabled()) {
1041 hvf_cpu_synchronize_post_reset(cpu
);
/*
 * Call cpu_synchronize_post_init() on vCPUs, plus the HVF-specific variant
 * when HVF is enabled.
 * NOTE(review): the loop header is not visible in this listing (presumably
 * iterates over every vCPU).
 */
1046 void cpu_synchronize_all_post_init(void)
1051 cpu_synchronize_post_init(cpu
);
1052 /* TODO: move to cpu_synchronize_post_init() */
1053 if (hvf_enabled()) {
1054 hvf_cpu_synchronize_post_init(cpu
);
/*
 * Call cpu_synchronize_pre_loadvm() on vCPUs.
 * NOTE(review): the loop header is not visible in this listing (presumably
 * iterates over every vCPU).
 */
1059 void cpu_synchronize_all_pre_loadvm(void)
1064 cpu_synchronize_pre_loadvm(cpu
);
/*
 * Stop the VM and move it to run state @state.  If the VM is running: ticks
 * are disabled, the run state is set and state-change listeners are notified
 * with 0; a QMP STOP event may be emitted.  Replay events are then disabled
 * and all block devices flushed, the flush result being kept in ret.
 * NOTE(review): the test around qapi_event_send_stop() (presumably
 * @send_stop), other statements in the running branch and the return are not
 * visible in this listing.
 */
1068 static int do_vm_stop(RunState state
, bool send_stop
)
1072 if (runstate_is_running()) {
1073 cpu_disable_ticks();
1075 runstate_set(state
);
1076 vm_state_notify(0, state
);
1078 qapi_event_send_stop();
1083 replay_disable_events();
1084 ret
= bdrv_flush_all();
1089 /* Special vm_stop() variant for terminating the process. Historically clients
1090 * did not expect a QMP STOP event and so we need to retain compatibility.
1092 int vm_shutdown(void)
1094 return do_vm_stop(RUN_STATE_SHUTDOWN
, false);
/*
 * Tell whether @cpu may execute guest code; cpu_is_stopped() is one of the
 * tests.
 * NOTE(review): the other test(s) and the returned values are not visible in
 * this listing.
 */
1097 static bool cpu_can_run(CPUState
*cpu
)
1102 if (cpu_is_stopped(cpu
)) {
1108 static void cpu_handle_guest_debug(CPUState
*cpu
)
1110 gdb_set_stop_cpu(cpu
);
1111 qemu_system_debug_request();
1112 cpu
->stopped
= true;
/*
 * Restore the default SIGBUS disposition and unblock the signal so that it can
 * be delivered with default handling; reaching the perror() call means that
 * did not terminate the process.
 * NOTE(review): the declaration of "set", its initialisation, the statement
 * that raises the signal and whatever follows perror() are not visible in
 * this listing.
 * NOTE(review): perror() appends ": <error text>" and a newline on its own, so
 * the trailing "\n" in the message below splits the output over two lines.
 */
1116 static void sigbus_reraise(void)
1119 struct sigaction action
;
1121 memset(&action
, 0, sizeof(action
));
1122 action
.sa_handler
= SIG_DFL
;
1123 if (!sigaction(SIGBUS
, &action
, NULL
)) {
1126 sigaddset(&set
, SIGBUS
);
1127 pthread_sigmask(SIG_UNBLOCK
, &set
, NULL
);
1129 perror("Failed to re-raise SIGBUS!\n");
/*
 * SIGBUS handler installed by qemu_init_sigbus().  Only machine-check codes
 * (BUS_MCEERR_AO / BUS_MCEERR_AR) are forwarded to KVM: through
 * kvm_on_sigbus_vcpu() on one path and kvm_on_sigbus() on the other.
 * @n and @ctx are unused in the visible code.
 * NOTE(review): the test selecting between the two paths (presumably "is
 * there a current vCPU") and the statements run on failure (presumably
 * sigbus_reraise()) are not visible in this listing.
 */
1133 static void sigbus_handler(int n
, siginfo_t
*siginfo
, void *ctx
)
1135 if (siginfo
->si_code
!= BUS_MCEERR_AO
&& siginfo
->si_code
!= BUS_MCEERR_AR
) {
1140 /* Called asynchronously in VCPU thread. */
1141 if (kvm_on_sigbus_vcpu(current_cpu
, siginfo
->si_code
, siginfo
->si_addr
)) {
1145 /* Called synchronously (via signalfd) in main thread. */
1146 if (kvm_on_sigbus(siginfo
->si_code
, siginfo
->si_addr
)) {
1152 static void qemu_init_sigbus(void)
1154 struct sigaction action
;
1156 memset(&action
, 0, sizeof(action
));
1157 action
.sa_flags
= SA_SIGINFO
;
1158 action
.sa_sigaction
= sigbus_handler
;
1159 sigaction(SIGBUS
, &action
, NULL
);
1161 prctl(PR_MCE_KILL
, PR_MCE_KILL_SET
, PR_MCE_KILL_EARLY
, 0, 0);
1163 #else /* !CONFIG_LINUX */
/* Non-Linux build: same entry point; no statements are visible for it here. */
1164 static void qemu_init_sigbus(void)
1167 #endif /* !CONFIG_LINUX */
/* Mutex paired with the vCPU halt conditions and handed to do_run_on_cpu(). */
1169 static QemuMutex qemu_global_mutex
;
/* Identity of the thread that ran qemu_init_cpu_loop(). */
1171 static QemuThread io_thread
;
/* Signalled by vCPU threads whenever cpu->created changes. */
1174 static QemuCond qemu_cpu_cond
;
/* Broadcast by qemu_cpu_stop() once a vCPU has stopped. */
1176 static QemuCond qemu_pause_cond
;
/*
 * Initialise the synchronisation objects used by the vCPU threads (two
 * condition variables and the global mutex) and record the calling thread in
 * io_thread.
 * NOTE(review): one statement before the first qemu_cond_init() is not
 * visible in this listing (presumably qemu_init_sigbus()).
 */
1178 void qemu_init_cpu_loop(void)
1181 qemu_cond_init(&qemu_cpu_cond
);
1182 qemu_cond_init(&qemu_pause_cond
);
1183 qemu_mutex_init(&qemu_global_mutex
);
1185 qemu_thread_get_self(&io_thread
);
1188 void run_on_cpu(CPUState
*cpu
, run_on_cpu_func func
, run_on_cpu_data data
)
1190 do_run_on_cpu(cpu
, func
, data
, &qemu_global_mutex
);
/*
 * Tear down the KVM side of @cpu, reporting an error if kvm_destroy_vcpu()
 * returns a negative value.
 * NOTE(review): one statement after the error report is not visible in this
 * listing (presumably the process exits).
 */
1193 static void qemu_kvm_destroy_vcpu(CPUState
*cpu
)
1195 if (kvm_destroy_vcpu(cpu
) < 0) {
1196 error_report("kvm_destroy_vcpu failed");
1201 static void qemu_tcg_destroy_vcpu(CPUState
*cpu
)
/*
 * Mark @cpu as stopped and wake every waiter on qemu_pause_cond.  Must be
 * called on the vCPU's own thread (asserted).
 * NOTE(review): the statements between the visible ones, including any use of
 * @exit, are not visible in this listing.
 */
1205 static void qemu_cpu_stop(CPUState
*cpu
, bool exit
)
1207 g_assert(qemu_cpu_is_self(cpu
));
1209 cpu
->stopped
= true;
1213 qemu_cond_broadcast(&qemu_pause_cond
);
/*
 * Tail shared by the wait-for-event helpers: clear thread_kicked (with a
 * memory barrier), stop the vCPU via qemu_cpu_stop(cpu, false), and run the
 * work queued for @cpu.
 * NOTE(review): the condition guarding qemu_cpu_stop() (presumably a pending
 * stop request) is not visible in this listing.
 */
1216 static void qemu_wait_io_event_common(CPUState
*cpu
)
1218 atomic_mb_set(&cpu
->thread_kicked
, false);
1220 qemu_cpu_stop(cpu
, false);
1222 process_queued_cpu_work(cpu
);
/*
 * Round-robin TCG wait: while every vCPU thread is idle, stop the kick timer
 * and sleep on the first vCPU's halt condition; afterwards restart the kick
 * timer and run qemu_wait_io_event_common() for vCPUs.
 * NOTE(review): the declaration of cpu and the loop header around the final
 * call are not visible in this listing.
 */
1225 static void qemu_tcg_rr_wait_io_event(void)
1229 while (all_cpu_threads_idle()) {
1230 stop_tcg_kick_timer();
1231 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1234 start_tcg_kick_timer();
1237 qemu_wait_io_event_common(cpu
);
/*
 * Per-vCPU wait: sleep on the vCPU's halt condition while its thread is idle,
 * then run qemu_wait_io_event_common().
 * NOTE(review): the platform guard and the statement inside the
 * "!tcg_enabled()" branch are not visible in this listing.
 */
1241 static void qemu_wait_io_event(CPUState
*cpu
)
1243 while (cpu_thread_is_idle(cpu
)) {
1244 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1248 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1249 if (!tcg_enabled()) {
1253 qemu_wait_io_event_common(cpu
);
/*
 * Thread body for a KVM vCPU; @arg is the CPUState.  Sequence visible below:
 * register with RCU, take the iothread lock, record thread identity, create
 * the KVM vCPU and its signal setup, announce creation on qemu_cpu_cond, then
 * loop running kvm_cpu_exec() when the vCPU can run (handing EXCP_DEBUG to
 * cpu_handle_guest_debug()) and waiting for events, until the vCPU is
 * unplugged and can no longer run.  On exit the vCPU is destroyed, created is
 * cleared and signalled, and lock/RCU registration are released.
 * NOTE(review): declarations, the failure handling after kvm_init_vcpu(), the
 * loop opening and the return are not visible in this listing.
 */
1256 static void *qemu_kvm_cpu_thread_fn(void *arg
)
1258 CPUState
*cpu
= arg
;
1261 rcu_register_thread();
1263 qemu_mutex_lock_iothread();
1264 qemu_thread_get_self(cpu
->thread
);
1265 cpu
->thread_id
= qemu_get_thread_id();
1269 r
= kvm_init_vcpu(cpu
);
1271 error_report("kvm_init_vcpu failed: %s", strerror(-r
));
1275 kvm_init_cpu_signals(cpu
);
1277 /* signal CPU creation */
1278 cpu
->created
= true;
1279 qemu_cond_signal(&qemu_cpu_cond
);
1280 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1283 if (cpu_can_run(cpu
)) {
1284 r
= kvm_cpu_exec(cpu
);
1285 if (r
== EXCP_DEBUG
) {
1286 cpu_handle_guest_debug(cpu
);
1289 qemu_wait_io_event(cpu
);
1290 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1292 qemu_kvm_destroy_vcpu(cpu
);
1293 cpu
->created
= false;
1294 qemu_cond_signal(&qemu_cpu_cond
);
1295 qemu_mutex_unlock_iothread();
1296 rcu_unregister_thread();
/*
 * Thread body for a vCPU that executes no guest code; @arg is the CPUState.
 * One build variant only reports that qtest is unsupported under Windows.
 * The other registers with RCU, records thread identity, announces creation,
 * and then loops: drop the iothread lock, block in sigwait() for SIG_IPI
 * (retrying on EAGAIN/EINTR), re-take the lock and process events, until the
 * vCPU is unplugged.
 * NOTE(review): the preprocessor guards, declarations, outer loop opening,
 * error handling after sigwait() and the return are not visible in this
 * listing.
 */
1300 static void *qemu_dummy_cpu_thread_fn(void *arg
)
1303 error_report("qtest is not supported under Windows");
1306 CPUState
*cpu
= arg
;
1310 rcu_register_thread();
1312 qemu_mutex_lock_iothread();
1313 qemu_thread_get_self(cpu
->thread
);
1314 cpu
->thread_id
= qemu_get_thread_id();
1318 sigemptyset(&waitset
);
1319 sigaddset(&waitset
, SIG_IPI
);
1321 /* signal CPU creation */
1322 cpu
->created
= true;
1323 qemu_cond_signal(&qemu_cpu_cond
);
1324 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1327 qemu_mutex_unlock_iothread();
1330 r
= sigwait(&waitset
, &sig
);
1331 } while (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
));
1336 qemu_mutex_lock_iothread();
1337 qemu_wait_io_event(cpu
);
1338 } while (!cpu
->unplug
);
1340 qemu_mutex_unlock_iothread();
1341 rcu_unregister_thread();
1346 static int64_t tcg_get_icount_limit(void)
1350 if (replay_mode
!= REPLAY_MODE_PLAY
) {
1351 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1353 /* Maintain prior (possibly buggy) behaviour where if no deadline
1354 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1355 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1358 if ((deadline
< 0) || (deadline
> INT32_MAX
)) {
1359 deadline
= INT32_MAX
;
1362 return qemu_icount_round(deadline
);
1364 return replay_get_instructions();
1368 static void handle_icount_deadline(void)
1370 assert(qemu_in_vcpu_thread());
1373 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1375 if (deadline
== 0) {
1376 /* Wake up other AioContexts. */
1377 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
1378 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
1383 static void prepare_icount_for_run(CPUState
*cpu
)
1388 /* These should always be cleared by process_icount_data after
1389 * each vCPU execution. However u16.high can be raised
1390 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1392 g_assert(cpu_neg(cpu
)->icount_decr
.u16
.low
== 0);
1393 g_assert(cpu
->icount_extra
== 0);
1395 cpu
->icount_budget
= tcg_get_icount_limit();
1396 insns_left
= MIN(0xffff, cpu
->icount_budget
);
1397 cpu_neg(cpu
)->icount_decr
.u16
.low
= insns_left
;
1398 cpu
->icount_extra
= cpu
->icount_budget
- insns_left
;
1400 replay_mutex_lock();
1404 static void process_icount_data(CPUState
*cpu
)
1407 /* Account for executed instructions */
1408 cpu_update_icount(cpu
);
1410 /* Reset the counters */
1411 cpu_neg(cpu
)->icount_decr
.u16
.low
= 0;
1412 cpu
->icount_extra
= 0;
1413 cpu
->icount_budget
= 0;
1415 replay_account_executed_instructions();
1417 replay_mutex_unlock();
1422 static int tcg_cpu_exec(CPUState
*cpu
)
1425 #ifdef CONFIG_PROFILER
1429 assert(tcg_enabled());
1430 #ifdef CONFIG_PROFILER
1431 ti
= profile_getclock();
1433 cpu_exec_start(cpu
);
1434 ret
= cpu_exec(cpu
);
1436 #ifdef CONFIG_PROFILER
1437 atomic_set(&tcg_ctx
->prof
.cpu_exec_time
,
1438 tcg_ctx
->prof
.cpu_exec_time
+ profile_getclock() - ti
);
1443 /* Destroy any remaining vCPUs which have been unplugged and have
1446 static void deal_with_unplugged_cpus(void)
1451 if (cpu
->unplug
&& !cpu_can_run(cpu
)) {
1452 qemu_tcg_destroy_vcpu(cpu
);
1453 cpu
->created
= false;
1454 qemu_cond_signal(&qemu_cpu_cond
);
1460 /* Single-threaded TCG
1462 * In the single-threaded case each vCPU is simulated in turn. If
1463 * there is more than a single vCPU we create a simple timer to kick
1464 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1465 * This is done explicitly rather than relying on side-effects
1469 static void *qemu_tcg_rr_cpu_thread_fn(void *arg
)
1471 CPUState
*cpu
= arg
;
1473 assert(tcg_enabled());
1474 rcu_register_thread();
1475 tcg_register_thread();
1477 qemu_mutex_lock_iothread();
1478 qemu_thread_get_self(cpu
->thread
);
1480 cpu
->thread_id
= qemu_get_thread_id();
1481 cpu
->created
= true;
1483 qemu_cond_signal(&qemu_cpu_cond
);
1484 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1486 /* wait for initial kick-off after machine start */
1487 while (first_cpu
->stopped
) {
1488 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1490 /* process any pending work */
1493 qemu_wait_io_event_common(cpu
);
1497 start_tcg_kick_timer();
1501 /* process any pending work */
1502 cpu
->exit_request
= 1;
1505 qemu_mutex_unlock_iothread();
1506 replay_mutex_lock();
1507 qemu_mutex_lock_iothread();
1508 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1509 qemu_account_warp_timer();
1511 /* Run the timers here. This is much more efficient than
1512 * waking up the I/O thread and waiting for completion.
1514 handle_icount_deadline();
1516 replay_mutex_unlock();
1522 while (cpu
&& !cpu
->queued_work_first
&& !cpu
->exit_request
) {
1524 atomic_mb_set(&tcg_current_rr_cpu
, cpu
);
1527 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
,
1528 (cpu
->singlestep_enabled
& SSTEP_NOTIMER
) == 0);
1530 if (cpu_can_run(cpu
)) {
1533 qemu_mutex_unlock_iothread();
1534 prepare_icount_for_run(cpu
);
1536 r
= tcg_cpu_exec(cpu
);
1538 process_icount_data(cpu
);
1539 qemu_mutex_lock_iothread();
1541 if (r
== EXCP_DEBUG
) {
1542 cpu_handle_guest_debug(cpu
);
1544 } else if (r
== EXCP_ATOMIC
) {
1545 qemu_mutex_unlock_iothread();
1546 cpu_exec_step_atomic(cpu
);
1547 qemu_mutex_lock_iothread();
1550 } else if (cpu
->stop
) {
1552 cpu
= CPU_NEXT(cpu
);
1557 cpu
= CPU_NEXT(cpu
);
1558 } /* while (cpu && !cpu->exit_request).. */
1560 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1561 atomic_set(&tcg_current_rr_cpu
, NULL
);
1563 if (cpu
&& cpu
->exit_request
) {
1564 atomic_mb_set(&cpu
->exit_request
, 0);
1567 if (use_icount
&& all_cpu_threads_idle()) {
1569 * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1570 * in the main_loop, wake it up in order to start the warp timer.
1572 qemu_notify_event();
1575 qemu_tcg_rr_wait_io_event();
1576 deal_with_unplugged_cpus();
1579 rcu_unregister_thread();
1583 static void *qemu_hax_cpu_thread_fn(void *arg
)
1585 CPUState
*cpu
= arg
;
1588 rcu_register_thread();
1589 qemu_mutex_lock_iothread();
1590 qemu_thread_get_self(cpu
->thread
);
1592 cpu
->thread_id
= qemu_get_thread_id();
1593 cpu
->created
= true;
1597 qemu_cond_signal(&qemu_cpu_cond
);
1598 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1601 if (cpu_can_run(cpu
)) {
1602 r
= hax_smp_cpu_exec(cpu
);
1603 if (r
== EXCP_DEBUG
) {
1604 cpu_handle_guest_debug(cpu
);
1608 qemu_wait_io_event(cpu
);
1609 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1610 rcu_unregister_thread();
1614 /* The HVF-specific vCPU thread function. This one should only run when the host
1615 * CPU supports the VMX "unrestricted guest" feature. */
1616 static void *qemu_hvf_cpu_thread_fn(void *arg
)
1618 CPUState
*cpu
= arg
;
1622 assert(hvf_enabled());
1624 rcu_register_thread();
1626 qemu_mutex_lock_iothread();
1627 qemu_thread_get_self(cpu
->thread
);
1629 cpu
->thread_id
= qemu_get_thread_id();
1635 /* signal CPU creation */
1636 cpu
->created
= true;
1637 qemu_cond_signal(&qemu_cpu_cond
);
1638 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1641 if (cpu_can_run(cpu
)) {
1642 r
= hvf_vcpu_exec(cpu
);
1643 if (r
== EXCP_DEBUG
) {
1644 cpu_handle_guest_debug(cpu
);
1647 qemu_wait_io_event(cpu
);
1648 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1650 hvf_vcpu_destroy(cpu
);
1651 cpu
->created
= false;
1652 qemu_cond_signal(&qemu_cpu_cond
);
1653 qemu_mutex_unlock_iothread();
1654 rcu_unregister_thread();
1658 static void *qemu_whpx_cpu_thread_fn(void *arg
)
1660 CPUState
*cpu
= arg
;
1663 rcu_register_thread();
1665 qemu_mutex_lock_iothread();
1666 qemu_thread_get_self(cpu
->thread
);
1667 cpu
->thread_id
= qemu_get_thread_id();
1670 r
= whpx_init_vcpu(cpu
);
1672 fprintf(stderr
, "whpx_init_vcpu failed: %s\n", strerror(-r
));
1676 /* signal CPU creation */
1677 cpu
->created
= true;
1678 qemu_cond_signal(&qemu_cpu_cond
);
1679 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1682 if (cpu_can_run(cpu
)) {
1683 r
= whpx_vcpu_exec(cpu
);
1684 if (r
== EXCP_DEBUG
) {
1685 cpu_handle_guest_debug(cpu
);
1688 while (cpu_thread_is_idle(cpu
)) {
1689 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1691 qemu_wait_io_event_common(cpu
);
1692 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1694 whpx_destroy_vcpu(cpu
);
1695 cpu
->created
= false;
1696 qemu_cond_signal(&qemu_cpu_cond
);
1697 qemu_mutex_unlock_iothread();
1698 rcu_unregister_thread();
1703 static void CALLBACK
dummy_apc_func(ULONG_PTR unused
)
1708 /* Multi-threaded TCG
1710 * In the multi-threaded case each vCPU has its own thread. The TLS
1711 * variable current_cpu can be used deep in the code to find the
1712 * current CPUState for a given thread.
1715 static void *qemu_tcg_cpu_thread_fn(void *arg
)
1717 CPUState
*cpu
= arg
;
1719 assert(tcg_enabled());
1720 g_assert(!use_icount
);
1722 rcu_register_thread();
1723 tcg_register_thread();
1725 qemu_mutex_lock_iothread();
1726 qemu_thread_get_self(cpu
->thread
);
1728 cpu
->thread_id
= qemu_get_thread_id();
1729 cpu
->created
= true;
1732 qemu_cond_signal(&qemu_cpu_cond
);
1733 qemu_guest_random_seed_thread_part2(cpu
->random_seed
);
1735 /* process any pending work */
1736 cpu
->exit_request
= 1;
1739 if (cpu_can_run(cpu
)) {
1741 qemu_mutex_unlock_iothread();
1742 r
= tcg_cpu_exec(cpu
);
1743 qemu_mutex_lock_iothread();
1746 cpu_handle_guest_debug(cpu
);
1749 /* during start-up the vCPU is reset and the thread is
1750 * kicked several times. If we don't ensure we go back
1751 * to sleep in the halted state we won't cleanly
1752 * start-up when the vCPU is enabled.
1754 * cpu->halted should ensure we sleep in wait_io_event
1756 g_assert(cpu
->halted
);
1759 qemu_mutex_unlock_iothread();
1760 cpu_exec_step_atomic(cpu
);
1761 qemu_mutex_lock_iothread();
1763 /* Ignore everything else? */
1768 atomic_mb_set(&cpu
->exit_request
, 0);
1769 qemu_wait_io_event(cpu
);
1770 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1772 qemu_tcg_destroy_vcpu(cpu
);
1773 cpu
->created
= false;
1774 qemu_cond_signal(&qemu_cpu_cond
);
1775 qemu_mutex_unlock_iothread();
1776 rcu_unregister_thread();
1780 static void qemu_cpu_kick_thread(CPUState
*cpu
)
1785 if (cpu
->thread_kicked
) {
1788 cpu
->thread_kicked
= true;
1789 err
= pthread_kill(cpu
->thread
->thread
, SIG_IPI
);
1790 if (err
&& err
!= ESRCH
) {
1791 fprintf(stderr
, "qemu:%s: %s", __func__
, strerror(err
));
1795 if (!qemu_cpu_is_self(cpu
)) {
1796 if (whpx_enabled()) {
1797 whpx_vcpu_kick(cpu
);
1798 } else if (!QueueUserAPC(dummy_apc_func
, cpu
->hThread
, 0)) {
1799 fprintf(stderr
, "%s: QueueUserAPC failed with error %lu\n",
1800 __func__
, GetLastError());
1807 void qemu_cpu_kick(CPUState
*cpu
)
1809 qemu_cond_broadcast(cpu
->halt_cond
);
1810 if (tcg_enabled()) {
1812 /* NOP unless doing single-thread RR */
1813 qemu_cpu_kick_rr_cpu();
1815 if (hax_enabled()) {
1817 * FIXME: race condition with the exit_request check in
1820 cpu
->exit_request
= 1;
1822 qemu_cpu_kick_thread(cpu
);
1826 void qemu_cpu_kick_self(void)
1828 assert(current_cpu
);
1829 qemu_cpu_kick_thread(current_cpu
);
1832 bool qemu_cpu_is_self(CPUState
*cpu
)
1834 return qemu_thread_is_self(cpu
->thread
);
1837 bool qemu_in_vcpu_thread(void)
1839 return current_cpu
&& qemu_cpu_is_self(current_cpu
);
1842 static __thread
bool iothread_locked
= false;
1844 bool qemu_mutex_iothread_locked(void)
1846 return iothread_locked
;
1850 * The BQL is taken from so many places that it is worth profiling the
1851 * callers directly, instead of funneling them all through a single function.
1853 void qemu_mutex_lock_iothread_impl(const char *file
, int line
)
1855 QemuMutexLockFunc bql_lock
= atomic_read(&qemu_bql_mutex_lock_func
);
1857 g_assert(!qemu_mutex_iothread_locked());
1858 bql_lock(&qemu_global_mutex
, file
, line
);
1859 iothread_locked
= true;
1862 void qemu_mutex_unlock_iothread(void)
1864 g_assert(qemu_mutex_iothread_locked());
1865 iothread_locked
= false;
1866 qemu_mutex_unlock(&qemu_global_mutex
);
1869 static bool all_vcpus_paused(void)
1874 if (!cpu
->stopped
) {
1882 void pause_all_vcpus(void)
1886 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, false);
1888 if (qemu_cpu_is_self(cpu
)) {
1889 qemu_cpu_stop(cpu
, true);
1896 /* We need to drop the replay_lock so any vCPU threads woken up
1897 * can finish their replay tasks
1899 replay_mutex_unlock();
1901 while (!all_vcpus_paused()) {
1902 qemu_cond_wait(&qemu_pause_cond
, &qemu_global_mutex
);
1908 qemu_mutex_unlock_iothread();
1909 replay_mutex_lock();
1910 qemu_mutex_lock_iothread();
1913 void cpu_resume(CPUState
*cpu
)
1916 cpu
->stopped
= false;
1920 void resume_all_vcpus(void)
1924 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, true);
1930 void cpu_remove_sync(CPUState
*cpu
)
1935 qemu_mutex_unlock_iothread();
1936 qemu_thread_join(cpu
->thread
);
1937 qemu_mutex_lock_iothread();
1940 /* For temporary buffers for forming a name */
1941 #define VCPU_THREAD_NAME_SIZE 16
1943 static void qemu_tcg_init_vcpu(CPUState
*cpu
)
1945 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1946 static QemuCond
*single_tcg_halt_cond
;
1947 static QemuThread
*single_tcg_cpu_thread
;
1948 static int tcg_region_inited
;
1950 assert(tcg_enabled());
1952 * Initialize TCG regions--once. Now is a good time, because:
1953 * (1) TCG's init context, prologue and target globals have been set up.
1954 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1955 * -accel flag is processed, so the check doesn't work then).
1957 if (!tcg_region_inited
) {
1958 tcg_region_inited
= 1;
1962 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread
) {
1963 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1964 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1965 qemu_cond_init(cpu
->halt_cond
);
1967 if (qemu_tcg_mttcg_enabled()) {
1968 /* create a thread per vCPU with TCG (MTTCG) */
1969 parallel_cpus
= true;
1970 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/TCG",
1973 qemu_thread_create(cpu
->thread
, thread_name
, qemu_tcg_cpu_thread_fn
,
1974 cpu
, QEMU_THREAD_JOINABLE
);
1977 /* share a single thread for all cpus with TCG */
1978 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "ALL CPUs/TCG");
1979 qemu_thread_create(cpu
->thread
, thread_name
,
1980 qemu_tcg_rr_cpu_thread_fn
,
1981 cpu
, QEMU_THREAD_JOINABLE
);
1983 single_tcg_halt_cond
= cpu
->halt_cond
;
1984 single_tcg_cpu_thread
= cpu
->thread
;
1987 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1990 /* For non-MTTCG cases we share the thread */
1991 cpu
->thread
= single_tcg_cpu_thread
;
1992 cpu
->halt_cond
= single_tcg_halt_cond
;
1993 cpu
->thread_id
= first_cpu
->thread_id
;
1995 cpu
->created
= true;
1999 static void qemu_hax_start_vcpu(CPUState
*cpu
)
2001 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2003 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2004 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2005 qemu_cond_init(cpu
->halt_cond
);
2007 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HAX",
2009 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hax_cpu_thread_fn
,
2010 cpu
, QEMU_THREAD_JOINABLE
);
2012 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
2016 static void qemu_kvm_start_vcpu(CPUState
*cpu
)
2018 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2020 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2021 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2022 qemu_cond_init(cpu
->halt_cond
);
2023 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/KVM",
2025 qemu_thread_create(cpu
->thread
, thread_name
, qemu_kvm_cpu_thread_fn
,
2026 cpu
, QEMU_THREAD_JOINABLE
);
2029 static void qemu_hvf_start_vcpu(CPUState
*cpu
)
2031 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2033 /* HVF currently does not support TCG, and only runs in
2034 * unrestricted-guest mode. */
2035 assert(hvf_enabled());
2037 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2038 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2039 qemu_cond_init(cpu
->halt_cond
);
2041 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HVF",
2043 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hvf_cpu_thread_fn
,
2044 cpu
, QEMU_THREAD_JOINABLE
);
2047 static void qemu_whpx_start_vcpu(CPUState
*cpu
)
2049 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2051 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2052 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2053 qemu_cond_init(cpu
->halt_cond
);
2054 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/WHPX",
2056 qemu_thread_create(cpu
->thread
, thread_name
, qemu_whpx_cpu_thread_fn
,
2057 cpu
, QEMU_THREAD_JOINABLE
);
2059 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
2063 static void qemu_dummy_start_vcpu(CPUState
*cpu
)
2065 char thread_name
[VCPU_THREAD_NAME_SIZE
];
2067 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
2068 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2069 qemu_cond_init(cpu
->halt_cond
);
2070 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/DUMMY",
2072 qemu_thread_create(cpu
->thread
, thread_name
, qemu_dummy_cpu_thread_fn
, cpu
,
2073 QEMU_THREAD_JOINABLE
);
2076 void qemu_init_vcpu(CPUState
*cpu
)
2078 cpu
->nr_cores
= smp_cores
;
2079 cpu
->nr_threads
= smp_threads
;
2080 cpu
->stopped
= true;
2081 cpu
->random_seed
= qemu_guest_random_seed_thread_part1();
2084 /* If the target cpu hasn't set up any address spaces itself,
2085 * give it the default one.
2088 cpu_address_space_init(cpu
, 0, "cpu-memory", cpu
->memory
);
2091 if (kvm_enabled()) {
2092 qemu_kvm_start_vcpu(cpu
);
2093 } else if (hax_enabled()) {
2094 qemu_hax_start_vcpu(cpu
);
2095 } else if (hvf_enabled()) {
2096 qemu_hvf_start_vcpu(cpu
);
2097 } else if (tcg_enabled()) {
2098 qemu_tcg_init_vcpu(cpu
);
2099 } else if (whpx_enabled()) {
2100 qemu_whpx_start_vcpu(cpu
);
2102 qemu_dummy_start_vcpu(cpu
);
2105 while (!cpu
->created
) {
2106 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
2110 void cpu_stop_current(void)
2113 current_cpu
->stop
= true;
2114 cpu_exit(current_cpu
);
2118 int vm_stop(RunState state
)
2120 if (qemu_in_vcpu_thread()) {
2121 qemu_system_vmstop_request_prepare();
2122 qemu_system_vmstop_request(state
);
2124 * FIXME: should not return to device code in case
2125 * vm_stop() has been requested.
2131 return do_vm_stop(state
, true);
2135 * Prepare for (re)starting the VM.
2136 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2137 * running or in case of an error condition), 0 otherwise.
2139 int vm_prepare_start(void)
2143 qemu_vmstop_requested(&requested
);
2144 if (runstate_is_running() && requested
== RUN_STATE__MAX
) {
2148 /* Ensure that a STOP/RESUME pair of events is emitted if a
2149 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2150 * example, according to documentation is always followed by
2153 if (runstate_is_running()) {
2154 qapi_event_send_stop();
2155 qapi_event_send_resume();
2159 /* We are sending this now, but the CPUs will be resumed shortly later */
2160 qapi_event_send_resume();
2162 replay_enable_events();
2164 runstate_set(RUN_STATE_RUNNING
);
2165 vm_state_notify(1, RUN_STATE_RUNNING
);
2171 if (!vm_prepare_start()) {
2176 /* does a state transition even if the VM is already stopped,
2177 current state is forgotten forever */
2178 int vm_stop_force_state(RunState state
)
2180 if (runstate_is_running()) {
2181 return vm_stop(state
);
2183 runstate_set(state
);
2186 /* Make sure to return an error if the flush in a previous vm_stop()
2188 return bdrv_flush_all();
2192 void list_cpus(const char *optarg
)
2194 /* XXX: implement xxx_cpu_list for targets that still miss it */
2195 #if defined(cpu_list)
2200 void qmp_memsave(int64_t addr
, int64_t size
, const char *filename
,
2201 bool has_cpu
, int64_t cpu_index
, Error
**errp
)
2207 int64_t orig_addr
= addr
, orig_size
= size
;
2213 cpu
= qemu_get_cpu(cpu_index
);
2215 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cpu-index",
2220 f
= fopen(filename
, "wb");
2222 error_setg_file_open(errp
, errno
, filename
);
2230 if (cpu_memory_rw_debug(cpu
, addr
, buf
, l
, 0) != 0) {
2231 error_setg(errp
, "Invalid addr 0x%016" PRIx64
"/size %" PRId64
2232 " specified", orig_addr
, orig_size
);
2235 if (fwrite(buf
, 1, l
, f
) != l
) {
2236 error_setg(errp
, QERR_IO_ERROR
);
2247 void qmp_pmemsave(int64_t addr
, int64_t size
, const char *filename
,
2254 f
= fopen(filename
, "wb");
2256 error_setg_file_open(errp
, errno
, filename
);
2264 cpu_physical_memory_read(addr
, buf
, l
);
2265 if (fwrite(buf
, 1, l
, f
) != l
) {
2266 error_setg(errp
, QERR_IO_ERROR
);
2277 void qmp_inject_nmi(Error
**errp
)
2279 nmi_monitor_handle(monitor_get_cpu_index(), errp
);
2282 void dump_drift_info(void)
2288 qemu_printf("Host - Guest clock %"PRIi64
" ms\n",
2289 (cpu_get_clock() - cpu_get_icount())/SCALE_MS
);
2290 if (icount_align_option
) {
2291 qemu_printf("Max guest delay %"PRIi64
" ms\n",
2292 -max_delay
/ SCALE_MS
);
2293 qemu_printf("Max guest advance %"PRIi64
" ms\n",
2294 max_advance
/ SCALE_MS
);
2296 qemu_printf("Max guest delay NA\n");
2297 qemu_printf("Max guest advance NA\n");