memory: add MemoryRegionIOMMUOps.replay() callback
[qemu.git] / cpus.c
blob740b8dc3f808b320cce92c434af93f8bc315eb79
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
54 #ifdef CONFIG_LINUX
56 #include <sys/prctl.h>
58 #ifndef PR_MCE_KILL
59 #define PR_MCE_KILL 33
60 #endif
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
64 #endif
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
68 #endif
70 #endif /* CONFIG_LINUX */
72 int64_t max_delay;
73 int64_t max_advance;
75 /* vcpu throttling controls */
76 static QEMUTimer *throttle_timer;
77 static unsigned int throttle_percentage;
79 #define CPU_THROTTLE_PCT_MIN 1
80 #define CPU_THROTTLE_PCT_MAX 99
81 #define CPU_THROTTLE_TIMESLICE_NS 10000000
83 bool cpu_is_stopped(CPUState *cpu)
85 return cpu->stopped || !runstate_is_running();
88 static bool cpu_thread_is_idle(CPUState *cpu)
90 if (cpu->stop || cpu->queued_work_first) {
91 return false;
93 if (cpu_is_stopped(cpu)) {
94 return true;
96 if (!cpu->halted || cpu_has_work(cpu) ||
97 kvm_halt_in_kernel()) {
98 return false;
100 return true;
103 static bool all_cpu_threads_idle(void)
105 CPUState *cpu;
107 CPU_FOREACH(cpu) {
108 if (!cpu_thread_is_idle(cpu)) {
109 return false;
112 return true;
115 /***********************************************************/
116 /* guest cycle counter */
118 /* Protected by TimersState seqlock */
120 static bool icount_sleep = true;
121 static int64_t vm_clock_warp_start = -1;
122 /* Conversion factor from emulated instructions to virtual clock ticks. */
123 static int icount_time_shift;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
127 static QEMUTimer *icount_rt_timer;
128 static QEMUTimer *icount_vm_timer;
129 static QEMUTimer *icount_warp_timer;
/* Global timekeeping state.  Field order matters: cpu_ticks_offset,
 * dummy and cpu_clock_offset are referenced by vmstate_timers below,
 * so do not reorder or remove fields without bumping the vmstate.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;          /* kept for migration compatibility only */

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;
151 bool mttcg_enabled;
154 * We default to false if we know other options have been enabled
155 * which are currently incompatible with MTTCG. Otherwise when each
156 * guest (target) has been updated to support:
157 * - atomic instructions
158 * - memory ordering primitives (barriers)
159 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
161 * Once a guest architecture has been converted to the new primitives
162 * there are two remaining limitations to check.
164 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
165 * - The host must have a stronger memory order than the guest
167 * It may be possible in future to support strong guests on weak hosts
168 * but that will require tagging all load/stores in a guest with their
169 * implicit memory order requirements which would likely slow things
170 * down a lot.
173 static bool check_tcg_memory_orders_compatible(void)
175 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
176 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
177 #else
178 return false;
179 #endif
182 static bool default_mttcg_enabled(void)
184 if (use_icount || TCG_OVERSIZED_GUEST) {
185 return false;
186 } else {
187 #ifdef TARGET_SUPPORTS_MTTCG
188 return check_tcg_memory_orders_compatible();
189 #else
190 return false;
191 #endif
/* Parse the -accel tcg,thread=single|multi option and set mttcg_enabled.
 * Hard incompatibilities (oversized guest, icount) are reported via @errp;
 * softer concerns (unconverted guest, weaker host memory order) are only
 * warned about and MTTCG is still enabled as requested.
 */
void qemu_tcg_configure(QemuOpts *opts, Error **errp)
{
    const char *t = qemu_opt_get(opts, "thread");
    if (t) {
        if (strcmp(t, "multi") == 0) {
            if (TCG_OVERSIZED_GUEST) {
                error_setg(errp, "No MTTCG when guest word size > hosts");
            } else if (use_icount) {
                error_setg(errp, "No MTTCG when icount is enabled");
            } else {
#ifndef TARGET_SUPPORTS_MTTCG
                error_report("Guest not yet converted to MTTCG - "
                             "you may get unexpected results");
#endif
                if (!check_tcg_memory_orders_compatible()) {
                    error_report("Guest expects a stronger memory ordering "
                                 "than the host provides");
                    error_printf("This may cause strange/hard to debug errors\n");
                }
                mttcg_enabled = true;
            }
        } else if (strcmp(t, "single") == 0) {
            mttcg_enabled = false;
        } else {
            error_setg(errp, "Invalid 'thread' setting %s", t);
        }
    } else {
        /* No explicit request: pick a safe default for this target.  */
        mttcg_enabled = default_mttcg_enabled();
    }
}
/* The current number of executed instructions is based on what we
 * originally budgeted minus the current state of the decrementing
 * icount counters in extra/u16.low.
 */
static int64_t cpu_get_icount_executed(CPUState *cpu)
{
    return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
}

/*
 * Update the global shared timer_state.qemu_icount to take into
 * account executed instructions. This is done by the TCG vCPU
 * thread so the main-loop can see time has moved forward.
 */
void cpu_update_icount(CPUState *cpu)
{
    int64_t executed = cpu_get_icount_executed(cpu);
    cpu->icount_budget -= executed;

#ifdef CONFIG_ATOMIC64
    /* 64-bit atomics available: other threads may read concurrently.  */
    atomic_set__nocheck(&timers_state.qemu_icount,
                        atomic_read__nocheck(&timers_state.qemu_icount) +
                        executed);
#else /* FIXME: we need 64bit atomics to do this safely */
    timers_state.qemu_icount += executed;
#endif
}
/* Return the raw instruction counter.  Must only be called from contexts
 * where an icount read is valid (can_do_io set, or no vCPU running);
 * otherwise the count would be non-deterministic and we abort.
 */
int64_t cpu_get_icount_raw(void)
{
    CPUState *cpu = current_cpu;

    if (cpu && cpu->running) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        /* Take into account what has run */
        cpu_update_icount(cpu);
    }
#ifdef CONFIG_ATOMIC64
    return atomic_read__nocheck(&timers_state.qemu_icount);
#else /* FIXME: we need 64bit atomics to do this safely */
    return timers_state.qemu_icount;
#endif
}

/* Return the virtual CPU time, based on the instruction counter.
 * Caller must hold the seqlock (or otherwise serialize against writers).
 */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

/* Lock-free reader variant: retry until a consistent snapshot is seen.  */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

/* Convert an instruction count to nanoseconds using the current shift.  */
int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}
/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 *
 * Caller must hold the BQL
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non increasing ticks may happen if the host uses
           software suspend */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

/* Monotonic VM clock; caller must hold the seqlock or serialize writers.  */
static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}
/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the really thing protected by seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        /* Record the base so subsequent reads measure elapsed time only.  */
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the really thing protected by seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        /* Freeze both clocks at their current values.  */
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop. */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

/* Periodically re-tune icount_time_shift so virtual time tracks real
 * time: shrink the shift when the guest runs ahead, grow it when the
 * guest lags.  The bias is then recomputed so virtual time is continuous.
 */
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    /* Keep virtual time continuous across the shift change.  */
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* Realtime-driven adjustment: fires every second of VIRTUAL_RT time.  */
static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

/* Virtual-time-driven adjustment: fires ten times per virtual second.  */
static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

/* Round a nanosecond deadline up to a whole number of instructions.  */
static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}
/* Account an elapsed "warp" period into qemu_icount_bias so that
 * QEMU_CLOCK_VIRTUAL jumps forward over idle time.  In adaptive icount
 * mode (use_icount == 2) the warp is capped so virtual time never runs
 * ahead of real time.
 */
static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        /* REPLAY_CLOCK keeps the warp deterministic in record/replay mode.  */
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
/* Advance QEMU_CLOCK_VIRTUAL to @dest for qtest, running every timer
 * that fires along the way.  Only valid when qtest is enabled.
 */
void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        /* Warp to the next timer deadline, or straight to dest if sooner.  */
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_end(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}
/* Arrange for QEMU_CLOCK_VIRTUAL to be warped past idle time.  Called
 * when all vCPUs are about to go idle: either advance virtual time
 * immediately (sleep=off) or schedule icount_warp_timer to do it after
 * the corresponding amount of real time has passed.
 */
void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        /* No pending virtual timer at all: warn once if sleeping is off,
         * since the clock would then never advance.
         */
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time, (related to the time left until the next event) has
             * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This avoids that the warps are visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
/* Cancel a pending warp timer and account whatever warp has accumulated.
 * Called when a vCPU becomes runnable again.
 */
static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}
/* Migrate the icount subsection only when icount mode is active.  */
static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

/* Top-level timer migration state; 'dummy' preserves an obsolete field
 * so the wire format stays compatible with older QEMU versions.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
/* Work item run on a vCPU thread: sleep this thread for a slice derived
 * from the throttle percentage, dropping the BQL while asleep so the
 * rest of QEMU keeps running.
 */
static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
{
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    /* ratio of sleep time to run time, e.g. 50% -> sleep == run slice */
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    /* Clear the flag before sleeping so the tick can schedule us again.  */
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}
/* Periodic timer: queue a throttle sleep on every vCPU that does not
 * already have one pending, then re-arm for the next timeslice.
 */
static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        /* atomic_xchg avoids double-queueing the work item.  */
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread,
                             RUN_ON_CPU_NULL);
        }
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    /* Next tick comes after the *run* portion of the timeslice.  */
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
}
705 void cpu_throttle_set(int new_throttle_pct)
707 /* Ensure throttle percentage is within valid range */
708 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
709 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
711 atomic_set(&throttle_percentage, new_throttle_pct);
713 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
714 CPU_THROTTLE_TIMESLICE_NS);
717 void cpu_throttle_stop(void)
719 atomic_set(&throttle_percentage, 0);
722 bool cpu_throttle_active(void)
724 return (cpu_throttle_get_percentage() != 0);
727 int cpu_throttle_get_percentage(void)
729 return atomic_read(&throttle_percentage);
/* One-time timekeeping initialisation: seqlock, migration registration,
 * and the (initially unarmed) throttle timer.
 */
void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}
/* Parse the -icount option group: shift=<n>|auto, sleep=on|off,
 * align=on|off.  Sets use_icount to 1 (fixed shift) or 2 (adaptive),
 * and in adaptive mode starts the periodic speed-adjustment timers.
 */
void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        /* align without shift makes no sense; everything else is a no-op.  */
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        /* Fixed shift mode: parse the numeric shift value.  */
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                        icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
}
800 /***********************************************************/
801 /* TCG vCPU kick timer
803 * The kick timer is responsible for moving single threaded vCPU
804 * emulation on to the next vCPU. If more than one vCPU is running a
805 * timer event with force a cpu->exit so the next vCPU can get
806 * scheduled.
808 * The timer is removed if all vCPUs are idle and restarted again once
809 * idleness is complete.
812 static QEMUTimer *tcg_kick_vcpu_timer;
813 static CPUState *tcg_current_rr_cpu;
815 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
/* Next deadline for the round-robin kick timer.  */
static inline int64_t qemu_tcg_next_kick(void)
{
    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
}

/* Kick the currently round-robin scheduled vCPU */
static void qemu_cpu_kick_rr_cpu(void)
{
    CPUState *cpu;
    /* Loop until the scheduled vCPU is stable: tcg_current_rr_cpu may
     * change under us while we are issuing the cpu_exit.
     */
    do {
        cpu = atomic_mb_read(&tcg_current_rr_cpu);
        if (cpu) {
            cpu_exit(cpu);
        }
    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
}
/* Empty work item; its only purpose is to wake a sleeping vCPU thread.  */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}

/* Clock-notify hook.  Outside icount (or for non-virtual clocks) a plain
 * main-loop wakeup suffices; with icount the TCG thread must be nudged.
 */
void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
{
    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
        qemu_notify_event();
        return;
    }

    if (!qemu_in_vcpu_thread() && first_cpu) {
        /* qemu_cpu_kick is not enough to kick a halted CPU out of
         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
         * causes cpu_thread_is_idle to return false.  This way,
         * handle_icount_deadline can run.
         */
        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
    }
}
/* Timer callback: re-arm and kick the current round-robin vCPU so the
 * single-threaded TCG loop rotates to the next one.
 */
static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_rr_cpu();
}

/* Create and arm the kick timer; only needed in single-threaded TCG
 * with more than one vCPU.
 */
static void start_tcg_kick_timer(void)
{
    if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           kick_tcg_thread, NULL);
        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    }
}

/* Tear down the kick timer when all vCPUs go idle.  */
static void stop_tcg_kick_timer(void)
{
    if (tcg_kick_vcpu_timer) {
        timer_del(tcg_kick_vcpu_timer);
        tcg_kick_vcpu_timer = NULL;
    }
}
878 /***********************************************************/
/* Fatal hardware-emulation error: print the message and the state of
 * every vCPU to stderr, then abort.  Never returns.
 */
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}
896 void cpu_synchronize_all_states(void)
898 CPUState *cpu;
900 CPU_FOREACH(cpu) {
901 cpu_synchronize_state(cpu);
905 void cpu_synchronize_all_post_reset(void)
907 CPUState *cpu;
909 CPU_FOREACH(cpu) {
910 cpu_synchronize_post_reset(cpu);
914 void cpu_synchronize_all_post_init(void)
916 CPUState *cpu;
918 CPU_FOREACH(cpu) {
919 cpu_synchronize_post_init(cpu);
/* Stop the VM, moving it into @state: pause vCPUs, notify listeners,
 * then drain and flush all block devices.  Returns the flush result.
 */
static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    /* Drain/flush even if we were already stopped, so callers can rely
     * on the disks being quiesced on return.
     */
    bdrv_drain_all();
    replay_disable_events();
    ret = bdrv_flush_all();

    return ret;
}
942 static bool cpu_can_run(CPUState *cpu)
944 if (cpu->stop) {
945 return false;
947 if (cpu_is_stopped(cpu)) {
948 return false;
950 return true;
953 static void cpu_handle_guest_debug(CPUState *cpu)
955 gdb_set_stop_cpu(cpu);
956 qemu_system_debug_request();
957 cpu->stopped = true;
#ifdef CONFIG_LINUX
/* Restore the default SIGBUS disposition and re-raise it so the process
 * dies with the original signal (used when we cannot handle an MCE).
 */
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        /* SIGBUS may be blocked in this thread; unblock so it delivers.  */
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

/* SIGBUS handler: forward hardware memory errors (MCE) to KVM; anything
 * else, or a failure to handle them, is fatal via sigbus_reraise().
 */
static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
{
    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
        sigbus_reraise();
    }

    if (current_cpu) {
        /* Called asynchronously in VCPU thread.  */
        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    } else {
        /* Called synchronously (via signalfd) in main thread.  */
        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    }
}

/* Install the SIGBUS handler and ask the kernel for early MCE delivery.  */
static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
#else /* !CONFIG_LINUX */
/* No MCE handling outside Linux.  */
static void qemu_init_sigbus(void)
{
}
#endif /* !CONFIG_LINUX */
1014 static QemuMutex qemu_global_mutex;
1016 static QemuThread io_thread;
1018 /* cpu creation */
1019 static QemuCond qemu_cpu_cond;
1020 /* system init */
1021 static QemuCond qemu_pause_cond;
/* One-time initialisation of the vCPU loop machinery: signal handling,
 * condition variables, the BQL, and recording the I/O thread identity.
 */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

/* Run @func on @cpu's thread, synchronously, dropping the BQL while
 * waiting for completion.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}

/* Destroy a KVM vCPU; failure here is unrecoverable.  */
static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

/* TCG vCPUs need no accelerator-side teardown.  */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
/* Post-wakeup bookkeeping common to all vCPU loops: acknowledge stop
 * requests and drain any queued cross-CPU work.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        /* Wake anyone in pause_all_vcpus() waiting for us to stop.  */
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
}

/* MTTCG threads sleep on their own idleness; the single-threaded loop
 * only sleeps when every vCPU is idle.
 */
static bool qemu_tcg_should_sleep(CPUState *cpu)
{
    if (mttcg_enabled) {
        return cpu_thread_is_idle(cpu);
    } else {
        return all_cpu_threads_idle();
    }
}

/* Sleep a TCG vCPU thread until there is work, pausing the kick timer
 * while asleep so it does not fire pointlessly.
 */
static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (qemu_tcg_should_sleep(cpu)) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    qemu_wait_io_event_common(cpu);
}

/* Sleep a KVM vCPU thread until it has something to do.  */
static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_wait_io_event_common(cpu);
}
/* Thread function for a KVM vCPU: initialise the in-kernel vCPU, signal
 * creation, then loop running guest code until the vCPU is unplugged.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged: tear down the vCPU and announce its destruction.  */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}
/* Thread function for the qtest "dummy" vCPU: never executes guest code,
 * just waits for IPIs and services queued work.  Not available on Windows
 * because it relies on POSIX signal handling (sigwait).
 */
static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        /* Drop the BQL while blocked in sigwait so the rest of QEMU runs.  */
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}
/* Compute the instruction budget for the next TCG execution slice: the
 * number of instructions until the next QEMU_CLOCK_VIRTUAL deadline, or
 * the replay-log budget when replaying.
 */
static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}
/* Run QEMU_CLOCK_VIRTUAL timers whose deadline has arrived; icount only.
 * Must run on a vCPU thread so the icount read is deterministic.
 */
static void handle_icount_deadline(void)
{
    assert(qemu_in_vcpu_thread());
    if (use_icount) {
        int64_t deadline =
            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        if (deadline == 0) {
            /* Wake up other AioContexts.  */
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        }
    }
}

/* Load the per-vCPU icount budget for the next execution slice, split
 * between the 16-bit decrementer (u16.low) and icount_extra overflow.
 */
static void prepare_icount_for_run(CPUState *cpu)
{
    if (use_icount) {
        int insns_left;

        /* These should always be cleared by process_icount_data after
         * each vCPU execution.  However u16.high can be raised
         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
         */
        g_assert(cpu->icount_decr.u16.low == 0);
        g_assert(cpu->icount_extra == 0);

        cpu->icount_budget = tcg_get_icount_limit();
        insns_left = MIN(0xffff, cpu->icount_budget);
        cpu->icount_decr.u16.low = insns_left;
        cpu->icount_extra = cpu->icount_budget - insns_left;
    }
}

/* Fold the just-executed slice into the global icount and clear the
 * per-vCPU counters so the next prepare starts from a clean state.
 */
static void process_icount_data(CPUState *cpu)
{
    if (use_icount) {
        /* Account for executed instructions */
        cpu_update_icount(cpu);

        /* Reset the counters */
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        cpu->icount_budget = 0;

        replay_account_executed_instructions();
    }
}
/* Execute one TCG slice for @cpu and return the cpu_exec() exit code
 * (EXCP_DEBUG, EXCP_HALTED, EXCP_ATOMIC, ...).
 *
 * The BQL is dropped around cpu_exec() so that guest code runs without
 * holding the iothread lock, and re-taken before returning to the caller.
 * With CONFIG_PROFILER the elapsed wall-clock time is accumulated into
 * tcg_time.
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    return ret;
}
1269 /* Destroy any remaining vCPUs which have been unplugged and have
1270 * finished running
1272 static void deal_with_unplugged_cpus(void)
1274 CPUState *cpu;
1276 CPU_FOREACH(cpu) {
1277 if (cpu->unplug && !cpu_can_run(cpu)) {
1278 qemu_tcg_destroy_vcpu(cpu);
1279 cpu->created = false;
1280 qemu_cond_signal(&qemu_cpu_cond);
1281 break;
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    /* This single thread services every vCPU, so publish its thread id
     * and creation state for all of them at once.
     */
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        /* Restart the round-robin from the beginning if the previous
         * iteration ran off the end of the CPU list.
         */
        if (!cpu) {
            cpu = first_cpu;
        }

        /* Round-robin over the vCPUs until one asks to exit or has
         * queued work to process.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    /* Atomic steps are executed with the BQL dropped. */
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past an unplugged vCPU so it is never selected
                 * again; deal_with_unplugged_cpus() destroys it below.
                 */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}
/* Per-vCPU thread body for the HAX accelerator: initialise the vCPU,
 * signal creation, then loop executing guest code and sleeping while
 * idle.  On Windows, SleepEx(0, TRUE) makes the thread alertable so
 * the dummy APC queued by qemu_cpu_kick_thread() can interrupt it.
 */
static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->halted = 0;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
#ifdef _WIN32
        SleepEx(0, TRUE);
#endif
        qemu_wait_io_event_common(cpu);
    }
    return NULL;
}
#ifdef _WIN32
/* Empty APC routine: queued via QueueUserAPC() in qemu_cpu_kick_thread()
 * purely to interrupt an alertable wait (SleepEx) in a vCPU thread.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif
1433 /* Multi-threaded TCG
1435 * In the multi-threaded case each vCPU has its own thread. The TLS
1436 * variable current_cpu can be used deep in the code to find the
1437 * current CPUState for a given thread.
1440 static void *qemu_tcg_cpu_thread_fn(void *arg)
1442 CPUState *cpu = arg;
1444 g_assert(!use_icount);
1446 rcu_register_thread();
1448 qemu_mutex_lock_iothread();
1449 qemu_thread_get_self(cpu->thread);
1451 cpu->thread_id = qemu_get_thread_id();
1452 cpu->created = true;
1453 cpu->can_do_io = 1;
1454 current_cpu = cpu;
1455 qemu_cond_signal(&qemu_cpu_cond);
1457 /* process any pending work */
1458 cpu->exit_request = 1;
1460 while (1) {
1461 if (cpu_can_run(cpu)) {
1462 int r;
1463 r = tcg_cpu_exec(cpu);
1464 switch (r) {
1465 case EXCP_DEBUG:
1466 cpu_handle_guest_debug(cpu);
1467 break;
1468 case EXCP_HALTED:
1469 /* during start-up the vCPU is reset and the thread is
1470 * kicked several times. If we don't ensure we go back
1471 * to sleep in the halted state we won't cleanly
1472 * start-up when the vCPU is enabled.
1474 * cpu->halted should ensure we sleep in wait_io_event
1476 g_assert(cpu->halted);
1477 break;
1478 case EXCP_ATOMIC:
1479 qemu_mutex_unlock_iothread();
1480 cpu_exec_step_atomic(cpu);
1481 qemu_mutex_lock_iothread();
1482 default:
1483 /* Ignore everything else? */
1484 break;
1488 atomic_mb_set(&cpu->exit_request, 0);
1489 qemu_tcg_wait_io_event(cpu);
1492 return NULL;
/* Interrupt a vCPU thread so it re-examines its state.
 *
 * POSIX: send SIG_IPI via pthread_kill(); thread_kicked debounces
 * repeated kicks until the target acknowledges the signal.
 * Windows: queue a no-op APC (dummy_apc_func) to break the target out
 * of an alertable wait; nothing to do when kicking ourselves.
 */
static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
                    __func__, GetLastError());
            exit(1);
        }
    }
#endif
}
/* Wake @cpu so it notices pending work or a state change: broadcast its
 * halt condition, then use the accelerator-appropriate kick mechanism.
 */
void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        cpu_exit(cpu);
        /* NOP unless doing single-thread RR */
        qemu_cpu_kick_rr_cpu();
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}
/* Kick the vCPU thread we are currently running on.  Must be called
 * from a vCPU thread (current_cpu set).
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
/* Return true if the calling thread is the thread that runs @cpu. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
/* Return true if the calling thread is a vCPU thread (i.e. current_cpu
 * is set and belongs to this thread).
 */
bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}
/* Per-thread flag recording whether this thread currently holds the
 * BQL (qemu_global_mutex); read back via qemu_mutex_iothread_locked().
 */
static __thread bool iothread_locked = false;
/* Return true if the calling thread holds the BQL. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
/* Acquire the BQL.  Asserts the caller does not already hold it
 * (the lock is not recursive).  The per-thread flag is set after the
 * lock is taken; safe because the flag is thread-local.
 */
void qemu_mutex_lock_iothread(void)
{
    g_assert(!qemu_mutex_iothread_locked());
    qemu_mutex_lock(&qemu_global_mutex);
    iothread_locked = true;
}
/* Release the BQL.  Asserts the caller holds it.  The per-thread flag
 * is cleared before the mutex is dropped, mirroring the lock path.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1576 static bool all_vcpus_paused(void)
1578 CPUState *cpu;
1580 CPU_FOREACH(cpu) {
1581 if (!cpu->stopped) {
1582 return false;
1586 return true;
/* Request every vCPU to stop and wait until all have actually paused.
 * If called from a vCPU thread, that vCPU stops itself first so the
 * wait below cannot deadlock on it.  vCPUs are re-kicked on every
 * wakeup of qemu_pause_cond in case a kick was missed.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}
/* Clear a single vCPU's stop/stopped state and kick it back to life. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1618 void resume_all_vcpus(void)
1620 CPUState *cpu;
1622 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1623 CPU_FOREACH(cpu) {
1624 cpu_resume(cpu);
/* Asynchronously request removal of @cpu: flag it stopped + unplugged
 * and kick it.  The vCPU thread performs the actual teardown.
 */
void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}
/* Synchronous variant of cpu_remove(): block on qemu_cpu_cond until the
 * vCPU thread has torn the vCPU down (cpu->created goes false).
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
/* For temporary buffers for forming a name (e.g. "CPU 0/KVM");
 * snprintf truncates anything longer.
 */
#define VCPU_THREAD_NAME_SIZE 16
/* Set up the execution thread for a TCG vCPU.
 *
 * MTTCG: every vCPU gets its own thread (qemu_tcg_cpu_thread_fn).
 * Single-threaded RR: the first vCPU creates the shared thread
 * (qemu_tcg_rr_cpu_thread_fn); later vCPUs reuse it via the
 * function-static single_tcg_* variables.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                     cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember the shared thread for subsequent vCPUs. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        /* Wait until the thread has signalled creation of the vCPU. */
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
    }
}
1689 static void qemu_hax_start_vcpu(CPUState *cpu)
1691 char thread_name[VCPU_THREAD_NAME_SIZE];
1693 cpu->thread = g_malloc0(sizeof(QemuThread));
1694 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1695 qemu_cond_init(cpu->halt_cond);
1697 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1698 cpu->cpu_index);
1699 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1700 cpu, QEMU_THREAD_JOINABLE);
1701 #ifdef _WIN32
1702 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1703 #endif
1704 while (!cpu->created) {
1705 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1709 static void qemu_kvm_start_vcpu(CPUState *cpu)
1711 char thread_name[VCPU_THREAD_NAME_SIZE];
1713 cpu->thread = g_malloc0(sizeof(QemuThread));
1714 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1715 qemu_cond_init(cpu->halt_cond);
1716 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1717 cpu->cpu_index);
1718 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1719 cpu, QEMU_THREAD_JOINABLE);
1720 while (!cpu->created) {
1721 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1725 static void qemu_dummy_start_vcpu(CPUState *cpu)
1727 char thread_name[VCPU_THREAD_NAME_SIZE];
1729 cpu->thread = g_malloc0(sizeof(QemuThread));
1730 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1731 qemu_cond_init(cpu->halt_cond);
1732 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1733 cpu->cpu_index);
1734 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1735 QEMU_THREAD_JOINABLE);
1736 while (!cpu->created) {
1737 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
/* Common vCPU bring-up: record topology, give the vCPU a default
 * address space if its target did not set one up, and dispatch to the
 * accelerator-specific start routine.  The vCPU starts in the stopped
 * state and runs once resumed (see vm_start/resume_all_vcpus).
 */
void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}
/* Stop the vCPU this thread is running (no-op outside a vCPU thread):
 * mark it stopped, exit its execution loop, and wake anyone blocked in
 * pause_all_vcpus() on qemu_pause_cond.
 */
void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}
/* Stop the VM, recording @state as the reason.  When called from a vCPU
 * thread the stop is requested asynchronously (and this vCPU halts
 * itself); the main loop completes it via do_vm_stop() later.
 */
int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}
/**
 * Prepare for (re)starting the VM.
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;
    int res = 0;

    qemu_vmstop_requested(&requested);
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        /* Already running and no stop was pending: nothing to do. */
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop(&error_abort);
        res = -1;
    } else {
        replay_enable_events();
        cpu_enable_ticks();
        runstate_set(RUN_STATE_RUNNING);
        vm_state_notify(1, RUN_STATE_RUNNING);
    }

    /* We are sending this now, but the CPUs will be resumed shortly later */
    qapi_event_send_resume(&error_abort);
    return res;
}
/* Start (or restart) the VM: resume the vCPUs only if vm_prepare_start()
 * returned 0 (i.e. the run state transition actually happened).
 */
void vm_start(void)
{
    if (!vm_prepare_start()) {
        resume_all_vcpus();
    }
}
/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}
1852 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1854 /* XXX: implement xxx_cpu_list for targets that still miss it */
1855 #if defined(cpu_list)
1856 cpu_list(f, cpu_fprintf);
1857 #endif
/* QMP 'query-cpus' handler: build a CpuInfoList describing every vCPU.
 * Each entry carries index, halted state, QOM path, thread id, plus an
 * architecture-specific program counter selected by the TARGET_*
 * compile-time defines below.  Caller owns the returned list.
 */
CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        /* Pull the register state out of the accelerator so the PC
         * values below are current.
         */
        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
1925 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1926 bool has_cpu, int64_t cpu_index, Error **errp)
1928 FILE *f;
1929 uint32_t l;
1930 CPUState *cpu;
1931 uint8_t buf[1024];
1932 int64_t orig_addr = addr, orig_size = size;
1934 if (!has_cpu) {
1935 cpu_index = 0;
1938 cpu = qemu_get_cpu(cpu_index);
1939 if (cpu == NULL) {
1940 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1941 "a CPU number");
1942 return;
1945 f = fopen(filename, "wb");
1946 if (!f) {
1947 error_setg_file_open(errp, errno, filename);
1948 return;
1951 while (size != 0) {
1952 l = sizeof(buf);
1953 if (l > size)
1954 l = size;
1955 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1956 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1957 " specified", orig_addr, orig_size);
1958 goto exit;
1960 if (fwrite(buf, 1, l, f) != l) {
1961 error_setg(errp, QERR_IO_ERROR);
1962 goto exit;
1964 addr += l;
1965 size -= l;
1968 exit:
1969 fclose(f);
1972 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1973 Error **errp)
1975 FILE *f;
1976 uint32_t l;
1977 uint8_t buf[1024];
1979 f = fopen(filename, "wb");
1980 if (!f) {
1981 error_setg_file_open(errp, errno, filename);
1982 return;
1985 while (size != 0) {
1986 l = sizeof(buf);
1987 if (l > size)
1988 l = size;
1989 cpu_physical_memory_read(addr, buf, l);
1990 if (fwrite(buf, 1, l, f) != l) {
1991 error_setg(errp, QERR_IO_ERROR);
1992 goto exit;
1994 addr += l;
1995 size -= l;
1998 exit:
1999 fclose(f);
/* QMP 'inject-nmi' handler: deliver an NMI via the monitor's currently
 * selected CPU.
 */
void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}
/* Print host-vs-guest clock drift statistics (icount mode only).
 * max_delay/max_advance are only tracked with -icount align, hence the
 * "NA" branch otherwise.
 */
void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
    if (icount_align_option) {
        cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
        cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
    } else {
        cpu_fprintf(f, "Max guest delay NA\n");
        cpu_fprintf(f, "Max guest advance NA\n");