block: Add bdrv_aio_cancel_async
[qemu.git] / cpus.c
blob2a0e133d39aef03ba8bab325bde5430e443995f3
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 /* Needed early for CONFIG_BSD etc. */
26 #include "config-host.h"
28 #include "monitor/monitor.h"
29 #include "qapi/qmp/qerror.h"
30 #include "sysemu/sysemu.h"
31 #include "exec/gdbstub.h"
32 #include "sysemu/dma.h"
33 #include "sysemu/kvm.h"
34 #include "qmp-commands.h"
36 #include "qemu/thread.h"
37 #include "sysemu/cpus.h"
38 #include "sysemu/qtest.h"
39 #include "qemu/main-loop.h"
40 #include "qemu/bitmap.h"
41 #include "qemu/seqlock.h"
42 #include "qapi-event.h"
43 #include "hw/nmi.h"
45 #ifndef _WIN32
46 #include "qemu/compatfd.h"
47 #endif
49 #ifdef CONFIG_LINUX
51 #include <sys/prctl.h>
53 #ifndef PR_MCE_KILL
54 #define PR_MCE_KILL 33
55 #endif
57 #ifndef PR_MCE_KILL_SET
58 #define PR_MCE_KILL_SET 1
59 #endif
61 #ifndef PR_MCE_KILL_EARLY
62 #define PR_MCE_KILL_EARLY 1
63 #endif
65 #endif /* CONFIG_LINUX */
67 static CPUState *next_cpu;
68 int64_t max_delay;
69 int64_t max_advance;
71 bool cpu_is_stopped(CPUState *cpu)
73 return cpu->stopped || !runstate_is_running();
76 static bool cpu_thread_is_idle(CPUState *cpu)
78 if (cpu->stop || cpu->queued_work_first) {
79 return false;
81 if (cpu_is_stopped(cpu)) {
82 return true;
84 if (!cpu->halted || cpu_has_work(cpu) ||
85 kvm_halt_in_kernel()) {
86 return false;
88 return true;
91 static bool all_cpu_threads_idle(void)
93 CPUState *cpu;
95 CPU_FOREACH(cpu) {
96 if (!cpu_thread_is_idle(cpu)) {
97 return false;
100 return true;
103 /***********************************************************/
104 /* guest cycle counter */
106 /* Protected by TimersState seqlock */
108 static int64_t vm_clock_warp_start = -1;
109 /* Conversion factor from emulated instructions to virtual clock ticks. */
110 static int icount_time_shift;
111 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
112 #define MAX_ICOUNT_SHIFT 10
114 static QEMUTimer *icount_rt_timer;
115 static QEMUTimer *icount_vm_timer;
116 static QEMUTimer *icount_warp_timer;
118 typedef struct TimersState {
119 /* Protected by BQL. */
120 int64_t cpu_ticks_prev;
121 int64_t cpu_ticks_offset;
123 /* cpu_clock_offset can be read out of BQL, so protect it with
124 * this lock.
126 QemuSeqLock vm_clock_seqlock;
127 int64_t cpu_clock_offset;
128 int32_t cpu_ticks_enabled;
129 int64_t dummy;
131 /* Compensate for varying guest execution speed. */
132 int64_t qemu_icount_bias;
133 /* Only written by TCG thread */
134 int64_t qemu_icount;
135 } TimersState;
137 static TimersState timers_state;
139 /* Return the virtual CPU time, based on the instruction counter. */
140 static int64_t cpu_get_icount_locked(void)
142 int64_t icount;
143 CPUState *cpu = current_cpu;
145 icount = timers_state.qemu_icount;
146 if (cpu) {
147 if (!cpu_can_do_io(cpu)) {
148 fprintf(stderr, "Bad clock read\n");
150 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
152 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
155 int64_t cpu_get_icount(void)
157 int64_t icount;
158 unsigned start;
160 do {
161 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
162 icount = cpu_get_icount_locked();
163 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
165 return icount;
168 int64_t cpu_icount_to_ns(int64_t icount)
170 return icount << icount_time_shift;
173 /* return the host CPU cycle counter and handle stop/restart */
174 /* Caller must hold the BQL */
175 int64_t cpu_get_ticks(void)
177 int64_t ticks;
179 if (use_icount) {
180 return cpu_get_icount();
183 ticks = timers_state.cpu_ticks_offset;
184 if (timers_state.cpu_ticks_enabled) {
185 ticks += cpu_get_real_ticks();
188 if (timers_state.cpu_ticks_prev > ticks) {
189 /* Note: non increasing ticks may happen if the host uses
190 software suspend */
191 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
192 ticks = timers_state.cpu_ticks_prev;
195 timers_state.cpu_ticks_prev = ticks;
196 return ticks;
199 static int64_t cpu_get_clock_locked(void)
201 int64_t ticks;
203 ticks = timers_state.cpu_clock_offset;
204 if (timers_state.cpu_ticks_enabled) {
205 ticks += get_clock();
208 return ticks;
211 /* return the host CPU monotonic timer and handle stop/restart */
212 int64_t cpu_get_clock(void)
214 int64_t ti;
215 unsigned start;
217 do {
218 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
219 ti = cpu_get_clock_locked();
220 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
222 return ti;
225 /* return the offset between the host clock and virtual CPU clock */
226 int64_t cpu_get_clock_offset(void)
228 int64_t ti;
229 unsigned start;
231 do {
232 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
233 ti = timers_state.cpu_clock_offset;
234 if (!timers_state.cpu_ticks_enabled) {
235 ti -= get_clock();
237 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
239 return -ti;
242 /* enable cpu_get_ticks()
243 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
245 void cpu_enable_ticks(void)
247 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
248 seqlock_write_lock(&timers_state.vm_clock_seqlock);
249 if (!timers_state.cpu_ticks_enabled) {
250 timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
251 timers_state.cpu_clock_offset -= get_clock();
252 timers_state.cpu_ticks_enabled = 1;
254 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
257 /* disable cpu_get_ticks() : the clock is stopped. You must not call
258 * cpu_get_ticks() after that.
259 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
261 void cpu_disable_ticks(void)
263 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
264 seqlock_write_lock(&timers_state.vm_clock_seqlock);
265 if (timers_state.cpu_ticks_enabled) {
266 timers_state.cpu_ticks_offset += cpu_get_real_ticks();
267 timers_state.cpu_clock_offset = cpu_get_clock_locked();
268 timers_state.cpu_ticks_enabled = 0;
270 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
273 /* Correlation between real and virtual time is always going to be
274 fairly approximate, so ignore small variation.
275 When the guest is idle real and virtual time will be aligned in
276 the IO wait loop. */
277 #define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
279 static void icount_adjust(void)
281 int64_t cur_time;
282 int64_t cur_icount;
283 int64_t delta;
285 /* Protected by TimersState mutex. */
286 static int64_t last_delta;
288 /* If the VM is not running, then do nothing. */
289 if (!runstate_is_running()) {
290 return;
293 seqlock_write_lock(&timers_state.vm_clock_seqlock);
294 cur_time = cpu_get_clock_locked();
295 cur_icount = cpu_get_icount_locked();
297 delta = cur_icount - cur_time;
298 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
299 if (delta > 0
300 && last_delta + ICOUNT_WOBBLE < delta * 2
301 && icount_time_shift > 0) {
302 /* The guest is getting too far ahead. Slow time down. */
303 icount_time_shift--;
305 if (delta < 0
306 && last_delta - ICOUNT_WOBBLE > delta * 2
307 && icount_time_shift < MAX_ICOUNT_SHIFT) {
308 /* The guest is getting too far behind. Speed time up. */
309 icount_time_shift++;
311 last_delta = delta;
312 timers_state.qemu_icount_bias = cur_icount
313 - (timers_state.qemu_icount << icount_time_shift);
314 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
317 static void icount_adjust_rt(void *opaque)
319 timer_mod(icount_rt_timer,
320 qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
321 icount_adjust();
324 static void icount_adjust_vm(void *opaque)
326 timer_mod(icount_vm_timer,
327 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
328 get_ticks_per_sec() / 10);
329 icount_adjust();
332 static int64_t qemu_icount_round(int64_t count)
334 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
337 static void icount_warp_rt(void *opaque)
339 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
340 * changes from -1 to another value, so the race here is okay.
342 if (atomic_read(&vm_clock_warp_start) == -1) {
343 return;
346 seqlock_write_lock(&timers_state.vm_clock_seqlock);
347 if (runstate_is_running()) {
348 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
349 int64_t warp_delta;
351 warp_delta = clock - vm_clock_warp_start;
352 if (use_icount == 2) {
354 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
355 * far ahead of real time.
357 int64_t cur_time = cpu_get_clock_locked();
358 int64_t cur_icount = cpu_get_icount_locked();
359 int64_t delta = cur_time - cur_icount;
360 warp_delta = MIN(warp_delta, delta);
362 timers_state.qemu_icount_bias += warp_delta;
364 vm_clock_warp_start = -1;
365 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
367 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
368 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
372 void qtest_clock_warp(int64_t dest)
374 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
375 assert(qtest_enabled());
376 while (clock < dest) {
377 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
378 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
379 seqlock_write_lock(&timers_state.vm_clock_seqlock);
380 timers_state.qemu_icount_bias += warp;
381 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
383 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
384 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
386 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
389 void qemu_clock_warp(QEMUClockType type)
391 int64_t clock;
392 int64_t deadline;
395 * There are too many global variables to make the "warp" behavior
396 * applicable to other clocks. But a clock argument removes the
397 * need for if statements all over the place.
399 if (type != QEMU_CLOCK_VIRTUAL || !use_icount) {
400 return;
404 * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
405 * This ensures that the deadline for the timer is computed correctly below.
406 * This also makes sure that the insn counter is synchronized before the
407 * CPU starts running, in case the CPU is woken by an event other than
408 * the earliest QEMU_CLOCK_VIRTUAL timer.
410 icount_warp_rt(NULL);
411 timer_del(icount_warp_timer);
412 if (!all_cpu_threads_idle()) {
413 return;
416 if (qtest_enabled()) {
417 /* When testing, qtest commands advance icount. */
418 return;
421 /* We want to use the earliest deadline from ALL vm_clocks */
422 clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
423 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
424 if (deadline < 0) {
425 return;
428 if (deadline > 0) {
430 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
431 * sleep. Otherwise, the CPU might be waiting for a future timer
432 * interrupt to wake it up, but the interrupt never comes because
433 * the vCPU isn't running any insns and thus doesn't advance the
434 * QEMU_CLOCK_VIRTUAL.
436 * An extreme solution for this problem would be to never let VCPUs
437 * sleep in icount mode if there is a pending QEMU_CLOCK_VIRTUAL
438 * timer; rather time could just advance to the next QEMU_CLOCK_VIRTUAL
439 * event. Instead, we do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL
440 * after some e"real" time, (related to the time left until the next
441 * event) has passed. The QEMU_CLOCK_REALTIME timer will do this.
442 * This avoids that the warps are visible externally; for example,
443 * you will not be sending network packets continuously instead of
444 * every 100ms.
446 seqlock_write_lock(&timers_state.vm_clock_seqlock);
447 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
448 vm_clock_warp_start = clock;
450 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
451 timer_mod_anticipate(icount_warp_timer, clock + deadline);
452 } else if (deadline == 0) {
453 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
457 static bool icount_state_needed(void *opaque)
459 return use_icount;
463 * This is a subsection for icount migration.
465 static const VMStateDescription icount_vmstate_timers = {
466 .name = "timer/icount",
467 .version_id = 1,
468 .minimum_version_id = 1,
469 .fields = (VMStateField[]) {
470 VMSTATE_INT64(qemu_icount_bias, TimersState),
471 VMSTATE_INT64(qemu_icount, TimersState),
472 VMSTATE_END_OF_LIST()
476 static const VMStateDescription vmstate_timers = {
477 .name = "timer",
478 .version_id = 2,
479 .minimum_version_id = 1,
480 .fields = (VMStateField[]) {
481 VMSTATE_INT64(cpu_ticks_offset, TimersState),
482 VMSTATE_INT64(dummy, TimersState),
483 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
484 VMSTATE_END_OF_LIST()
486 .subsections = (VMStateSubsection[]) {
488 .vmsd = &icount_vmstate_timers,
489 .needed = icount_state_needed,
490 }, {
491 /* empty */
496 void cpu_ticks_init(void)
498 seqlock_init(&timers_state.vm_clock_seqlock, NULL);
499 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
502 void configure_icount(QemuOpts *opts, Error **errp)
504 const char *option;
505 char *rem_str = NULL;
507 option = qemu_opt_get(opts, "shift");
508 if (!option) {
509 if (qemu_opt_get(opts, "align") != NULL) {
510 error_setg(errp, "Please specify shift option when using align");
512 return;
514 icount_align_option = qemu_opt_get_bool(opts, "align", false);
515 icount_warp_timer = timer_new_ns(QEMU_CLOCK_REALTIME,
516 icount_warp_rt, NULL);
517 if (strcmp(option, "auto") != 0) {
518 errno = 0;
519 icount_time_shift = strtol(option, &rem_str, 0);
520 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
521 error_setg(errp, "icount: Invalid shift value");
523 use_icount = 1;
524 return;
525 } else if (icount_align_option) {
526 error_setg(errp, "shift=auto and align=on are incompatible");
529 use_icount = 2;
531 /* 125MIPS seems a reasonable initial guess at the guest speed.
532 It will be corrected fairly quickly anyway. */
533 icount_time_shift = 3;
535 /* Have both realtime and virtual time triggers for speed adjustment.
536 The realtime trigger catches emulated time passing too slowly,
537 the virtual time trigger catches emulated time passing too fast.
538 Realtime triggers occur even when idle, so use them less frequently
539 than VM triggers. */
540 icount_rt_timer = timer_new_ms(QEMU_CLOCK_REALTIME,
541 icount_adjust_rt, NULL);
542 timer_mod(icount_rt_timer,
543 qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
544 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
545 icount_adjust_vm, NULL);
546 timer_mod(icount_vm_timer,
547 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
548 get_ticks_per_sec() / 10);
551 /***********************************************************/
552 void hw_error(const char *fmt, ...)
554 va_list ap;
555 CPUState *cpu;
557 va_start(ap, fmt);
558 fprintf(stderr, "qemu: hardware error: ");
559 vfprintf(stderr, fmt, ap);
560 fprintf(stderr, "\n");
561 CPU_FOREACH(cpu) {
562 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
563 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
565 va_end(ap);
566 abort();
569 void cpu_synchronize_all_states(void)
571 CPUState *cpu;
573 CPU_FOREACH(cpu) {
574 cpu_synchronize_state(cpu);
578 void cpu_synchronize_all_post_reset(void)
580 CPUState *cpu;
582 CPU_FOREACH(cpu) {
583 cpu_synchronize_post_reset(cpu);
587 void cpu_synchronize_all_post_init(void)
589 CPUState *cpu;
591 CPU_FOREACH(cpu) {
592 cpu_synchronize_post_init(cpu);
596 static int do_vm_stop(RunState state)
598 int ret = 0;
600 if (runstate_is_running()) {
601 cpu_disable_ticks();
602 pause_all_vcpus();
603 runstate_set(state);
604 vm_state_notify(0, state);
605 qapi_event_send_stop(&error_abort);
608 bdrv_drain_all();
609 ret = bdrv_flush_all();
611 return ret;
614 static bool cpu_can_run(CPUState *cpu)
616 if (cpu->stop) {
617 return false;
619 if (cpu_is_stopped(cpu)) {
620 return false;
622 return true;
625 static void cpu_handle_guest_debug(CPUState *cpu)
627 gdb_set_stop_cpu(cpu);
628 qemu_system_debug_request();
629 cpu->stopped = true;
632 static void cpu_signal(int sig)
634 if (current_cpu) {
635 cpu_exit(current_cpu);
637 exit_request = 1;
640 #ifdef CONFIG_LINUX
641 static void sigbus_reraise(void)
643 sigset_t set;
644 struct sigaction action;
646 memset(&action, 0, sizeof(action));
647 action.sa_handler = SIG_DFL;
648 if (!sigaction(SIGBUS, &action, NULL)) {
649 raise(SIGBUS);
650 sigemptyset(&set);
651 sigaddset(&set, SIGBUS);
652 sigprocmask(SIG_UNBLOCK, &set, NULL);
654 perror("Failed to re-raise SIGBUS!\n");
655 abort();
658 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
659 void *ctx)
661 if (kvm_on_sigbus(siginfo->ssi_code,
662 (void *)(intptr_t)siginfo->ssi_addr)) {
663 sigbus_reraise();
667 static void qemu_init_sigbus(void)
669 struct sigaction action;
671 memset(&action, 0, sizeof(action));
672 action.sa_flags = SA_SIGINFO;
673 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
674 sigaction(SIGBUS, &action, NULL);
676 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
679 static void qemu_kvm_eat_signals(CPUState *cpu)
681 struct timespec ts = { 0, 0 };
682 siginfo_t siginfo;
683 sigset_t waitset;
684 sigset_t chkset;
685 int r;
687 sigemptyset(&waitset);
688 sigaddset(&waitset, SIG_IPI);
689 sigaddset(&waitset, SIGBUS);
691 do {
692 r = sigtimedwait(&waitset, &siginfo, &ts);
693 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
694 perror("sigtimedwait");
695 exit(1);
698 switch (r) {
699 case SIGBUS:
700 if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
701 sigbus_reraise();
703 break;
704 default:
705 break;
708 r = sigpending(&chkset);
709 if (r == -1) {
710 perror("sigpending");
711 exit(1);
713 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
716 #else /* !CONFIG_LINUX */
718 static void qemu_init_sigbus(void)
722 static void qemu_kvm_eat_signals(CPUState *cpu)
725 #endif /* !CONFIG_LINUX */
727 #ifndef _WIN32
/* Empty signal handler; installed for SIG_IPI by
 * qemu_kvm_init_cpu_signals().  @sig is unused.
 */
static void dummy_signal(int sig)
{
}
732 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
734 int r;
735 sigset_t set;
736 struct sigaction sigact;
738 memset(&sigact, 0, sizeof(sigact));
739 sigact.sa_handler = dummy_signal;
740 sigaction(SIG_IPI, &sigact, NULL);
742 pthread_sigmask(SIG_BLOCK, NULL, &set);
743 sigdelset(&set, SIG_IPI);
744 sigdelset(&set, SIGBUS);
745 r = kvm_set_signal_mask(cpu, &set);
746 if (r) {
747 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
748 exit(1);
752 static void qemu_tcg_init_cpu_signals(void)
754 sigset_t set;
755 struct sigaction sigact;
757 memset(&sigact, 0, sizeof(sigact));
758 sigact.sa_handler = cpu_signal;
759 sigaction(SIG_IPI, &sigact, NULL);
761 sigemptyset(&set);
762 sigaddset(&set, SIG_IPI);
763 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
766 #else /* _WIN32 */
767 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
769 abort();
772 static void qemu_tcg_init_cpu_signals(void)
775 #endif /* _WIN32 */
777 static QemuMutex qemu_global_mutex;
778 static QemuCond qemu_io_proceeded_cond;
779 static bool iothread_requesting_mutex;
781 static QemuThread io_thread;
783 static QemuThread *tcg_cpu_thread;
784 static QemuCond *tcg_halt_cond;
786 /* cpu creation */
787 static QemuCond qemu_cpu_cond;
788 /* system init */
789 static QemuCond qemu_pause_cond;
790 static QemuCond qemu_work_cond;
792 void qemu_init_cpu_loop(void)
794 qemu_init_sigbus();
795 qemu_cond_init(&qemu_cpu_cond);
796 qemu_cond_init(&qemu_pause_cond);
797 qemu_cond_init(&qemu_work_cond);
798 qemu_cond_init(&qemu_io_proceeded_cond);
799 qemu_mutex_init(&qemu_global_mutex);
801 qemu_thread_get_self(&io_thread);
804 void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
806 struct qemu_work_item wi;
808 if (qemu_cpu_is_self(cpu)) {
809 func(data);
810 return;
813 wi.func = func;
814 wi.data = data;
815 wi.free = false;
816 if (cpu->queued_work_first == NULL) {
817 cpu->queued_work_first = &wi;
818 } else {
819 cpu->queued_work_last->next = &wi;
821 cpu->queued_work_last = &wi;
822 wi.next = NULL;
823 wi.done = false;
825 qemu_cpu_kick(cpu);
826 while (!wi.done) {
827 CPUState *self_cpu = current_cpu;
829 qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
830 current_cpu = self_cpu;
834 void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
836 struct qemu_work_item *wi;
838 if (qemu_cpu_is_self(cpu)) {
839 func(data);
840 return;
843 wi = g_malloc0(sizeof(struct qemu_work_item));
844 wi->func = func;
845 wi->data = data;
846 wi->free = true;
847 if (cpu->queued_work_first == NULL) {
848 cpu->queued_work_first = wi;
849 } else {
850 cpu->queued_work_last->next = wi;
852 cpu->queued_work_last = wi;
853 wi->next = NULL;
854 wi->done = false;
856 qemu_cpu_kick(cpu);
859 static void flush_queued_work(CPUState *cpu)
861 struct qemu_work_item *wi;
863 if (cpu->queued_work_first == NULL) {
864 return;
867 while ((wi = cpu->queued_work_first)) {
868 cpu->queued_work_first = wi->next;
869 wi->func(wi->data);
870 wi->done = true;
871 if (wi->free) {
872 g_free(wi);
875 cpu->queued_work_last = NULL;
876 qemu_cond_broadcast(&qemu_work_cond);
879 static void qemu_wait_io_event_common(CPUState *cpu)
881 if (cpu->stop) {
882 cpu->stop = false;
883 cpu->stopped = true;
884 qemu_cond_signal(&qemu_pause_cond);
886 flush_queued_work(cpu);
887 cpu->thread_kicked = false;
890 static void qemu_tcg_wait_io_event(void)
892 CPUState *cpu;
894 while (all_cpu_threads_idle()) {
895 /* Start accounting real time to the virtual clock if the CPUs
896 are idle. */
897 qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
898 qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
901 while (iothread_requesting_mutex) {
902 qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
905 CPU_FOREACH(cpu) {
906 qemu_wait_io_event_common(cpu);
910 static void qemu_kvm_wait_io_event(CPUState *cpu)
912 while (cpu_thread_is_idle(cpu)) {
913 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
916 qemu_kvm_eat_signals(cpu);
917 qemu_wait_io_event_common(cpu);
920 static void *qemu_kvm_cpu_thread_fn(void *arg)
922 CPUState *cpu = arg;
923 int r;
925 qemu_mutex_lock(&qemu_global_mutex);
926 qemu_thread_get_self(cpu->thread);
927 cpu->thread_id = qemu_get_thread_id();
928 current_cpu = cpu;
930 r = kvm_init_vcpu(cpu);
931 if (r < 0) {
932 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
933 exit(1);
936 qemu_kvm_init_cpu_signals(cpu);
938 /* signal CPU creation */
939 cpu->created = true;
940 qemu_cond_signal(&qemu_cpu_cond);
942 while (1) {
943 if (cpu_can_run(cpu)) {
944 r = kvm_cpu_exec(cpu);
945 if (r == EXCP_DEBUG) {
946 cpu_handle_guest_debug(cpu);
949 qemu_kvm_wait_io_event(cpu);
952 return NULL;
955 static void *qemu_dummy_cpu_thread_fn(void *arg)
957 #ifdef _WIN32
958 fprintf(stderr, "qtest is not supported under Windows\n");
959 exit(1);
960 #else
961 CPUState *cpu = arg;
962 sigset_t waitset;
963 int r;
965 qemu_mutex_lock_iothread();
966 qemu_thread_get_self(cpu->thread);
967 cpu->thread_id = qemu_get_thread_id();
969 sigemptyset(&waitset);
970 sigaddset(&waitset, SIG_IPI);
972 /* signal CPU creation */
973 cpu->created = true;
974 qemu_cond_signal(&qemu_cpu_cond);
976 current_cpu = cpu;
977 while (1) {
978 current_cpu = NULL;
979 qemu_mutex_unlock_iothread();
980 do {
981 int sig;
982 r = sigwait(&waitset, &sig);
983 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
984 if (r == -1) {
985 perror("sigwait");
986 exit(1);
988 qemu_mutex_lock_iothread();
989 current_cpu = cpu;
990 qemu_wait_io_event_common(cpu);
993 return NULL;
994 #endif
997 static void tcg_exec_all(void);
999 static void *qemu_tcg_cpu_thread_fn(void *arg)
1001 CPUState *cpu = arg;
1003 qemu_tcg_init_cpu_signals();
1004 qemu_thread_get_self(cpu->thread);
1006 qemu_mutex_lock(&qemu_global_mutex);
1007 CPU_FOREACH(cpu) {
1008 cpu->thread_id = qemu_get_thread_id();
1009 cpu->created = true;
1011 qemu_cond_signal(&qemu_cpu_cond);
1013 /* wait for initial kick-off after machine start */
1014 while (QTAILQ_FIRST(&cpus)->stopped) {
1015 qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
1017 /* process any pending work */
1018 CPU_FOREACH(cpu) {
1019 qemu_wait_io_event_common(cpu);
1023 while (1) {
1024 tcg_exec_all();
1026 if (use_icount) {
1027 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1029 if (deadline == 0) {
1030 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1033 qemu_tcg_wait_io_event();
1036 return NULL;
1039 static void qemu_cpu_kick_thread(CPUState *cpu)
1041 #ifndef _WIN32
1042 int err;
1044 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1045 if (err) {
1046 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1047 exit(1);
1049 #else /* _WIN32 */
1050 if (!qemu_cpu_is_self(cpu)) {
1051 CONTEXT tcgContext;
1053 if (SuspendThread(cpu->hThread) == (DWORD)-1) {
1054 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
1055 GetLastError());
1056 exit(1);
1059 /* On multi-core systems, we are not sure that the thread is actually
1060 * suspended until we can get the context.
1062 tcgContext.ContextFlags = CONTEXT_CONTROL;
1063 while (GetThreadContext(cpu->hThread, &tcgContext) != 0) {
1064 continue;
1067 cpu_signal(0);
1069 if (ResumeThread(cpu->hThread) == (DWORD)-1) {
1070 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
1071 GetLastError());
1072 exit(1);
1075 #endif
1078 void qemu_cpu_kick(CPUState *cpu)
1080 qemu_cond_broadcast(cpu->halt_cond);
1081 if (!tcg_enabled() && !cpu->thread_kicked) {
1082 qemu_cpu_kick_thread(cpu);
1083 cpu->thread_kicked = true;
1087 void qemu_cpu_kick_self(void)
1089 #ifndef _WIN32
1090 assert(current_cpu);
1092 if (!current_cpu->thread_kicked) {
1093 qemu_cpu_kick_thread(current_cpu);
1094 current_cpu->thread_kicked = true;
1096 #else
1097 abort();
1098 #endif
1101 bool qemu_cpu_is_self(CPUState *cpu)
1103 return qemu_thread_is_self(cpu->thread);
1106 static bool qemu_in_vcpu_thread(void)
1108 return current_cpu && qemu_cpu_is_self(current_cpu);
1111 void qemu_mutex_lock_iothread(void)
1113 if (!tcg_enabled()) {
1114 qemu_mutex_lock(&qemu_global_mutex);
1115 } else {
1116 iothread_requesting_mutex = true;
1117 if (qemu_mutex_trylock(&qemu_global_mutex)) {
1118 qemu_cpu_kick_thread(first_cpu);
1119 qemu_mutex_lock(&qemu_global_mutex);
1121 iothread_requesting_mutex = false;
1122 qemu_cond_broadcast(&qemu_io_proceeded_cond);
1126 void qemu_mutex_unlock_iothread(void)
1128 qemu_mutex_unlock(&qemu_global_mutex);
1131 static int all_vcpus_paused(void)
1133 CPUState *cpu;
1135 CPU_FOREACH(cpu) {
1136 if (!cpu->stopped) {
1137 return 0;
1141 return 1;
1144 void pause_all_vcpus(void)
1146 CPUState *cpu;
1148 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1149 CPU_FOREACH(cpu) {
1150 cpu->stop = true;
1151 qemu_cpu_kick(cpu);
1154 if (qemu_in_vcpu_thread()) {
1155 cpu_stop_current();
1156 if (!kvm_enabled()) {
1157 CPU_FOREACH(cpu) {
1158 cpu->stop = false;
1159 cpu->stopped = true;
1161 return;
1165 while (!all_vcpus_paused()) {
1166 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1167 CPU_FOREACH(cpu) {
1168 qemu_cpu_kick(cpu);
1173 void cpu_resume(CPUState *cpu)
1175 cpu->stop = false;
1176 cpu->stopped = false;
1177 qemu_cpu_kick(cpu);
1180 void resume_all_vcpus(void)
1182 CPUState *cpu;
1184 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1185 CPU_FOREACH(cpu) {
1186 cpu_resume(cpu);
1190 /* For temporary buffers for forming a name */
1191 #define VCPU_THREAD_NAME_SIZE 16
1193 static void qemu_tcg_init_vcpu(CPUState *cpu)
1195 char thread_name[VCPU_THREAD_NAME_SIZE];
1197 tcg_cpu_address_space_init(cpu, cpu->as);
1199 /* share a single thread for all cpus with TCG */
1200 if (!tcg_cpu_thread) {
1201 cpu->thread = g_malloc0(sizeof(QemuThread));
1202 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1203 qemu_cond_init(cpu->halt_cond);
1204 tcg_halt_cond = cpu->halt_cond;
1205 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1206 cpu->cpu_index);
1207 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1208 cpu, QEMU_THREAD_JOINABLE);
1209 #ifdef _WIN32
1210 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1211 #endif
1212 while (!cpu->created) {
1213 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1215 tcg_cpu_thread = cpu->thread;
1216 } else {
1217 cpu->thread = tcg_cpu_thread;
1218 cpu->halt_cond = tcg_halt_cond;
1222 static void qemu_kvm_start_vcpu(CPUState *cpu)
1224 char thread_name[VCPU_THREAD_NAME_SIZE];
1226 cpu->thread = g_malloc0(sizeof(QemuThread));
1227 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1228 qemu_cond_init(cpu->halt_cond);
1229 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1230 cpu->cpu_index);
1231 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1232 cpu, QEMU_THREAD_JOINABLE);
1233 while (!cpu->created) {
1234 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1238 static void qemu_dummy_start_vcpu(CPUState *cpu)
1240 char thread_name[VCPU_THREAD_NAME_SIZE];
1242 cpu->thread = g_malloc0(sizeof(QemuThread));
1243 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1244 qemu_cond_init(cpu->halt_cond);
1245 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1246 cpu->cpu_index);
1247 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1248 QEMU_THREAD_JOINABLE);
1249 while (!cpu->created) {
1250 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1254 void qemu_init_vcpu(CPUState *cpu)
1256 cpu->nr_cores = smp_cores;
1257 cpu->nr_threads = smp_threads;
1258 cpu->stopped = true;
1259 if (kvm_enabled()) {
1260 qemu_kvm_start_vcpu(cpu);
1261 } else if (tcg_enabled()) {
1262 qemu_tcg_init_vcpu(cpu);
1263 } else {
1264 qemu_dummy_start_vcpu(cpu);
1268 void cpu_stop_current(void)
1270 if (current_cpu) {
1271 current_cpu->stop = false;
1272 current_cpu->stopped = true;
1273 cpu_exit(current_cpu);
1274 qemu_cond_signal(&qemu_pause_cond);
1278 int vm_stop(RunState state)
1280 if (qemu_in_vcpu_thread()) {
1281 qemu_system_vmstop_request_prepare();
1282 qemu_system_vmstop_request(state);
1284 * FIXME: should not return to device code in case
1285 * vm_stop() has been requested.
1287 cpu_stop_current();
1288 return 0;
1291 return do_vm_stop(state);
1294 /* does a state transition even if the VM is already stopped,
1295 current state is forgotten forever */
1296 int vm_stop_force_state(RunState state)
1298 if (runstate_is_running()) {
1299 return vm_stop(state);
1300 } else {
1301 runstate_set(state);
1302 /* Make sure to return an error if the flush in a previous vm_stop()
1303 * failed. */
1304 return bdrv_flush_all();
1308 static int tcg_cpu_exec(CPUArchState *env)
1310 CPUState *cpu = ENV_GET_CPU(env);
1311 int ret;
1312 #ifdef CONFIG_PROFILER
1313 int64_t ti;
1314 #endif
1316 #ifdef CONFIG_PROFILER
1317 ti = profile_getclock();
1318 #endif
1319 if (use_icount) {
1320 int64_t count;
1321 int64_t deadline;
1322 int decr;
1323 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1324 + cpu->icount_extra);
1325 cpu->icount_decr.u16.low = 0;
1326 cpu->icount_extra = 0;
1327 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1329 /* Maintain prior (possibly buggy) behaviour where if no deadline
1330 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1331 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1332 * nanoseconds.
1334 if ((deadline < 0) || (deadline > INT32_MAX)) {
1335 deadline = INT32_MAX;
1338 count = qemu_icount_round(deadline);
1339 timers_state.qemu_icount += count;
1340 decr = (count > 0xffff) ? 0xffff : count;
1341 count -= decr;
1342 cpu->icount_decr.u16.low = decr;
1343 cpu->icount_extra = count;
1345 ret = cpu_exec(env);
1346 #ifdef CONFIG_PROFILER
1347 qemu_time += profile_getclock() - ti;
1348 #endif
1349 if (use_icount) {
1350 /* Fold pending instructions back into the
1351 instruction counter, and clear the interrupt flag. */
1352 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1353 + cpu->icount_extra);
1354 cpu->icount_decr.u32 = 0;
1355 cpu->icount_extra = 0;
1357 return ret;
1360 static void tcg_exec_all(void)
1362 int r;
1364 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1365 qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
1367 if (next_cpu == NULL) {
1368 next_cpu = first_cpu;
1370 for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
1371 CPUState *cpu = next_cpu;
1372 CPUArchState *env = cpu->env_ptr;
1374 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1375 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1377 if (cpu_can_run(cpu)) {
1378 r = tcg_cpu_exec(env);
1379 if (r == EXCP_DEBUG) {
1380 cpu_handle_guest_debug(cpu);
1381 break;
1383 } else if (cpu->stop || cpu->stopped) {
1384 break;
1387 exit_request = 0;
1390 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1392 /* XXX: implement xxx_cpu_list for targets that still miss it */
1393 #if defined(cpu_list)
1394 cpu_list(f, cpu_fprintf);
1395 #endif
1398 CpuInfoList *qmp_query_cpus(Error **errp)
1400 CpuInfoList *head = NULL, *cur_item = NULL;
1401 CPUState *cpu;
1403 CPU_FOREACH(cpu) {
1404 CpuInfoList *info;
1405 #if defined(TARGET_I386)
1406 X86CPU *x86_cpu = X86_CPU(cpu);
1407 CPUX86State *env = &x86_cpu->env;
1408 #elif defined(TARGET_PPC)
1409 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1410 CPUPPCState *env = &ppc_cpu->env;
1411 #elif defined(TARGET_SPARC)
1412 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1413 CPUSPARCState *env = &sparc_cpu->env;
1414 #elif defined(TARGET_MIPS)
1415 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1416 CPUMIPSState *env = &mips_cpu->env;
1417 #elif defined(TARGET_TRICORE)
1418 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1419 CPUTriCoreState *env = &tricore_cpu->env;
1420 #endif
1422 cpu_synchronize_state(cpu);
1424 info = g_malloc0(sizeof(*info));
1425 info->value = g_malloc0(sizeof(*info->value));
1426 info->value->CPU = cpu->cpu_index;
1427 info->value->current = (cpu == first_cpu);
1428 info->value->halted = cpu->halted;
1429 info->value->thread_id = cpu->thread_id;
1430 #if defined(TARGET_I386)
1431 info->value->has_pc = true;
1432 info->value->pc = env->eip + env->segs[R_CS].base;
1433 #elif defined(TARGET_PPC)
1434 info->value->has_nip = true;
1435 info->value->nip = env->nip;
1436 #elif defined(TARGET_SPARC)
1437 info->value->has_pc = true;
1438 info->value->pc = env->pc;
1439 info->value->has_npc = true;
1440 info->value->npc = env->npc;
1441 #elif defined(TARGET_MIPS)
1442 info->value->has_PC = true;
1443 info->value->PC = env->active_tc.PC;
1444 #elif defined(TARGET_TRICORE)
1445 info->value->has_PC = true;
1446 info->value->PC = env->PC;
1447 #endif
1449 /* XXX: waiting for the qapi to support GSList */
1450 if (!cur_item) {
1451 head = cur_item = info;
1452 } else {
1453 cur_item->next = info;
1454 cur_item = info;
1458 return head;
1461 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1462 bool has_cpu, int64_t cpu_index, Error **errp)
1464 FILE *f;
1465 uint32_t l;
1466 CPUState *cpu;
1467 uint8_t buf[1024];
1469 if (!has_cpu) {
1470 cpu_index = 0;
1473 cpu = qemu_get_cpu(cpu_index);
1474 if (cpu == NULL) {
1475 error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1476 "a CPU number");
1477 return;
1480 f = fopen(filename, "wb");
1481 if (!f) {
1482 error_setg_file_open(errp, errno, filename);
1483 return;
1486 while (size != 0) {
1487 l = sizeof(buf);
1488 if (l > size)
1489 l = size;
1490 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1491 error_setg(errp, "Invalid addr 0x%016" PRIx64 "specified", addr);
1492 goto exit;
1494 if (fwrite(buf, 1, l, f) != l) {
1495 error_set(errp, QERR_IO_ERROR);
1496 goto exit;
1498 addr += l;
1499 size -= l;
1502 exit:
1503 fclose(f);
1506 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1507 Error **errp)
1509 FILE *f;
1510 uint32_t l;
1511 uint8_t buf[1024];
1513 f = fopen(filename, "wb");
1514 if (!f) {
1515 error_setg_file_open(errp, errno, filename);
1516 return;
1519 while (size != 0) {
1520 l = sizeof(buf);
1521 if (l > size)
1522 l = size;
1523 cpu_physical_memory_read(addr, buf, l);
1524 if (fwrite(buf, 1, l, f) != l) {
1525 error_set(errp, QERR_IO_ERROR);
1526 goto exit;
1528 addr += l;
1529 size -= l;
1532 exit:
1533 fclose(f);
1536 void qmp_inject_nmi(Error **errp)
1538 #if defined(TARGET_I386)
1539 CPUState *cs;
1541 CPU_FOREACH(cs) {
1542 X86CPU *cpu = X86_CPU(cs);
1544 if (!cpu->apic_state) {
1545 cpu_interrupt(cs, CPU_INTERRUPT_NMI);
1546 } else {
1547 apic_deliver_nmi(cpu->apic_state);
1550 #else
1551 nmi_monitor_handle(monitor_get_cpu_index(), errp);
1552 #endif
1555 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1557 if (!use_icount) {
1558 return;
1561 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
1562 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1563 if (icount_align_option) {
1564 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
1565 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
1566 } else {
1567 cpu_fprintf(f, "Max guest delay NA\n");
1568 cpu_fprintf(f, "Max guest advance NA\n");