block: Make the block accounting functions operate on BlockAcctStats
[qemu/qmp-unstable.git] / cpus.c
blob0f7d0eaf3674a09d1ab89a1a8f8cfd0649c082e6
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 /* Needed early for CONFIG_BSD etc. */
26 #include "config-host.h"
28 #include "monitor/monitor.h"
29 #include "qapi/qmp/qerror.h"
30 #include "sysemu/sysemu.h"
31 #include "exec/gdbstub.h"
32 #include "sysemu/dma.h"
33 #include "sysemu/kvm.h"
34 #include "qmp-commands.h"
36 #include "qemu/thread.h"
37 #include "sysemu/cpus.h"
38 #include "sysemu/qtest.h"
39 #include "qemu/main-loop.h"
40 #include "qemu/bitmap.h"
41 #include "qemu/seqlock.h"
42 #include "qapi-event.h"
43 #include "hw/nmi.h"
45 #ifndef _WIN32
46 #include "qemu/compatfd.h"
47 #endif
49 #ifdef CONFIG_LINUX
51 #include <sys/prctl.h>
53 #ifndef PR_MCE_KILL
54 #define PR_MCE_KILL 33
55 #endif
57 #ifndef PR_MCE_KILL_SET
58 #define PR_MCE_KILL_SET 1
59 #endif
61 #ifndef PR_MCE_KILL_EARLY
62 #define PR_MCE_KILL_EARLY 1
63 #endif
65 #endif /* CONFIG_LINUX */
67 static CPUState *next_cpu;
68 int64_t max_delay;
69 int64_t max_advance;
71 bool cpu_is_stopped(CPUState *cpu)
73 return cpu->stopped || !runstate_is_running();
76 static bool cpu_thread_is_idle(CPUState *cpu)
78 if (cpu->stop || cpu->queued_work_first) {
79 return false;
81 if (cpu_is_stopped(cpu)) {
82 return true;
84 if (!cpu->halted || cpu_has_work(cpu) ||
85 kvm_halt_in_kernel()) {
86 return false;
88 return true;
91 static bool all_cpu_threads_idle(void)
93 CPUState *cpu;
95 CPU_FOREACH(cpu) {
96 if (!cpu_thread_is_idle(cpu)) {
97 return false;
100 return true;
103 /***********************************************************/
104 /* guest cycle counter */
106 /* Protected by TimersState seqlock */
108 static int64_t vm_clock_warp_start = -1;
109 /* Conversion factor from emulated instructions to virtual clock ticks. */
110 static int icount_time_shift;
111 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
112 #define MAX_ICOUNT_SHIFT 10
114 static QEMUTimer *icount_rt_timer;
115 static QEMUTimer *icount_vm_timer;
116 static QEMUTimer *icount_warp_timer;
118 typedef struct TimersState {
119 /* Protected by BQL. */
120 int64_t cpu_ticks_prev;
121 int64_t cpu_ticks_offset;
123 /* cpu_clock_offset can be read out of BQL, so protect it with
124 * this lock.
126 QemuSeqLock vm_clock_seqlock;
127 int64_t cpu_clock_offset;
128 int32_t cpu_ticks_enabled;
129 int64_t dummy;
131 /* Compensate for varying guest execution speed. */
132 int64_t qemu_icount_bias;
133 /* Only written by TCG thread */
134 int64_t qemu_icount;
135 } TimersState;
137 static TimersState timers_state;
139 /* Return the virtual CPU time, based on the instruction counter. */
140 static int64_t cpu_get_icount_locked(void)
142 int64_t icount;
143 CPUState *cpu = current_cpu;
145 icount = timers_state.qemu_icount;
146 if (cpu) {
147 if (!cpu_can_do_io(cpu)) {
148 fprintf(stderr, "Bad clock read\n");
150 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
152 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
155 int64_t cpu_get_icount(void)
157 int64_t icount;
158 unsigned start;
160 do {
161 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
162 icount = cpu_get_icount_locked();
163 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
165 return icount;
168 int64_t cpu_icount_to_ns(int64_t icount)
170 return icount << icount_time_shift;
173 /* return the host CPU cycle counter and handle stop/restart */
174 /* Caller must hold the BQL */
175 int64_t cpu_get_ticks(void)
177 int64_t ticks;
179 if (use_icount) {
180 return cpu_get_icount();
183 ticks = timers_state.cpu_ticks_offset;
184 if (timers_state.cpu_ticks_enabled) {
185 ticks += cpu_get_real_ticks();
188 if (timers_state.cpu_ticks_prev > ticks) {
189 /* Note: non increasing ticks may happen if the host uses
190 software suspend */
191 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
192 ticks = timers_state.cpu_ticks_prev;
195 timers_state.cpu_ticks_prev = ticks;
196 return ticks;
199 static int64_t cpu_get_clock_locked(void)
201 int64_t ticks;
203 ticks = timers_state.cpu_clock_offset;
204 if (timers_state.cpu_ticks_enabled) {
205 ticks += get_clock();
208 return ticks;
211 /* return the host CPU monotonic timer and handle stop/restart */
212 int64_t cpu_get_clock(void)
214 int64_t ti;
215 unsigned start;
217 do {
218 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
219 ti = cpu_get_clock_locked();
220 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
222 return ti;
225 /* return the offset between the host clock and virtual CPU clock */
226 int64_t cpu_get_clock_offset(void)
228 int64_t ti;
229 unsigned start;
231 do {
232 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
233 ti = timers_state.cpu_clock_offset;
234 if (!timers_state.cpu_ticks_enabled) {
235 ti -= get_clock();
237 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
239 return -ti;
242 /* enable cpu_get_ticks()
243 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
245 void cpu_enable_ticks(void)
247 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
248 seqlock_write_lock(&timers_state.vm_clock_seqlock);
249 if (!timers_state.cpu_ticks_enabled) {
250 timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
251 timers_state.cpu_clock_offset -= get_clock();
252 timers_state.cpu_ticks_enabled = 1;
254 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
257 /* disable cpu_get_ticks() : the clock is stopped. You must not call
258 * cpu_get_ticks() after that.
259 * Caller must hold BQL which server as mutex for vm_clock_seqlock.
261 void cpu_disable_ticks(void)
263 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
264 seqlock_write_lock(&timers_state.vm_clock_seqlock);
265 if (timers_state.cpu_ticks_enabled) {
266 timers_state.cpu_ticks_offset += cpu_get_real_ticks();
267 timers_state.cpu_clock_offset = cpu_get_clock_locked();
268 timers_state.cpu_ticks_enabled = 0;
270 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
273 /* Correlation between real and virtual time is always going to be
274 fairly approximate, so ignore small variation.
275 When the guest is idle real and virtual time will be aligned in
276 the IO wait loop. */
277 #define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
279 static void icount_adjust(void)
281 int64_t cur_time;
282 int64_t cur_icount;
283 int64_t delta;
285 /* Protected by TimersState mutex. */
286 static int64_t last_delta;
288 /* If the VM is not running, then do nothing. */
289 if (!runstate_is_running()) {
290 return;
293 seqlock_write_lock(&timers_state.vm_clock_seqlock);
294 cur_time = cpu_get_clock_locked();
295 cur_icount = cpu_get_icount_locked();
297 delta = cur_icount - cur_time;
298 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
299 if (delta > 0
300 && last_delta + ICOUNT_WOBBLE < delta * 2
301 && icount_time_shift > 0) {
302 /* The guest is getting too far ahead. Slow time down. */
303 icount_time_shift--;
305 if (delta < 0
306 && last_delta - ICOUNT_WOBBLE > delta * 2
307 && icount_time_shift < MAX_ICOUNT_SHIFT) {
308 /* The guest is getting too far behind. Speed time up. */
309 icount_time_shift++;
311 last_delta = delta;
312 timers_state.qemu_icount_bias = cur_icount
313 - (timers_state.qemu_icount << icount_time_shift);
314 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
317 static void icount_adjust_rt(void *opaque)
319 timer_mod(icount_rt_timer,
320 qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
321 icount_adjust();
324 static void icount_adjust_vm(void *opaque)
326 timer_mod(icount_vm_timer,
327 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
328 get_ticks_per_sec() / 10);
329 icount_adjust();
332 static int64_t qemu_icount_round(int64_t count)
334 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
337 static void icount_warp_rt(void *opaque)
339 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
340 * changes from -1 to another value, so the race here is okay.
342 if (atomic_read(&vm_clock_warp_start) == -1) {
343 return;
346 seqlock_write_lock(&timers_state.vm_clock_seqlock);
347 if (runstate_is_running()) {
348 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
349 int64_t warp_delta;
351 warp_delta = clock - vm_clock_warp_start;
352 if (use_icount == 2) {
354 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
355 * far ahead of real time.
357 int64_t cur_time = cpu_get_clock_locked();
358 int64_t cur_icount = cpu_get_icount_locked();
359 int64_t delta = cur_time - cur_icount;
360 warp_delta = MIN(warp_delta, delta);
362 timers_state.qemu_icount_bias += warp_delta;
364 vm_clock_warp_start = -1;
365 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
367 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
368 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
372 void qtest_clock_warp(int64_t dest)
374 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
375 assert(qtest_enabled());
376 while (clock < dest) {
377 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
378 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
379 seqlock_write_lock(&timers_state.vm_clock_seqlock);
380 timers_state.qemu_icount_bias += warp;
381 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
383 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
384 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
386 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
389 void qemu_clock_warp(QEMUClockType type)
391 int64_t clock;
392 int64_t deadline;
395 * There are too many global variables to make the "warp" behavior
396 * applicable to other clocks. But a clock argument removes the
397 * need for if statements all over the place.
399 if (type != QEMU_CLOCK_VIRTUAL || !use_icount) {
400 return;
404 * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
405 * This ensures that the deadline for the timer is computed correctly below.
406 * This also makes sure that the insn counter is synchronized before the
407 * CPU starts running, in case the CPU is woken by an event other than
408 * the earliest QEMU_CLOCK_VIRTUAL timer.
410 icount_warp_rt(NULL);
411 timer_del(icount_warp_timer);
412 if (!all_cpu_threads_idle()) {
413 return;
416 if (qtest_enabled()) {
417 /* When testing, qtest commands advance icount. */
418 return;
421 /* We want to use the earliest deadline from ALL vm_clocks */
422 clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
423 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
424 if (deadline < 0) {
425 return;
428 if (deadline > 0) {
430 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
431 * sleep. Otherwise, the CPU might be waiting for a future timer
432 * interrupt to wake it up, but the interrupt never comes because
433 * the vCPU isn't running any insns and thus doesn't advance the
434 * QEMU_CLOCK_VIRTUAL.
436 * An extreme solution for this problem would be to never let VCPUs
437 * sleep in icount mode if there is a pending QEMU_CLOCK_VIRTUAL
438 * timer; rather time could just advance to the next QEMU_CLOCK_VIRTUAL
439 * event. Instead, we do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL
440 * after some e"real" time, (related to the time left until the next
441 * event) has passed. The QEMU_CLOCK_REALTIME timer will do this.
442 * This avoids that the warps are visible externally; for example,
443 * you will not be sending network packets continuously instead of
444 * every 100ms.
446 seqlock_write_lock(&timers_state.vm_clock_seqlock);
447 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
448 vm_clock_warp_start = clock;
450 seqlock_write_unlock(&timers_state.vm_clock_seqlock);
451 timer_mod_anticipate(icount_warp_timer, clock + deadline);
452 } else if (deadline == 0) {
453 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
457 static bool icount_state_needed(void *opaque)
459 return use_icount;
463 * This is a subsection for icount migration.
465 static const VMStateDescription icount_vmstate_timers = {
466 .name = "timer/icount",
467 .version_id = 1,
468 .minimum_version_id = 1,
469 .fields = (VMStateField[]) {
470 VMSTATE_INT64(qemu_icount_bias, TimersState),
471 VMSTATE_INT64(qemu_icount, TimersState),
472 VMSTATE_END_OF_LIST()
476 static const VMStateDescription vmstate_timers = {
477 .name = "timer",
478 .version_id = 2,
479 .minimum_version_id = 1,
480 .fields = (VMStateField[]) {
481 VMSTATE_INT64(cpu_ticks_offset, TimersState),
482 VMSTATE_INT64(dummy, TimersState),
483 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
484 VMSTATE_END_OF_LIST()
486 .subsections = (VMStateSubsection[]) {
488 .vmsd = &icount_vmstate_timers,
489 .needed = icount_state_needed,
490 }, {
491 /* empty */
496 void configure_icount(QemuOpts *opts, Error **errp)
498 const char *option;
499 char *rem_str = NULL;
501 seqlock_init(&timers_state.vm_clock_seqlock, NULL);
502 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
503 option = qemu_opt_get(opts, "shift");
504 if (!option) {
505 if (qemu_opt_get(opts, "align") != NULL) {
506 error_setg(errp, "Please specify shift option when using align");
508 return;
510 icount_align_option = qemu_opt_get_bool(opts, "align", false);
511 icount_warp_timer = timer_new_ns(QEMU_CLOCK_REALTIME,
512 icount_warp_rt, NULL);
513 if (strcmp(option, "auto") != 0) {
514 errno = 0;
515 icount_time_shift = strtol(option, &rem_str, 0);
516 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
517 error_setg(errp, "icount: Invalid shift value");
519 use_icount = 1;
520 return;
521 } else if (icount_align_option) {
522 error_setg(errp, "shift=auto and align=on are incompatible");
525 use_icount = 2;
527 /* 125MIPS seems a reasonable initial guess at the guest speed.
528 It will be corrected fairly quickly anyway. */
529 icount_time_shift = 3;
531 /* Have both realtime and virtual time triggers for speed adjustment.
532 The realtime trigger catches emulated time passing too slowly,
533 the virtual time trigger catches emulated time passing too fast.
534 Realtime triggers occur even when idle, so use them less frequently
535 than VM triggers. */
536 icount_rt_timer = timer_new_ms(QEMU_CLOCK_REALTIME,
537 icount_adjust_rt, NULL);
538 timer_mod(icount_rt_timer,
539 qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
540 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
541 icount_adjust_vm, NULL);
542 timer_mod(icount_vm_timer,
543 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
544 get_ticks_per_sec() / 10);
547 /***********************************************************/
548 void hw_error(const char *fmt, ...)
550 va_list ap;
551 CPUState *cpu;
553 va_start(ap, fmt);
554 fprintf(stderr, "qemu: hardware error: ");
555 vfprintf(stderr, fmt, ap);
556 fprintf(stderr, "\n");
557 CPU_FOREACH(cpu) {
558 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
559 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
561 va_end(ap);
562 abort();
565 void cpu_synchronize_all_states(void)
567 CPUState *cpu;
569 CPU_FOREACH(cpu) {
570 cpu_synchronize_state(cpu);
574 void cpu_synchronize_all_post_reset(void)
576 CPUState *cpu;
578 CPU_FOREACH(cpu) {
579 cpu_synchronize_post_reset(cpu);
583 void cpu_synchronize_all_post_init(void)
585 CPUState *cpu;
587 CPU_FOREACH(cpu) {
588 cpu_synchronize_post_init(cpu);
592 static int do_vm_stop(RunState state)
594 int ret = 0;
596 if (runstate_is_running()) {
597 cpu_disable_ticks();
598 pause_all_vcpus();
599 runstate_set(state);
600 vm_state_notify(0, state);
601 qapi_event_send_stop(&error_abort);
604 bdrv_drain_all();
605 ret = bdrv_flush_all();
607 return ret;
610 static bool cpu_can_run(CPUState *cpu)
612 if (cpu->stop) {
613 return false;
615 if (cpu_is_stopped(cpu)) {
616 return false;
618 return true;
621 static void cpu_handle_guest_debug(CPUState *cpu)
623 gdb_set_stop_cpu(cpu);
624 qemu_system_debug_request();
625 cpu->stopped = true;
628 static void cpu_signal(int sig)
630 if (current_cpu) {
631 cpu_exit(current_cpu);
633 exit_request = 1;
636 #ifdef CONFIG_LINUX
637 static void sigbus_reraise(void)
639 sigset_t set;
640 struct sigaction action;
642 memset(&action, 0, sizeof(action));
643 action.sa_handler = SIG_DFL;
644 if (!sigaction(SIGBUS, &action, NULL)) {
645 raise(SIGBUS);
646 sigemptyset(&set);
647 sigaddset(&set, SIGBUS);
648 sigprocmask(SIG_UNBLOCK, &set, NULL);
650 perror("Failed to re-raise SIGBUS!\n");
651 abort();
654 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
655 void *ctx)
657 if (kvm_on_sigbus(siginfo->ssi_code,
658 (void *)(intptr_t)siginfo->ssi_addr)) {
659 sigbus_reraise();
663 static void qemu_init_sigbus(void)
665 struct sigaction action;
667 memset(&action, 0, sizeof(action));
668 action.sa_flags = SA_SIGINFO;
669 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
670 sigaction(SIGBUS, &action, NULL);
672 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
675 static void qemu_kvm_eat_signals(CPUState *cpu)
677 struct timespec ts = { 0, 0 };
678 siginfo_t siginfo;
679 sigset_t waitset;
680 sigset_t chkset;
681 int r;
683 sigemptyset(&waitset);
684 sigaddset(&waitset, SIG_IPI);
685 sigaddset(&waitset, SIGBUS);
687 do {
688 r = sigtimedwait(&waitset, &siginfo, &ts);
689 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
690 perror("sigtimedwait");
691 exit(1);
694 switch (r) {
695 case SIGBUS:
696 if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
697 sigbus_reraise();
699 break;
700 default:
701 break;
704 r = sigpending(&chkset);
705 if (r == -1) {
706 perror("sigpending");
707 exit(1);
709 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
712 #else /* !CONFIG_LINUX */
/* Non-Linux build: there is no SIGBUS setup to perform. */
static void qemu_init_sigbus(void)
{
    /* intentionally empty */
}
718 static void qemu_kvm_eat_signals(CPUState *cpu)
721 #endif /* !CONFIG_LINUX */
723 #ifndef _WIN32
/* Signal handler that deliberately does nothing.  @sig is unused. */
static void dummy_signal(int sig)
{
    /* intentionally empty */
}
728 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
730 int r;
731 sigset_t set;
732 struct sigaction sigact;
734 memset(&sigact, 0, sizeof(sigact));
735 sigact.sa_handler = dummy_signal;
736 sigaction(SIG_IPI, &sigact, NULL);
738 pthread_sigmask(SIG_BLOCK, NULL, &set);
739 sigdelset(&set, SIG_IPI);
740 sigdelset(&set, SIGBUS);
741 r = kvm_set_signal_mask(cpu, &set);
742 if (r) {
743 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
744 exit(1);
748 static void qemu_tcg_init_cpu_signals(void)
750 sigset_t set;
751 struct sigaction sigact;
753 memset(&sigact, 0, sizeof(sigact));
754 sigact.sa_handler = cpu_signal;
755 sigaction(SIG_IPI, &sigact, NULL);
757 sigemptyset(&set);
758 sigaddset(&set, SIG_IPI);
759 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
762 #else /* _WIN32 */
763 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
765 abort();
/* Win32 build: no signal setup is performed. */
static void qemu_tcg_init_cpu_signals(void)
{
    /* intentionally empty */
}
771 #endif /* _WIN32 */
773 static QemuMutex qemu_global_mutex;
774 static QemuCond qemu_io_proceeded_cond;
775 static bool iothread_requesting_mutex;
777 static QemuThread io_thread;
779 static QemuThread *tcg_cpu_thread;
780 static QemuCond *tcg_halt_cond;
782 /* cpu creation */
783 static QemuCond qemu_cpu_cond;
784 /* system init */
785 static QemuCond qemu_pause_cond;
786 static QemuCond qemu_work_cond;
788 void qemu_init_cpu_loop(void)
790 qemu_init_sigbus();
791 qemu_cond_init(&qemu_cpu_cond);
792 qemu_cond_init(&qemu_pause_cond);
793 qemu_cond_init(&qemu_work_cond);
794 qemu_cond_init(&qemu_io_proceeded_cond);
795 qemu_mutex_init(&qemu_global_mutex);
797 qemu_thread_get_self(&io_thread);
800 void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
802 struct qemu_work_item wi;
804 if (qemu_cpu_is_self(cpu)) {
805 func(data);
806 return;
809 wi.func = func;
810 wi.data = data;
811 wi.free = false;
812 if (cpu->queued_work_first == NULL) {
813 cpu->queued_work_first = &wi;
814 } else {
815 cpu->queued_work_last->next = &wi;
817 cpu->queued_work_last = &wi;
818 wi.next = NULL;
819 wi.done = false;
821 qemu_cpu_kick(cpu);
822 while (!wi.done) {
823 CPUState *self_cpu = current_cpu;
825 qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
826 current_cpu = self_cpu;
830 void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
832 struct qemu_work_item *wi;
834 if (qemu_cpu_is_self(cpu)) {
835 func(data);
836 return;
839 wi = g_malloc0(sizeof(struct qemu_work_item));
840 wi->func = func;
841 wi->data = data;
842 wi->free = true;
843 if (cpu->queued_work_first == NULL) {
844 cpu->queued_work_first = wi;
845 } else {
846 cpu->queued_work_last->next = wi;
848 cpu->queued_work_last = wi;
849 wi->next = NULL;
850 wi->done = false;
852 qemu_cpu_kick(cpu);
855 static void flush_queued_work(CPUState *cpu)
857 struct qemu_work_item *wi;
859 if (cpu->queued_work_first == NULL) {
860 return;
863 while ((wi = cpu->queued_work_first)) {
864 cpu->queued_work_first = wi->next;
865 wi->func(wi->data);
866 wi->done = true;
867 if (wi->free) {
868 g_free(wi);
871 cpu->queued_work_last = NULL;
872 qemu_cond_broadcast(&qemu_work_cond);
875 static void qemu_wait_io_event_common(CPUState *cpu)
877 if (cpu->stop) {
878 cpu->stop = false;
879 cpu->stopped = true;
880 qemu_cond_signal(&qemu_pause_cond);
882 flush_queued_work(cpu);
883 cpu->thread_kicked = false;
886 static void qemu_tcg_wait_io_event(void)
888 CPUState *cpu;
890 while (all_cpu_threads_idle()) {
891 /* Start accounting real time to the virtual clock if the CPUs
892 are idle. */
893 qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
894 qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
897 while (iothread_requesting_mutex) {
898 qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
901 CPU_FOREACH(cpu) {
902 qemu_wait_io_event_common(cpu);
906 static void qemu_kvm_wait_io_event(CPUState *cpu)
908 while (cpu_thread_is_idle(cpu)) {
909 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
912 qemu_kvm_eat_signals(cpu);
913 qemu_wait_io_event_common(cpu);
916 static void *qemu_kvm_cpu_thread_fn(void *arg)
918 CPUState *cpu = arg;
919 int r;
921 qemu_mutex_lock(&qemu_global_mutex);
922 qemu_thread_get_self(cpu->thread);
923 cpu->thread_id = qemu_get_thread_id();
924 current_cpu = cpu;
926 r = kvm_init_vcpu(cpu);
927 if (r < 0) {
928 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
929 exit(1);
932 qemu_kvm_init_cpu_signals(cpu);
934 /* signal CPU creation */
935 cpu->created = true;
936 qemu_cond_signal(&qemu_cpu_cond);
938 while (1) {
939 if (cpu_can_run(cpu)) {
940 r = kvm_cpu_exec(cpu);
941 if (r == EXCP_DEBUG) {
942 cpu_handle_guest_debug(cpu);
945 qemu_kvm_wait_io_event(cpu);
948 return NULL;
951 static void *qemu_dummy_cpu_thread_fn(void *arg)
953 #ifdef _WIN32
954 fprintf(stderr, "qtest is not supported under Windows\n");
955 exit(1);
956 #else
957 CPUState *cpu = arg;
958 sigset_t waitset;
959 int r;
961 qemu_mutex_lock_iothread();
962 qemu_thread_get_self(cpu->thread);
963 cpu->thread_id = qemu_get_thread_id();
965 sigemptyset(&waitset);
966 sigaddset(&waitset, SIG_IPI);
968 /* signal CPU creation */
969 cpu->created = true;
970 qemu_cond_signal(&qemu_cpu_cond);
972 current_cpu = cpu;
973 while (1) {
974 current_cpu = NULL;
975 qemu_mutex_unlock_iothread();
976 do {
977 int sig;
978 r = sigwait(&waitset, &sig);
979 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
980 if (r == -1) {
981 perror("sigwait");
982 exit(1);
984 qemu_mutex_lock_iothread();
985 current_cpu = cpu;
986 qemu_wait_io_event_common(cpu);
989 return NULL;
990 #endif
993 static void tcg_exec_all(void);
995 static void *qemu_tcg_cpu_thread_fn(void *arg)
997 CPUState *cpu = arg;
999 qemu_tcg_init_cpu_signals();
1000 qemu_thread_get_self(cpu->thread);
1002 qemu_mutex_lock(&qemu_global_mutex);
1003 CPU_FOREACH(cpu) {
1004 cpu->thread_id = qemu_get_thread_id();
1005 cpu->created = true;
1007 qemu_cond_signal(&qemu_cpu_cond);
1009 /* wait for initial kick-off after machine start */
1010 while (QTAILQ_FIRST(&cpus)->stopped) {
1011 qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
1013 /* process any pending work */
1014 CPU_FOREACH(cpu) {
1015 qemu_wait_io_event_common(cpu);
1019 while (1) {
1020 tcg_exec_all();
1022 if (use_icount) {
1023 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1025 if (deadline == 0) {
1026 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1029 qemu_tcg_wait_io_event();
1032 return NULL;
1035 static void qemu_cpu_kick_thread(CPUState *cpu)
1037 #ifndef _WIN32
1038 int err;
1040 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1041 if (err) {
1042 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1043 exit(1);
1045 #else /* _WIN32 */
1046 if (!qemu_cpu_is_self(cpu)) {
1047 CONTEXT tcgContext;
1049 if (SuspendThread(cpu->hThread) == (DWORD)-1) {
1050 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
1051 GetLastError());
1052 exit(1);
1055 /* On multi-core systems, we are not sure that the thread is actually
1056 * suspended until we can get the context.
1058 tcgContext.ContextFlags = CONTEXT_CONTROL;
1059 while (GetThreadContext(cpu->hThread, &tcgContext) != 0) {
1060 continue;
1063 cpu_signal(0);
1065 if (ResumeThread(cpu->hThread) == (DWORD)-1) {
1066 fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
1067 GetLastError());
1068 exit(1);
1071 #endif
1074 void qemu_cpu_kick(CPUState *cpu)
1076 qemu_cond_broadcast(cpu->halt_cond);
1077 if (!tcg_enabled() && !cpu->thread_kicked) {
1078 qemu_cpu_kick_thread(cpu);
1079 cpu->thread_kicked = true;
1083 void qemu_cpu_kick_self(void)
1085 #ifndef _WIN32
1086 assert(current_cpu);
1088 if (!current_cpu->thread_kicked) {
1089 qemu_cpu_kick_thread(current_cpu);
1090 current_cpu->thread_kicked = true;
1092 #else
1093 abort();
1094 #endif
1097 bool qemu_cpu_is_self(CPUState *cpu)
1099 return qemu_thread_is_self(cpu->thread);
1102 static bool qemu_in_vcpu_thread(void)
1104 return current_cpu && qemu_cpu_is_self(current_cpu);
1107 void qemu_mutex_lock_iothread(void)
1109 if (!tcg_enabled()) {
1110 qemu_mutex_lock(&qemu_global_mutex);
1111 } else {
1112 iothread_requesting_mutex = true;
1113 if (qemu_mutex_trylock(&qemu_global_mutex)) {
1114 qemu_cpu_kick_thread(first_cpu);
1115 qemu_mutex_lock(&qemu_global_mutex);
1117 iothread_requesting_mutex = false;
1118 qemu_cond_broadcast(&qemu_io_proceeded_cond);
1122 void qemu_mutex_unlock_iothread(void)
1124 qemu_mutex_unlock(&qemu_global_mutex);
1127 static int all_vcpus_paused(void)
1129 CPUState *cpu;
1131 CPU_FOREACH(cpu) {
1132 if (!cpu->stopped) {
1133 return 0;
1137 return 1;
1140 void pause_all_vcpus(void)
1142 CPUState *cpu;
1144 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1145 CPU_FOREACH(cpu) {
1146 cpu->stop = true;
1147 qemu_cpu_kick(cpu);
1150 if (qemu_in_vcpu_thread()) {
1151 cpu_stop_current();
1152 if (!kvm_enabled()) {
1153 CPU_FOREACH(cpu) {
1154 cpu->stop = false;
1155 cpu->stopped = true;
1157 return;
1161 while (!all_vcpus_paused()) {
1162 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1163 CPU_FOREACH(cpu) {
1164 qemu_cpu_kick(cpu);
1169 void cpu_resume(CPUState *cpu)
1171 cpu->stop = false;
1172 cpu->stopped = false;
1173 qemu_cpu_kick(cpu);
1176 void resume_all_vcpus(void)
1178 CPUState *cpu;
1180 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1181 CPU_FOREACH(cpu) {
1182 cpu_resume(cpu);
1186 /* For temporary buffers for forming a name */
1187 #define VCPU_THREAD_NAME_SIZE 16
1189 static void qemu_tcg_init_vcpu(CPUState *cpu)
1191 char thread_name[VCPU_THREAD_NAME_SIZE];
1193 tcg_cpu_address_space_init(cpu, cpu->as);
1195 /* share a single thread for all cpus with TCG */
1196 if (!tcg_cpu_thread) {
1197 cpu->thread = g_malloc0(sizeof(QemuThread));
1198 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1199 qemu_cond_init(cpu->halt_cond);
1200 tcg_halt_cond = cpu->halt_cond;
1201 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1202 cpu->cpu_index);
1203 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1204 cpu, QEMU_THREAD_JOINABLE);
1205 #ifdef _WIN32
1206 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1207 #endif
1208 while (!cpu->created) {
1209 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1211 tcg_cpu_thread = cpu->thread;
1212 } else {
1213 cpu->thread = tcg_cpu_thread;
1214 cpu->halt_cond = tcg_halt_cond;
1218 static void qemu_kvm_start_vcpu(CPUState *cpu)
1220 char thread_name[VCPU_THREAD_NAME_SIZE];
1222 cpu->thread = g_malloc0(sizeof(QemuThread));
1223 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1224 qemu_cond_init(cpu->halt_cond);
1225 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1226 cpu->cpu_index);
1227 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1228 cpu, QEMU_THREAD_JOINABLE);
1229 while (!cpu->created) {
1230 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1234 static void qemu_dummy_start_vcpu(CPUState *cpu)
1236 char thread_name[VCPU_THREAD_NAME_SIZE];
1238 cpu->thread = g_malloc0(sizeof(QemuThread));
1239 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1240 qemu_cond_init(cpu->halt_cond);
1241 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1242 cpu->cpu_index);
1243 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1244 QEMU_THREAD_JOINABLE);
1245 while (!cpu->created) {
1246 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1250 void qemu_init_vcpu(CPUState *cpu)
1252 cpu->nr_cores = smp_cores;
1253 cpu->nr_threads = smp_threads;
1254 cpu->stopped = true;
1255 if (kvm_enabled()) {
1256 qemu_kvm_start_vcpu(cpu);
1257 } else if (tcg_enabled()) {
1258 qemu_tcg_init_vcpu(cpu);
1259 } else {
1260 qemu_dummy_start_vcpu(cpu);
1264 void cpu_stop_current(void)
1266 if (current_cpu) {
1267 current_cpu->stop = false;
1268 current_cpu->stopped = true;
1269 cpu_exit(current_cpu);
1270 qemu_cond_signal(&qemu_pause_cond);
1274 int vm_stop(RunState state)
1276 if (qemu_in_vcpu_thread()) {
1277 qemu_system_vmstop_request_prepare();
1278 qemu_system_vmstop_request(state);
1280 * FIXME: should not return to device code in case
1281 * vm_stop() has been requested.
1283 cpu_stop_current();
1284 return 0;
1287 return do_vm_stop(state);
1290 /* does a state transition even if the VM is already stopped,
1291 current state is forgotten forever */
1292 int vm_stop_force_state(RunState state)
1294 if (runstate_is_running()) {
1295 return vm_stop(state);
1296 } else {
1297 runstate_set(state);
1298 /* Make sure to return an error if the flush in a previous vm_stop()
1299 * failed. */
1300 return bdrv_flush_all();
1304 static int tcg_cpu_exec(CPUArchState *env)
1306 CPUState *cpu = ENV_GET_CPU(env);
1307 int ret;
1308 #ifdef CONFIG_PROFILER
1309 int64_t ti;
1310 #endif
1312 #ifdef CONFIG_PROFILER
1313 ti = profile_getclock();
1314 #endif
1315 if (use_icount) {
1316 int64_t count;
1317 int64_t deadline;
1318 int decr;
1319 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1320 + cpu->icount_extra);
1321 cpu->icount_decr.u16.low = 0;
1322 cpu->icount_extra = 0;
1323 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1325 /* Maintain prior (possibly buggy) behaviour where if no deadline
1326 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1327 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1328 * nanoseconds.
1330 if ((deadline < 0) || (deadline > INT32_MAX)) {
1331 deadline = INT32_MAX;
1334 count = qemu_icount_round(deadline);
1335 timers_state.qemu_icount += count;
1336 decr = (count > 0xffff) ? 0xffff : count;
1337 count -= decr;
1338 cpu->icount_decr.u16.low = decr;
1339 cpu->icount_extra = count;
1341 ret = cpu_exec(env);
1342 #ifdef CONFIG_PROFILER
1343 qemu_time += profile_getclock() - ti;
1344 #endif
1345 if (use_icount) {
1346 /* Fold pending instructions back into the
1347 instruction counter, and clear the interrupt flag. */
1348 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1349 + cpu->icount_extra);
1350 cpu->icount_decr.u32 = 0;
1351 cpu->icount_extra = 0;
1353 return ret;
1356 static void tcg_exec_all(void)
1358 int r;
1360 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1361 qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
1363 if (next_cpu == NULL) {
1364 next_cpu = first_cpu;
1366 for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
1367 CPUState *cpu = next_cpu;
1368 CPUArchState *env = cpu->env_ptr;
1370 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1371 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1373 if (cpu_can_run(cpu)) {
1374 r = tcg_cpu_exec(env);
1375 if (r == EXCP_DEBUG) {
1376 cpu_handle_guest_debug(cpu);
1377 break;
1379 } else if (cpu->stop || cpu->stopped) {
1380 break;
1383 exit_request = 0;
1386 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1388 /* XXX: implement xxx_cpu_list for targets that still miss it */
1389 #if defined(cpu_list)
1390 cpu_list(f, cpu_fprintf);
1391 #endif
1394 CpuInfoList *qmp_query_cpus(Error **errp)
1396 CpuInfoList *head = NULL, *cur_item = NULL;
1397 CPUState *cpu;
1399 CPU_FOREACH(cpu) {
1400 CpuInfoList *info;
1401 #if defined(TARGET_I386)
1402 X86CPU *x86_cpu = X86_CPU(cpu);
1403 CPUX86State *env = &x86_cpu->env;
1404 #elif defined(TARGET_PPC)
1405 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1406 CPUPPCState *env = &ppc_cpu->env;
1407 #elif defined(TARGET_SPARC)
1408 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1409 CPUSPARCState *env = &sparc_cpu->env;
1410 #elif defined(TARGET_MIPS)
1411 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1412 CPUMIPSState *env = &mips_cpu->env;
1413 #elif defined(TARGET_TRICORE)
1414 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1415 CPUTriCoreState *env = &tricore_cpu->env;
1416 #endif
1418 cpu_synchronize_state(cpu);
1420 info = g_malloc0(sizeof(*info));
1421 info->value = g_malloc0(sizeof(*info->value));
1422 info->value->CPU = cpu->cpu_index;
1423 info->value->current = (cpu == first_cpu);
1424 info->value->halted = cpu->halted;
1425 info->value->thread_id = cpu->thread_id;
1426 #if defined(TARGET_I386)
1427 info->value->has_pc = true;
1428 info->value->pc = env->eip + env->segs[R_CS].base;
1429 #elif defined(TARGET_PPC)
1430 info->value->has_nip = true;
1431 info->value->nip = env->nip;
1432 #elif defined(TARGET_SPARC)
1433 info->value->has_pc = true;
1434 info->value->pc = env->pc;
1435 info->value->has_npc = true;
1436 info->value->npc = env->npc;
1437 #elif defined(TARGET_MIPS)
1438 info->value->has_PC = true;
1439 info->value->PC = env->active_tc.PC;
1440 #elif defined(TARGET_TRICORE)
1441 info->value->has_PC = true;
1442 info->value->PC = env->PC;
1443 #endif
1445 /* XXX: waiting for the qapi to support GSList */
1446 if (!cur_item) {
1447 head = cur_item = info;
1448 } else {
1449 cur_item->next = info;
1450 cur_item = info;
1454 return head;
1457 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1458 bool has_cpu, int64_t cpu_index, Error **errp)
1460 FILE *f;
1461 uint32_t l;
1462 CPUState *cpu;
1463 uint8_t buf[1024];
1465 if (!has_cpu) {
1466 cpu_index = 0;
1469 cpu = qemu_get_cpu(cpu_index);
1470 if (cpu == NULL) {
1471 error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1472 "a CPU number");
1473 return;
1476 f = fopen(filename, "wb");
1477 if (!f) {
1478 error_setg_file_open(errp, errno, filename);
1479 return;
1482 while (size != 0) {
1483 l = sizeof(buf);
1484 if (l > size)
1485 l = size;
1486 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1487 error_setg(errp, "Invalid addr 0x%016" PRIx64 "specified", addr);
1488 goto exit;
1490 if (fwrite(buf, 1, l, f) != l) {
1491 error_set(errp, QERR_IO_ERROR);
1492 goto exit;
1494 addr += l;
1495 size -= l;
1498 exit:
1499 fclose(f);
1502 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1503 Error **errp)
1505 FILE *f;
1506 uint32_t l;
1507 uint8_t buf[1024];
1509 f = fopen(filename, "wb");
1510 if (!f) {
1511 error_setg_file_open(errp, errno, filename);
1512 return;
1515 while (size != 0) {
1516 l = sizeof(buf);
1517 if (l > size)
1518 l = size;
1519 cpu_physical_memory_read(addr, buf, l);
1520 if (fwrite(buf, 1, l, f) != l) {
1521 error_set(errp, QERR_IO_ERROR);
1522 goto exit;
1524 addr += l;
1525 size -= l;
1528 exit:
1529 fclose(f);
1532 void qmp_inject_nmi(Error **errp)
1534 #if defined(TARGET_I386)
1535 CPUState *cs;
1537 CPU_FOREACH(cs) {
1538 X86CPU *cpu = X86_CPU(cs);
1540 if (!cpu->apic_state) {
1541 cpu_interrupt(cs, CPU_INTERRUPT_NMI);
1542 } else {
1543 apic_deliver_nmi(cpu->apic_state);
1546 #else
1547 nmi_monitor_handle(monitor_get_cpu_index(), errp);
1548 #endif
1551 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1553 if (!use_icount) {
1554 return;
1557 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
1558 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1559 if (icount_align_option) {
1560 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
1561 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
1562 } else {
1563 cpu_fprintf(f, "Max guest delay NA\n");
1564 cpu_fprintf(f, "Max guest advance NA\n");