4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "sysemu/sysemu.h"
35 #include "sysemu/block-backend.h"
36 #include "exec/gdbstub.h"
37 #include "sysemu/dma.h"
38 #include "sysemu/hw_accel.h"
39 #include "sysemu/kvm.h"
40 #include "sysemu/hax.h"
41 #include "sysemu/hvf.h"
42 #include "sysemu/whpx.h"
43 #include "exec/exec-all.h"
45 #include "qemu/thread.h"
46 #include "sysemu/cpus.h"
47 #include "sysemu/qtest.h"
48 #include "qemu/main-loop.h"
49 #include "qemu/option.h"
50 #include "qemu/bitmap.h"
51 #include "qemu/seqlock.h"
54 #include "sysemu/replay.h"
55 #include "hw/boards.h"
59 #include <sys/prctl.h>
62 #define PR_MCE_KILL 33
65 #ifndef PR_MCE_KILL_SET
66 #define PR_MCE_KILL_SET 1
69 #ifndef PR_MCE_KILL_EARLY
70 #define PR_MCE_KILL_EARLY 1
73 #endif /* CONFIG_LINUX */
78 /* vcpu throttling controls */
79 static QEMUTimer
*throttle_timer
;
80 static unsigned int throttle_percentage
;
82 #define CPU_THROTTLE_PCT_MIN 1
83 #define CPU_THROTTLE_PCT_MAX 99
84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86 bool cpu_is_stopped(CPUState
*cpu
)
88 return cpu
->stopped
|| !runstate_is_running();
91 static bool cpu_thread_is_idle(CPUState
*cpu
)
93 if (cpu
->stop
|| cpu
->queued_work_first
) {
96 if (cpu_is_stopped(cpu
)) {
99 if (!cpu
->halted
|| cpu_has_work(cpu
) ||
100 kvm_halt_in_kernel()) {
106 static bool all_cpu_threads_idle(void)
111 if (!cpu_thread_is_idle(cpu
)) {
118 /***********************************************************/
119 /* guest cycle counter */
121 /* Protected by TimersState seqlock */
123 static bool icount_sleep
= true;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
127 typedef struct TimersState
{
128 /* Protected by BQL. */
129 int64_t cpu_ticks_prev
;
130 int64_t cpu_ticks_offset
;
132 /* cpu_clock_offset can be read out of BQL, so protect it with
135 QemuSeqLock vm_clock_seqlock
;
136 int64_t cpu_clock_offset
;
137 int32_t cpu_ticks_enabled
;
139 /* Conversion factor from emulated instructions to virtual clock ticks. */
140 int icount_time_shift
;
141 /* Compensate for varying guest execution speed. */
142 int64_t qemu_icount_bias
;
143 /* Only written by TCG thread */
145 /* for adjusting icount */
146 int64_t vm_clock_warp_start
;
147 QEMUTimer
*icount_rt_timer
;
148 QEMUTimer
*icount_vm_timer
;
149 QEMUTimer
*icount_warp_timer
;
152 static TimersState timers_state
;
156 * We default to false if we know other options have been enabled
157 * which are currently incompatible with MTTCG. Otherwise when each
158 * guest (target) has been updated to support:
159 * - atomic instructions
160 * - memory ordering primitives (barriers)
161 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
163 * Once a guest architecture has been converted to the new primitives
164 * there are two remaining limitations to check.
166 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
167 * - The host must have a stronger memory order than the guest
169 * It may be possible in future to support strong guests on weak hosts
170 * but that will require tagging all load/stores in a guest with their
171 * implicit memory order requirements which would likely slow things
175 static bool check_tcg_memory_orders_compatible(void)
177 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
178 return (TCG_GUEST_DEFAULT_MO
& ~TCG_TARGET_DEFAULT_MO
) == 0;
184 static bool default_mttcg_enabled(void)
186 if (use_icount
|| TCG_OVERSIZED_GUEST
) {
189 #ifdef TARGET_SUPPORTS_MTTCG
190 return check_tcg_memory_orders_compatible();
197 void qemu_tcg_configure(QemuOpts
*opts
, Error
**errp
)
199 const char *t
= qemu_opt_get(opts
, "thread");
201 if (strcmp(t
, "multi") == 0) {
202 if (TCG_OVERSIZED_GUEST
) {
203 error_setg(errp
, "No MTTCG when guest word size > hosts");
204 } else if (use_icount
) {
205 error_setg(errp
, "No MTTCG when icount is enabled");
207 #ifndef TARGET_SUPPORTS_MTTCG
208 error_report("Guest not yet converted to MTTCG - "
209 "you may get unexpected results");
211 if (!check_tcg_memory_orders_compatible()) {
212 error_report("Guest expects a stronger memory ordering "
213 "than the host provides");
214 error_printf("This may cause strange/hard to debug errors\n");
216 mttcg_enabled
= true;
218 } else if (strcmp(t
, "single") == 0) {
219 mttcg_enabled
= false;
221 error_setg(errp
, "Invalid 'thread' setting %s", t
);
224 mttcg_enabled
= default_mttcg_enabled();
228 /* The current number of executed instructions is based on what we
229 * originally budgeted minus the current state of the decrementing
230 * icount counters in extra/u16.low.
232 static int64_t cpu_get_icount_executed(CPUState
*cpu
)
234 return cpu
->icount_budget
- (cpu
->icount_decr
.u16
.low
+ cpu
->icount_extra
);
238 * Update the global shared timer_state.qemu_icount to take into
239 * account executed instructions. This is done by the TCG vCPU
240 * thread so the main-loop can see time has moved forward.
242 void cpu_update_icount(CPUState
*cpu
)
244 int64_t executed
= cpu_get_icount_executed(cpu
);
245 cpu
->icount_budget
-= executed
;
247 #ifdef CONFIG_ATOMIC64
248 atomic_set__nocheck(&timers_state
.qemu_icount
,
249 timers_state
.qemu_icount
+ executed
);
250 #else /* FIXME: we need 64bit atomics to do this safely */
251 timers_state
.qemu_icount
+= executed
;
255 static int64_t cpu_get_icount_raw_locked(void)
257 CPUState
*cpu
= current_cpu
;
259 if (cpu
&& cpu
->running
) {
260 if (!cpu
->can_do_io
) {
261 error_report("Bad icount read");
264 /* Take into account what has run */
265 cpu_update_icount(cpu
);
267 /* The read is protected by the seqlock, so __nocheck is okay. */
268 return atomic_read__nocheck(&timers_state
.qemu_icount
);
271 static int64_t cpu_get_icount_locked(void)
273 int64_t icount
= cpu_get_icount_raw_locked();
274 return atomic_read__nocheck(&timers_state
.qemu_icount_bias
) + cpu_icount_to_ns(icount
);
277 int64_t cpu_get_icount_raw(void)
283 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
284 icount
= cpu_get_icount_raw_locked();
285 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
290 /* Return the virtual CPU time, based on the instruction counter. */
291 int64_t cpu_get_icount(void)
297 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
298 icount
= cpu_get_icount_locked();
299 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
304 int64_t cpu_icount_to_ns(int64_t icount
)
306 return icount
<< atomic_read(&timers_state
.icount_time_shift
);
309 /* return the time elapsed in VM between vm_start and vm_stop. Unless
310 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
313 * Caller must hold the BQL
315 int64_t cpu_get_ticks(void)
320 return cpu_get_icount();
323 ticks
= timers_state
.cpu_ticks_offset
;
324 if (timers_state
.cpu_ticks_enabled
) {
325 ticks
+= cpu_get_host_ticks();
328 if (timers_state
.cpu_ticks_prev
> ticks
) {
329 /* Note: non increasing ticks may happen if the host uses
331 timers_state
.cpu_ticks_offset
+= timers_state
.cpu_ticks_prev
- ticks
;
332 ticks
= timers_state
.cpu_ticks_prev
;
335 timers_state
.cpu_ticks_prev
= ticks
;
339 static int64_t cpu_get_clock_locked(void)
343 time
= timers_state
.cpu_clock_offset
;
344 if (timers_state
.cpu_ticks_enabled
) {
351 /* Return the monotonic time elapsed in VM, i.e.,
352 * the time between vm_start and vm_stop
354 int64_t cpu_get_clock(void)
360 start
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
361 ti
= cpu_get_clock_locked();
362 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, start
));
367 /* enable cpu_get_ticks()
368 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
370 void cpu_enable_ticks(void)
372 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
373 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
374 if (!timers_state
.cpu_ticks_enabled
) {
375 timers_state
.cpu_ticks_offset
-= cpu_get_host_ticks();
376 timers_state
.cpu_clock_offset
-= get_clock();
377 timers_state
.cpu_ticks_enabled
= 1;
379 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
382 /* disable cpu_get_ticks() : the clock is stopped. You must not call
383 * cpu_get_ticks() after that.
384 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
386 void cpu_disable_ticks(void)
388 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
389 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
390 if (timers_state
.cpu_ticks_enabled
) {
391 timers_state
.cpu_ticks_offset
+= cpu_get_host_ticks();
392 timers_state
.cpu_clock_offset
= cpu_get_clock_locked();
393 timers_state
.cpu_ticks_enabled
= 0;
395 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
398 /* Correlation between real and virtual time is always going to be
399 fairly approximate, so ignore small variation.
400 When the guest is idle real and virtual time will be aligned in
402 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
404 static void icount_adjust(void)
410 /* Protected by TimersState mutex. */
411 static int64_t last_delta
;
413 /* If the VM is not running, then do nothing. */
414 if (!runstate_is_running()) {
418 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
419 cur_time
= cpu_get_clock_locked();
420 cur_icount
= cpu_get_icount_locked();
422 delta
= cur_icount
- cur_time
;
423 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
425 && last_delta
+ ICOUNT_WOBBLE
< delta
* 2
426 && timers_state
.icount_time_shift
> 0) {
427 /* The guest is getting too far ahead. Slow time down. */
428 atomic_set(&timers_state
.icount_time_shift
,
429 timers_state
.icount_time_shift
- 1);
432 && last_delta
- ICOUNT_WOBBLE
> delta
* 2
433 && timers_state
.icount_time_shift
< MAX_ICOUNT_SHIFT
) {
434 /* The guest is getting too far behind. Speed time up. */
435 atomic_set(&timers_state
.icount_time_shift
,
436 timers_state
.icount_time_shift
+ 1);
439 atomic_set__nocheck(&timers_state
.qemu_icount_bias
,
440 cur_icount
- (timers_state
.qemu_icount
441 << timers_state
.icount_time_shift
));
442 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
445 static void icount_adjust_rt(void *opaque
)
447 timer_mod(timers_state
.icount_rt_timer
,
448 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
452 static void icount_adjust_vm(void *opaque
)
454 timer_mod(timers_state
.icount_vm_timer
,
455 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
456 NANOSECONDS_PER_SECOND
/ 10);
460 static int64_t qemu_icount_round(int64_t count
)
462 int shift
= atomic_read(&timers_state
.icount_time_shift
);
463 return (count
+ (1 << shift
) - 1) >> shift
;
466 static void icount_warp_rt(void)
471 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
472 * changes from -1 to another value, so the race here is okay.
475 seq
= seqlock_read_begin(&timers_state
.vm_clock_seqlock
);
476 warp_start
= timers_state
.vm_clock_warp_start
;
477 } while (seqlock_read_retry(&timers_state
.vm_clock_seqlock
, seq
));
479 if (warp_start
== -1) {
483 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
484 if (runstate_is_running()) {
485 int64_t clock
= REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT
,
486 cpu_get_clock_locked());
489 warp_delta
= clock
- timers_state
.vm_clock_warp_start
;
490 if (use_icount
== 2) {
492 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
493 * far ahead of real time.
495 int64_t cur_icount
= cpu_get_icount_locked();
496 int64_t delta
= clock
- cur_icount
;
497 warp_delta
= MIN(warp_delta
, delta
);
499 atomic_set__nocheck(&timers_state
.qemu_icount_bias
,
500 timers_state
.qemu_icount_bias
+ warp_delta
);
502 timers_state
.vm_clock_warp_start
= -1;
503 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
505 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL
)) {
506 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
510 static void icount_timer_cb(void *opaque
)
512 /* No need for a checkpoint because the timer already synchronizes
513 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
518 void qtest_clock_warp(int64_t dest
)
520 int64_t clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
521 AioContext
*aio_context
;
522 assert(qtest_enabled());
523 aio_context
= qemu_get_aio_context();
524 while (clock
< dest
) {
525 int64_t deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
526 int64_t warp
= qemu_soonest_timeout(dest
- clock
, deadline
);
528 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
529 atomic_set__nocheck(&timers_state
.qemu_icount_bias
,
530 timers_state
.qemu_icount_bias
+ warp
);
531 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
533 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
534 timerlist_run_timers(aio_context
->tlg
.tl
[QEMU_CLOCK_VIRTUAL
]);
535 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
);
537 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
540 void qemu_start_warp_timer(void)
549 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
550 * do not fire, so computing the deadline does not make sense.
552 if (!runstate_is_running()) {
556 /* warp clock deterministically in record/replay mode */
557 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START
)) {
561 if (!all_cpu_threads_idle()) {
565 if (qtest_enabled()) {
566 /* When testing, qtest commands advance icount. */
570 /* We want to use the earliest deadline from ALL vm_clocks */
571 clock
= qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
);
572 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
574 static bool notified
;
575 if (!icount_sleep
&& !notified
) {
576 warn_report("icount sleep disabled and no active timers");
584 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
585 * sleep. Otherwise, the CPU might be waiting for a future timer
586 * interrupt to wake it up, but the interrupt never comes because
587 * the vCPU isn't running any insns and thus doesn't advance the
588 * QEMU_CLOCK_VIRTUAL.
592 * We never let VCPUs sleep in no sleep icount mode.
593 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
594 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
595 * It is useful when we want a deterministic execution time,
596 * isolated from host latencies.
598 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
599 atomic_set__nocheck(&timers_state
.qemu_icount_bias
,
600 timers_state
.qemu_icount_bias
+ deadline
);
601 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
602 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
605 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
606 * "real" time, (related to the time left until the next event) has
607 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
608 * This avoids that the warps are visible externally; for example,
609 * you will not be sending network packets continuously instead of
612 seqlock_write_begin(&timers_state
.vm_clock_seqlock
);
613 if (timers_state
.vm_clock_warp_start
== -1
614 || timers_state
.vm_clock_warp_start
> clock
) {
615 timers_state
.vm_clock_warp_start
= clock
;
617 seqlock_write_end(&timers_state
.vm_clock_seqlock
);
618 timer_mod_anticipate(timers_state
.icount_warp_timer
,
621 } else if (deadline
== 0) {
622 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
626 static void qemu_account_warp_timer(void)
628 if (!use_icount
|| !icount_sleep
) {
632 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
633 * do not fire, so computing the deadline does not make sense.
635 if (!runstate_is_running()) {
639 /* warp clock deterministically in record/replay mode */
640 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT
)) {
644 timer_del(timers_state
.icount_warp_timer
);
648 static bool icount_state_needed(void *opaque
)
653 static bool warp_timer_state_needed(void *opaque
)
655 TimersState
*s
= opaque
;
656 return s
->icount_warp_timer
!= NULL
;
659 static bool adjust_timers_state_needed(void *opaque
)
661 TimersState
*s
= opaque
;
662 return s
->icount_rt_timer
!= NULL
;
666 * Subsection for warp timer migration is optional, because may not be created
668 static const VMStateDescription icount_vmstate_warp_timer
= {
669 .name
= "timer/icount/warp_timer",
671 .minimum_version_id
= 1,
672 .needed
= warp_timer_state_needed
,
673 .fields
= (VMStateField
[]) {
674 VMSTATE_INT64(vm_clock_warp_start
, TimersState
),
675 VMSTATE_TIMER_PTR(icount_warp_timer
, TimersState
),
676 VMSTATE_END_OF_LIST()
680 static const VMStateDescription icount_vmstate_adjust_timers
= {
681 .name
= "timer/icount/timers",
683 .minimum_version_id
= 1,
684 .needed
= adjust_timers_state_needed
,
685 .fields
= (VMStateField
[]) {
686 VMSTATE_TIMER_PTR(icount_rt_timer
, TimersState
),
687 VMSTATE_TIMER_PTR(icount_vm_timer
, TimersState
),
688 VMSTATE_END_OF_LIST()
693 * This is a subsection for icount migration.
695 static const VMStateDescription icount_vmstate_timers
= {
696 .name
= "timer/icount",
698 .minimum_version_id
= 1,
699 .needed
= icount_state_needed
,
700 .fields
= (VMStateField
[]) {
701 VMSTATE_INT64(qemu_icount_bias
, TimersState
),
702 VMSTATE_INT64(qemu_icount
, TimersState
),
703 VMSTATE_END_OF_LIST()
705 .subsections
= (const VMStateDescription
*[]) {
706 &icount_vmstate_warp_timer
,
707 &icount_vmstate_adjust_timers
,
712 static const VMStateDescription vmstate_timers
= {
715 .minimum_version_id
= 1,
716 .fields
= (VMStateField
[]) {
717 VMSTATE_INT64(cpu_ticks_offset
, TimersState
),
719 VMSTATE_INT64_V(cpu_clock_offset
, TimersState
, 2),
720 VMSTATE_END_OF_LIST()
722 .subsections
= (const VMStateDescription
*[]) {
723 &icount_vmstate_timers
,
728 static void cpu_throttle_thread(CPUState
*cpu
, run_on_cpu_data opaque
)
731 double throttle_ratio
;
734 if (!cpu_throttle_get_percentage()) {
738 pct
= (double)cpu_throttle_get_percentage()/100;
739 throttle_ratio
= pct
/ (1 - pct
);
740 sleeptime_ns
= (long)(throttle_ratio
* CPU_THROTTLE_TIMESLICE_NS
);
742 qemu_mutex_unlock_iothread();
743 g_usleep(sleeptime_ns
/ 1000); /* Convert ns to us for usleep call */
744 qemu_mutex_lock_iothread();
745 atomic_set(&cpu
->throttle_thread_scheduled
, 0);
748 static void cpu_throttle_timer_tick(void *opaque
)
753 /* Stop the timer if needed */
754 if (!cpu_throttle_get_percentage()) {
758 if (!atomic_xchg(&cpu
->throttle_thread_scheduled
, 1)) {
759 async_run_on_cpu(cpu
, cpu_throttle_thread
,
764 pct
= (double)cpu_throttle_get_percentage()/100;
765 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
766 CPU_THROTTLE_TIMESLICE_NS
/ (1-pct
));
769 void cpu_throttle_set(int new_throttle_pct
)
771 /* Ensure throttle percentage is within valid range */
772 new_throttle_pct
= MIN(new_throttle_pct
, CPU_THROTTLE_PCT_MAX
);
773 new_throttle_pct
= MAX(new_throttle_pct
, CPU_THROTTLE_PCT_MIN
);
775 atomic_set(&throttle_percentage
, new_throttle_pct
);
777 timer_mod(throttle_timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT
) +
778 CPU_THROTTLE_TIMESLICE_NS
);
781 void cpu_throttle_stop(void)
783 atomic_set(&throttle_percentage
, 0);
786 bool cpu_throttle_active(void)
788 return (cpu_throttle_get_percentage() != 0);
791 int cpu_throttle_get_percentage(void)
793 return atomic_read(&throttle_percentage
);
796 void cpu_ticks_init(void)
798 seqlock_init(&timers_state
.vm_clock_seqlock
);
799 vmstate_register(NULL
, 0, &vmstate_timers
, &timers_state
);
800 throttle_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
801 cpu_throttle_timer_tick
, NULL
);
804 void configure_icount(QemuOpts
*opts
, Error
**errp
)
807 char *rem_str
= NULL
;
809 option
= qemu_opt_get(opts
, "shift");
811 if (qemu_opt_get(opts
, "align") != NULL
) {
812 error_setg(errp
, "Please specify shift option when using align");
817 icount_sleep
= qemu_opt_get_bool(opts
, "sleep", true);
819 timers_state
.icount_warp_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL_RT
,
820 icount_timer_cb
, NULL
);
823 icount_align_option
= qemu_opt_get_bool(opts
, "align", false);
825 if (icount_align_option
&& !icount_sleep
) {
826 error_setg(errp
, "align=on and sleep=off are incompatible");
828 if (strcmp(option
, "auto") != 0) {
830 timers_state
.icount_time_shift
= strtol(option
, &rem_str
, 0);
831 if (errno
!= 0 || *rem_str
!= '\0' || !strlen(option
)) {
832 error_setg(errp
, "icount: Invalid shift value");
836 } else if (icount_align_option
) {
837 error_setg(errp
, "shift=auto and align=on are incompatible");
838 } else if (!icount_sleep
) {
839 error_setg(errp
, "shift=auto and sleep=off are incompatible");
844 /* 125MIPS seems a reasonable initial guess at the guest speed.
845 It will be corrected fairly quickly anyway. */
846 timers_state
.icount_time_shift
= 3;
848 /* Have both realtime and virtual time triggers for speed adjustment.
849 The realtime trigger catches emulated time passing too slowly,
850 the virtual time trigger catches emulated time passing too fast.
851 Realtime triggers occur even when idle, so use them less frequently
853 timers_state
.vm_clock_warp_start
= -1;
854 timers_state
.icount_rt_timer
= timer_new_ms(QEMU_CLOCK_VIRTUAL_RT
,
855 icount_adjust_rt
, NULL
);
856 timer_mod(timers_state
.icount_rt_timer
,
857 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT
) + 1000);
858 timers_state
.icount_vm_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
859 icount_adjust_vm
, NULL
);
860 timer_mod(timers_state
.icount_vm_timer
,
861 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) +
862 NANOSECONDS_PER_SECOND
/ 10);
865 /***********************************************************/
866 /* TCG vCPU kick timer
868 * The kick timer is responsible for moving single threaded vCPU
869 * emulation on to the next vCPU. If more than one vCPU is running a
870 * timer event with force a cpu->exit so the next vCPU can get
873 * The timer is removed if all vCPUs are idle and restarted again once
874 * idleness is complete.
877 static QEMUTimer
*tcg_kick_vcpu_timer
;
878 static CPUState
*tcg_current_rr_cpu
;
880 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
882 static inline int64_t qemu_tcg_next_kick(void)
884 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) + TCG_KICK_PERIOD
;
887 /* Kick the currently round-robin scheduled vCPU */
888 static void qemu_cpu_kick_rr_cpu(void)
892 cpu
= atomic_mb_read(&tcg_current_rr_cpu
);
896 } while (cpu
!= atomic_mb_read(&tcg_current_rr_cpu
));
899 static void do_nothing(CPUState
*cpu
, run_on_cpu_data unused
)
903 void qemu_timer_notify_cb(void *opaque
, QEMUClockType type
)
905 if (!use_icount
|| type
!= QEMU_CLOCK_VIRTUAL
) {
910 if (qemu_in_vcpu_thread()) {
911 /* A CPU is currently running; kick it back out to the
912 * tcg_cpu_exec() loop so it will recalculate its
913 * icount deadline immediately.
915 qemu_cpu_kick(current_cpu
);
916 } else if (first_cpu
) {
917 /* qemu_cpu_kick is not enough to kick a halted CPU out of
918 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
919 * causes cpu_thread_is_idle to return false. This way,
920 * handle_icount_deadline can run.
921 * If we have no CPUs at all for some reason, we don't
922 * need to do anything.
924 async_run_on_cpu(first_cpu
, do_nothing
, RUN_ON_CPU_NULL
);
928 static void kick_tcg_thread(void *opaque
)
930 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
931 qemu_cpu_kick_rr_cpu();
934 static void start_tcg_kick_timer(void)
936 assert(!mttcg_enabled
);
937 if (!tcg_kick_vcpu_timer
&& CPU_NEXT(first_cpu
)) {
938 tcg_kick_vcpu_timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
,
939 kick_tcg_thread
, NULL
);
940 timer_mod(tcg_kick_vcpu_timer
, qemu_tcg_next_kick());
944 static void stop_tcg_kick_timer(void)
946 assert(!mttcg_enabled
);
947 if (tcg_kick_vcpu_timer
) {
948 timer_del(tcg_kick_vcpu_timer
);
949 tcg_kick_vcpu_timer
= NULL
;
953 /***********************************************************/
954 void hw_error(const char *fmt
, ...)
960 fprintf(stderr
, "qemu: hardware error: ");
961 vfprintf(stderr
, fmt
, ap
);
962 fprintf(stderr
, "\n");
964 fprintf(stderr
, "CPU #%d:\n", cpu
->cpu_index
);
965 cpu_dump_state(cpu
, stderr
, fprintf
, CPU_DUMP_FPU
);
971 void cpu_synchronize_all_states(void)
976 cpu_synchronize_state(cpu
);
977 /* TODO: move to cpu_synchronize_state() */
979 hvf_cpu_synchronize_state(cpu
);
984 void cpu_synchronize_all_post_reset(void)
989 cpu_synchronize_post_reset(cpu
);
990 /* TODO: move to cpu_synchronize_post_reset() */
992 hvf_cpu_synchronize_post_reset(cpu
);
997 void cpu_synchronize_all_post_init(void)
1002 cpu_synchronize_post_init(cpu
);
1003 /* TODO: move to cpu_synchronize_post_init() */
1004 if (hvf_enabled()) {
1005 hvf_cpu_synchronize_post_init(cpu
);
1010 void cpu_synchronize_all_pre_loadvm(void)
1015 cpu_synchronize_pre_loadvm(cpu
);
1019 static int do_vm_stop(RunState state
, bool send_stop
)
1023 if (runstate_is_running()) {
1024 cpu_disable_ticks();
1026 runstate_set(state
);
1027 vm_state_notify(0, state
);
1029 qapi_event_send_stop(&error_abort
);
1034 replay_disable_events();
1035 ret
= bdrv_flush_all();
1040 /* Special vm_stop() variant for terminating the process. Historically clients
1041 * did not expect a QMP STOP event and so we need to retain compatibility.
1043 int vm_shutdown(void)
1045 return do_vm_stop(RUN_STATE_SHUTDOWN
, false);
1048 static bool cpu_can_run(CPUState
*cpu
)
1053 if (cpu_is_stopped(cpu
)) {
1059 static void cpu_handle_guest_debug(CPUState
*cpu
)
1061 gdb_set_stop_cpu(cpu
);
1062 qemu_system_debug_request();
1063 cpu
->stopped
= true;
1067 static void sigbus_reraise(void)
1070 struct sigaction action
;
1072 memset(&action
, 0, sizeof(action
));
1073 action
.sa_handler
= SIG_DFL
;
1074 if (!sigaction(SIGBUS
, &action
, NULL
)) {
1077 sigaddset(&set
, SIGBUS
);
1078 pthread_sigmask(SIG_UNBLOCK
, &set
, NULL
);
1080 perror("Failed to re-raise SIGBUS!\n");
1084 static void sigbus_handler(int n
, siginfo_t
*siginfo
, void *ctx
)
1086 if (siginfo
->si_code
!= BUS_MCEERR_AO
&& siginfo
->si_code
!= BUS_MCEERR_AR
) {
1091 /* Called asynchronously in VCPU thread. */
1092 if (kvm_on_sigbus_vcpu(current_cpu
, siginfo
->si_code
, siginfo
->si_addr
)) {
1096 /* Called synchronously (via signalfd) in main thread. */
1097 if (kvm_on_sigbus(siginfo
->si_code
, siginfo
->si_addr
)) {
1103 static void qemu_init_sigbus(void)
1105 struct sigaction action
;
1107 memset(&action
, 0, sizeof(action
));
1108 action
.sa_flags
= SA_SIGINFO
;
1109 action
.sa_sigaction
= sigbus_handler
;
1110 sigaction(SIGBUS
, &action
, NULL
);
1112 prctl(PR_MCE_KILL
, PR_MCE_KILL_SET
, PR_MCE_KILL_EARLY
, 0, 0);
1114 #else /* !CONFIG_LINUX */
1115 static void qemu_init_sigbus(void)
1118 #endif /* !CONFIG_LINUX */
1120 static QemuMutex qemu_global_mutex
;
1122 static QemuThread io_thread
;
1125 static QemuCond qemu_cpu_cond
;
1127 static QemuCond qemu_pause_cond
;
1129 void qemu_init_cpu_loop(void)
1132 qemu_cond_init(&qemu_cpu_cond
);
1133 qemu_cond_init(&qemu_pause_cond
);
1134 qemu_mutex_init(&qemu_global_mutex
);
1136 qemu_thread_get_self(&io_thread
);
1139 void run_on_cpu(CPUState
*cpu
, run_on_cpu_func func
, run_on_cpu_data data
)
1141 do_run_on_cpu(cpu
, func
, data
, &qemu_global_mutex
);
1144 static void qemu_kvm_destroy_vcpu(CPUState
*cpu
)
1146 if (kvm_destroy_vcpu(cpu
) < 0) {
1147 error_report("kvm_destroy_vcpu failed");
1152 static void qemu_tcg_destroy_vcpu(CPUState
*cpu
)
1156 static void qemu_cpu_stop(CPUState
*cpu
, bool exit
)
1158 g_assert(qemu_cpu_is_self(cpu
));
1160 cpu
->stopped
= true;
1164 qemu_cond_broadcast(&qemu_pause_cond
);
1167 static void qemu_wait_io_event_common(CPUState
*cpu
)
1169 atomic_mb_set(&cpu
->thread_kicked
, false);
1171 qemu_cpu_stop(cpu
, false);
1173 process_queued_cpu_work(cpu
);
1176 static void qemu_tcg_rr_wait_io_event(CPUState
*cpu
)
1178 while (all_cpu_threads_idle()) {
1179 stop_tcg_kick_timer();
1180 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1183 start_tcg_kick_timer();
1185 qemu_wait_io_event_common(cpu
);
1188 static void qemu_wait_io_event(CPUState
*cpu
)
1190 while (cpu_thread_is_idle(cpu
)) {
1191 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1195 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1196 if (!tcg_enabled()) {
1200 qemu_wait_io_event_common(cpu
);
1203 static void *qemu_kvm_cpu_thread_fn(void *arg
)
1205 CPUState
*cpu
= arg
;
1208 rcu_register_thread();
1210 qemu_mutex_lock_iothread();
1211 qemu_thread_get_self(cpu
->thread
);
1212 cpu
->thread_id
= qemu_get_thread_id();
1216 r
= kvm_init_vcpu(cpu
);
1218 error_report("kvm_init_vcpu failed: %s", strerror(-r
));
1222 kvm_init_cpu_signals(cpu
);
1224 /* signal CPU creation */
1225 cpu
->created
= true;
1226 qemu_cond_signal(&qemu_cpu_cond
);
1229 if (cpu_can_run(cpu
)) {
1230 r
= kvm_cpu_exec(cpu
);
1231 if (r
== EXCP_DEBUG
) {
1232 cpu_handle_guest_debug(cpu
);
1235 qemu_wait_io_event(cpu
);
1236 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1238 qemu_kvm_destroy_vcpu(cpu
);
1239 cpu
->created
= false;
1240 qemu_cond_signal(&qemu_cpu_cond
);
1241 qemu_mutex_unlock_iothread();
1242 rcu_unregister_thread();
1246 static void *qemu_dummy_cpu_thread_fn(void *arg
)
1249 error_report("qtest is not supported under Windows");
1252 CPUState
*cpu
= arg
;
1256 rcu_register_thread();
1258 qemu_mutex_lock_iothread();
1259 qemu_thread_get_self(cpu
->thread
);
1260 cpu
->thread_id
= qemu_get_thread_id();
1264 sigemptyset(&waitset
);
1265 sigaddset(&waitset
, SIG_IPI
);
1267 /* signal CPU creation */
1268 cpu
->created
= true;
1269 qemu_cond_signal(&qemu_cpu_cond
);
1272 qemu_mutex_unlock_iothread();
1275 r
= sigwait(&waitset
, &sig
);
1276 } while (r
== -1 && (errno
== EAGAIN
|| errno
== EINTR
));
1281 qemu_mutex_lock_iothread();
1282 qemu_wait_io_event(cpu
);
1283 } while (!cpu
->unplug
);
1285 rcu_unregister_thread();
1290 static int64_t tcg_get_icount_limit(void)
1294 if (replay_mode
!= REPLAY_MODE_PLAY
) {
1295 deadline
= qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1297 /* Maintain prior (possibly buggy) behaviour where if no deadline
1298 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1299 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1302 if ((deadline
< 0) || (deadline
> INT32_MAX
)) {
1303 deadline
= INT32_MAX
;
1306 return qemu_icount_round(deadline
);
1308 return replay_get_instructions();
1312 static void handle_icount_deadline(void)
1314 assert(qemu_in_vcpu_thread());
1317 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL
);
1319 if (deadline
== 0) {
1320 /* Wake up other AioContexts. */
1321 qemu_clock_notify(QEMU_CLOCK_VIRTUAL
);
1322 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL
);
1327 static void prepare_icount_for_run(CPUState
*cpu
)
1332 /* These should always be cleared by process_icount_data after
1333 * each vCPU execution. However u16.high can be raised
1334 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1336 g_assert(cpu
->icount_decr
.u16
.low
== 0);
1337 g_assert(cpu
->icount_extra
== 0);
1339 cpu
->icount_budget
= tcg_get_icount_limit();
1340 insns_left
= MIN(0xffff, cpu
->icount_budget
);
1341 cpu
->icount_decr
.u16
.low
= insns_left
;
1342 cpu
->icount_extra
= cpu
->icount_budget
- insns_left
;
1344 replay_mutex_lock();
1348 static void process_icount_data(CPUState
*cpu
)
1351 /* Account for executed instructions */
1352 cpu_update_icount(cpu
);
1354 /* Reset the counters */
1355 cpu
->icount_decr
.u16
.low
= 0;
1356 cpu
->icount_extra
= 0;
1357 cpu
->icount_budget
= 0;
1359 replay_account_executed_instructions();
1361 replay_mutex_unlock();
1366 static int tcg_cpu_exec(CPUState
*cpu
)
1369 #ifdef CONFIG_PROFILER
1373 assert(tcg_enabled());
1374 #ifdef CONFIG_PROFILER
1375 ti
= profile_getclock();
1377 cpu_exec_start(cpu
);
1378 ret
= cpu_exec(cpu
);
1380 #ifdef CONFIG_PROFILER
1381 tcg_time
+= profile_getclock() - ti
;
1386 /* Destroy any remaining vCPUs which have been unplugged and have
1389 static void deal_with_unplugged_cpus(void)
1394 if (cpu
->unplug
&& !cpu_can_run(cpu
)) {
1395 qemu_tcg_destroy_vcpu(cpu
);
1396 cpu
->created
= false;
1397 qemu_cond_signal(&qemu_cpu_cond
);
1403 /* Single-threaded TCG
1405 * In the single-threaded case each vCPU is simulated in turn. If
1406 * there is more than a single vCPU we create a simple timer to kick
1407 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1408 * This is done explicitly rather than relying on side-effects
1412 static void *qemu_tcg_rr_cpu_thread_fn(void *arg
)
1414 CPUState
*cpu
= arg
;
1416 assert(tcg_enabled());
1417 rcu_register_thread();
1418 tcg_register_thread();
1420 qemu_mutex_lock_iothread();
1421 qemu_thread_get_self(cpu
->thread
);
1423 cpu
->thread_id
= qemu_get_thread_id();
1424 cpu
->created
= true;
1426 qemu_cond_signal(&qemu_cpu_cond
);
1428 /* wait for initial kick-off after machine start */
1429 while (first_cpu
->stopped
) {
1430 qemu_cond_wait(first_cpu
->halt_cond
, &qemu_global_mutex
);
1432 /* process any pending work */
1435 qemu_wait_io_event_common(cpu
);
1439 start_tcg_kick_timer();
1443 /* process any pending work */
1444 cpu
->exit_request
= 1;
1447 qemu_mutex_unlock_iothread();
1448 replay_mutex_lock();
1449 qemu_mutex_lock_iothread();
1450 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1451 qemu_account_warp_timer();
1453 /* Run the timers here. This is much more efficient than
1454 * waking up the I/O thread and waiting for completion.
1456 handle_icount_deadline();
1458 replay_mutex_unlock();
1464 while (cpu
&& !cpu
->queued_work_first
&& !cpu
->exit_request
) {
1466 atomic_mb_set(&tcg_current_rr_cpu
, cpu
);
1469 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
,
1470 (cpu
->singlestep_enabled
& SSTEP_NOTIMER
) == 0);
1472 if (cpu_can_run(cpu
)) {
1475 qemu_mutex_unlock_iothread();
1476 prepare_icount_for_run(cpu
);
1478 r
= tcg_cpu_exec(cpu
);
1480 process_icount_data(cpu
);
1481 qemu_mutex_lock_iothread();
1483 if (r
== EXCP_DEBUG
) {
1484 cpu_handle_guest_debug(cpu
);
1486 } else if (r
== EXCP_ATOMIC
) {
1487 qemu_mutex_unlock_iothread();
1488 cpu_exec_step_atomic(cpu
);
1489 qemu_mutex_lock_iothread();
1492 } else if (cpu
->stop
) {
1494 cpu
= CPU_NEXT(cpu
);
1499 cpu
= CPU_NEXT(cpu
);
1500 } /* while (cpu && !cpu->exit_request).. */
1502 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1503 atomic_set(&tcg_current_rr_cpu
, NULL
);
1505 if (cpu
&& cpu
->exit_request
) {
1506 atomic_mb_set(&cpu
->exit_request
, 0);
1509 qemu_tcg_rr_wait_io_event(cpu
? cpu
: first_cpu
);
1510 deal_with_unplugged_cpus();
1513 rcu_unregister_thread();
1517 static void *qemu_hax_cpu_thread_fn(void *arg
)
1519 CPUState
*cpu
= arg
;
1522 rcu_register_thread();
1523 qemu_mutex_lock_iothread();
1524 qemu_thread_get_self(cpu
->thread
);
1526 cpu
->thread_id
= qemu_get_thread_id();
1527 cpu
->created
= true;
1532 qemu_cond_signal(&qemu_cpu_cond
);
1535 if (cpu_can_run(cpu
)) {
1536 r
= hax_smp_cpu_exec(cpu
);
1537 if (r
== EXCP_DEBUG
) {
1538 cpu_handle_guest_debug(cpu
);
1542 qemu_wait_io_event(cpu
);
1543 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1544 rcu_unregister_thread();
1548 /* The HVF-specific vCPU thread function. This one should only run when the host
1549 * CPU supports the VMX "unrestricted guest" feature. */
1550 static void *qemu_hvf_cpu_thread_fn(void *arg
)
1552 CPUState
*cpu
= arg
;
1556 assert(hvf_enabled());
1558 rcu_register_thread();
1560 qemu_mutex_lock_iothread();
1561 qemu_thread_get_self(cpu
->thread
);
1563 cpu
->thread_id
= qemu_get_thread_id();
1569 /* signal CPU creation */
1570 cpu
->created
= true;
1571 qemu_cond_signal(&qemu_cpu_cond
);
1574 if (cpu_can_run(cpu
)) {
1575 r
= hvf_vcpu_exec(cpu
);
1576 if (r
== EXCP_DEBUG
) {
1577 cpu_handle_guest_debug(cpu
);
1580 qemu_wait_io_event(cpu
);
1581 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1583 hvf_vcpu_destroy(cpu
);
1584 cpu
->created
= false;
1585 qemu_cond_signal(&qemu_cpu_cond
);
1586 qemu_mutex_unlock_iothread();
1587 rcu_unregister_thread();
1591 static void *qemu_whpx_cpu_thread_fn(void *arg
)
1593 CPUState
*cpu
= arg
;
1596 rcu_register_thread();
1598 qemu_mutex_lock_iothread();
1599 qemu_thread_get_self(cpu
->thread
);
1600 cpu
->thread_id
= qemu_get_thread_id();
1603 r
= whpx_init_vcpu(cpu
);
1605 fprintf(stderr
, "whpx_init_vcpu failed: %s\n", strerror(-r
));
1609 /* signal CPU creation */
1610 cpu
->created
= true;
1611 qemu_cond_signal(&qemu_cpu_cond
);
1614 if (cpu_can_run(cpu
)) {
1615 r
= whpx_vcpu_exec(cpu
);
1616 if (r
== EXCP_DEBUG
) {
1617 cpu_handle_guest_debug(cpu
);
1620 while (cpu_thread_is_idle(cpu
)) {
1621 qemu_cond_wait(cpu
->halt_cond
, &qemu_global_mutex
);
1623 qemu_wait_io_event_common(cpu
);
1624 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1626 whpx_destroy_vcpu(cpu
);
1627 cpu
->created
= false;
1628 qemu_cond_signal(&qemu_cpu_cond
);
1629 qemu_mutex_unlock_iothread();
1630 rcu_unregister_thread();
1635 static void CALLBACK
dummy_apc_func(ULONG_PTR unused
)
1640 /* Multi-threaded TCG
1642 * In the multi-threaded case each vCPU has its own thread. The TLS
1643 * variable current_cpu can be used deep in the code to find the
1644 * current CPUState for a given thread.
1647 static void *qemu_tcg_cpu_thread_fn(void *arg
)
1649 CPUState
*cpu
= arg
;
1651 assert(tcg_enabled());
1652 g_assert(!use_icount
);
1654 rcu_register_thread();
1655 tcg_register_thread();
1657 qemu_mutex_lock_iothread();
1658 qemu_thread_get_self(cpu
->thread
);
1660 cpu
->thread_id
= qemu_get_thread_id();
1661 cpu
->created
= true;
1664 qemu_cond_signal(&qemu_cpu_cond
);
1666 /* process any pending work */
1667 cpu
->exit_request
= 1;
1670 if (cpu_can_run(cpu
)) {
1672 qemu_mutex_unlock_iothread();
1673 r
= tcg_cpu_exec(cpu
);
1674 qemu_mutex_lock_iothread();
1677 cpu_handle_guest_debug(cpu
);
1680 /* during start-up the vCPU is reset and the thread is
1681 * kicked several times. If we don't ensure we go back
1682 * to sleep in the halted state we won't cleanly
1683 * start-up when the vCPU is enabled.
1685 * cpu->halted should ensure we sleep in wait_io_event
1687 g_assert(cpu
->halted
);
1690 qemu_mutex_unlock_iothread();
1691 cpu_exec_step_atomic(cpu
);
1692 qemu_mutex_lock_iothread();
1694 /* Ignore everything else? */
1699 atomic_mb_set(&cpu
->exit_request
, 0);
1700 qemu_wait_io_event(cpu
);
1701 } while (!cpu
->unplug
|| cpu_can_run(cpu
));
1703 qemu_tcg_destroy_vcpu(cpu
);
1704 cpu
->created
= false;
1705 qemu_cond_signal(&qemu_cpu_cond
);
1706 qemu_mutex_unlock_iothread();
1707 rcu_unregister_thread();
1711 static void qemu_cpu_kick_thread(CPUState
*cpu
)
1716 if (cpu
->thread_kicked
) {
1719 cpu
->thread_kicked
= true;
1720 err
= pthread_kill(cpu
->thread
->thread
, SIG_IPI
);
1722 fprintf(stderr
, "qemu:%s: %s", __func__
, strerror(err
));
1726 if (!qemu_cpu_is_self(cpu
)) {
1727 if (whpx_enabled()) {
1728 whpx_vcpu_kick(cpu
);
1729 } else if (!QueueUserAPC(dummy_apc_func
, cpu
->hThread
, 0)) {
1730 fprintf(stderr
, "%s: QueueUserAPC failed with error %lu\n",
1731 __func__
, GetLastError());
1738 void qemu_cpu_kick(CPUState
*cpu
)
1740 qemu_cond_broadcast(cpu
->halt_cond
);
1741 if (tcg_enabled()) {
1743 /* NOP unless doing single-thread RR */
1744 qemu_cpu_kick_rr_cpu();
1746 if (hax_enabled()) {
1748 * FIXME: race condition with the exit_request check in
1751 cpu
->exit_request
= 1;
1753 qemu_cpu_kick_thread(cpu
);
1757 void qemu_cpu_kick_self(void)
1759 assert(current_cpu
);
1760 qemu_cpu_kick_thread(current_cpu
);
1763 bool qemu_cpu_is_self(CPUState
*cpu
)
1765 return qemu_thread_is_self(cpu
->thread
);
1768 bool qemu_in_vcpu_thread(void)
1770 return current_cpu
&& qemu_cpu_is_self(current_cpu
);
/* Per-thread flag: true while this thread holds the BQL (qemu_global_mutex). */
static __thread bool iothread_locked = false;
1775 bool qemu_mutex_iothread_locked(void)
1777 return iothread_locked
;
1781 * The BQL is taken from so many places that it is worth profiling the
1782 * callers directly, instead of funneling them all through a single function.
1784 void qemu_mutex_lock_iothread_impl(const char *file
, int line
)
1786 QemuMutexLockFunc bql_lock
= atomic_read(&qemu_bql_mutex_lock_func
);
1788 g_assert(!qemu_mutex_iothread_locked());
1789 bql_lock(&qemu_global_mutex
, file
, line
);
1790 iothread_locked
= true;
1793 void qemu_mutex_unlock_iothread(void)
1795 g_assert(qemu_mutex_iothread_locked());
1796 iothread_locked
= false;
1797 qemu_mutex_unlock(&qemu_global_mutex
);
1800 static bool all_vcpus_paused(void)
1805 if (!cpu
->stopped
) {
1813 void pause_all_vcpus(void)
1817 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, false);
1819 if (qemu_cpu_is_self(cpu
)) {
1820 qemu_cpu_stop(cpu
, true);
1827 /* We need to drop the replay_lock so any vCPU threads woken up
1828 * can finish their replay tasks
1830 replay_mutex_unlock();
1832 while (!all_vcpus_paused()) {
1833 qemu_cond_wait(&qemu_pause_cond
, &qemu_global_mutex
);
1839 qemu_mutex_unlock_iothread();
1840 replay_mutex_lock();
1841 qemu_mutex_lock_iothread();
1844 void cpu_resume(CPUState
*cpu
)
1847 cpu
->stopped
= false;
1851 void resume_all_vcpus(void)
1855 qemu_clock_enable(QEMU_CLOCK_VIRTUAL
, true);
1861 void cpu_remove_sync(CPUState
*cpu
)
1866 qemu_mutex_unlock_iothread();
1867 qemu_thread_join(cpu
->thread
);
1868 qemu_mutex_lock_iothread();
1871 /* For temporary buffers for forming a name */
1872 #define VCPU_THREAD_NAME_SIZE 16
1874 static void qemu_tcg_init_vcpu(CPUState
*cpu
)
1876 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1877 static QemuCond
*single_tcg_halt_cond
;
1878 static QemuThread
*single_tcg_cpu_thread
;
1879 static int tcg_region_inited
;
1881 assert(tcg_enabled());
1883 * Initialize TCG regions--once. Now is a good time, because:
1884 * (1) TCG's init context, prologue and target globals have been set up.
1885 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1886 * -accel flag is processed, so the check doesn't work then).
1888 if (!tcg_region_inited
) {
1889 tcg_region_inited
= 1;
1893 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread
) {
1894 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1895 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1896 qemu_cond_init(cpu
->halt_cond
);
1898 if (qemu_tcg_mttcg_enabled()) {
1899 /* create a thread per vCPU with TCG (MTTCG) */
1900 parallel_cpus
= true;
1901 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/TCG",
1904 qemu_thread_create(cpu
->thread
, thread_name
, qemu_tcg_cpu_thread_fn
,
1905 cpu
, QEMU_THREAD_JOINABLE
);
1908 /* share a single thread for all cpus with TCG */
1909 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "ALL CPUs/TCG");
1910 qemu_thread_create(cpu
->thread
, thread_name
,
1911 qemu_tcg_rr_cpu_thread_fn
,
1912 cpu
, QEMU_THREAD_JOINABLE
);
1914 single_tcg_halt_cond
= cpu
->halt_cond
;
1915 single_tcg_cpu_thread
= cpu
->thread
;
1918 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1921 /* For non-MTTCG cases we share the thread */
1922 cpu
->thread
= single_tcg_cpu_thread
;
1923 cpu
->halt_cond
= single_tcg_halt_cond
;
1924 cpu
->thread_id
= first_cpu
->thread_id
;
1926 cpu
->created
= true;
1930 static void qemu_hax_start_vcpu(CPUState
*cpu
)
1932 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1934 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1935 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1936 qemu_cond_init(cpu
->halt_cond
);
1938 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HAX",
1940 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hax_cpu_thread_fn
,
1941 cpu
, QEMU_THREAD_JOINABLE
);
1943 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1947 static void qemu_kvm_start_vcpu(CPUState
*cpu
)
1949 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1951 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1952 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1953 qemu_cond_init(cpu
->halt_cond
);
1954 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/KVM",
1956 qemu_thread_create(cpu
->thread
, thread_name
, qemu_kvm_cpu_thread_fn
,
1957 cpu
, QEMU_THREAD_JOINABLE
);
1960 static void qemu_hvf_start_vcpu(CPUState
*cpu
)
1962 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1964 /* HVF currently does not support TCG, and only runs in
1965 * unrestricted-guest mode. */
1966 assert(hvf_enabled());
1968 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1969 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1970 qemu_cond_init(cpu
->halt_cond
);
1972 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/HVF",
1974 qemu_thread_create(cpu
->thread
, thread_name
, qemu_hvf_cpu_thread_fn
,
1975 cpu
, QEMU_THREAD_JOINABLE
);
1978 static void qemu_whpx_start_vcpu(CPUState
*cpu
)
1980 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1982 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1983 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
1984 qemu_cond_init(cpu
->halt_cond
);
1985 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/WHPX",
1987 qemu_thread_create(cpu
->thread
, thread_name
, qemu_whpx_cpu_thread_fn
,
1988 cpu
, QEMU_THREAD_JOINABLE
);
1990 cpu
->hThread
= qemu_thread_get_handle(cpu
->thread
);
1994 static void qemu_dummy_start_vcpu(CPUState
*cpu
)
1996 char thread_name
[VCPU_THREAD_NAME_SIZE
];
1998 cpu
->thread
= g_malloc0(sizeof(QemuThread
));
1999 cpu
->halt_cond
= g_malloc0(sizeof(QemuCond
));
2000 qemu_cond_init(cpu
->halt_cond
);
2001 snprintf(thread_name
, VCPU_THREAD_NAME_SIZE
, "CPU %d/DUMMY",
2003 qemu_thread_create(cpu
->thread
, thread_name
, qemu_dummy_cpu_thread_fn
, cpu
,
2004 QEMU_THREAD_JOINABLE
);
2007 void qemu_init_vcpu(CPUState
*cpu
)
2009 cpu
->nr_cores
= smp_cores
;
2010 cpu
->nr_threads
= smp_threads
;
2011 cpu
->stopped
= true;
2014 /* If the target cpu hasn't set up any address spaces itself,
2015 * give it the default one.
2018 cpu_address_space_init(cpu
, 0, "cpu-memory", cpu
->memory
);
2021 if (kvm_enabled()) {
2022 qemu_kvm_start_vcpu(cpu
);
2023 } else if (hax_enabled()) {
2024 qemu_hax_start_vcpu(cpu
);
2025 } else if (hvf_enabled()) {
2026 qemu_hvf_start_vcpu(cpu
);
2027 } else if (tcg_enabled()) {
2028 qemu_tcg_init_vcpu(cpu
);
2029 } else if (whpx_enabled()) {
2030 qemu_whpx_start_vcpu(cpu
);
2032 qemu_dummy_start_vcpu(cpu
);
2035 while (!cpu
->created
) {
2036 qemu_cond_wait(&qemu_cpu_cond
, &qemu_global_mutex
);
2040 void cpu_stop_current(void)
2043 qemu_cpu_stop(current_cpu
, true);
2047 int vm_stop(RunState state
)
2049 if (qemu_in_vcpu_thread()) {
2050 qemu_system_vmstop_request_prepare();
2051 qemu_system_vmstop_request(state
);
2053 * FIXME: should not return to device code in case
2054 * vm_stop() has been requested.
2060 return do_vm_stop(state
, true);
2064 * Prepare for (re)starting the VM.
2065 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2066 * running or in case of an error condition), 0 otherwise.
2068 int vm_prepare_start(void)
2072 qemu_vmstop_requested(&requested
);
2073 if (runstate_is_running() && requested
== RUN_STATE__MAX
) {
2077 /* Ensure that a STOP/RESUME pair of events is emitted if a
2078 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2079 * example, according to documentation is always followed by
2082 if (runstate_is_running()) {
2083 qapi_event_send_stop(&error_abort
);
2084 qapi_event_send_resume(&error_abort
);
2088 /* We are sending this now, but the CPUs will be resumed shortly later */
2089 qapi_event_send_resume(&error_abort
);
2091 replay_enable_events();
2093 runstate_set(RUN_STATE_RUNNING
);
2094 vm_state_notify(1, RUN_STATE_RUNNING
);
2100 if (!vm_prepare_start()) {
2105 /* does a state transition even if the VM is already stopped,
2106 current state is forgotten forever */
2107 int vm_stop_force_state(RunState state
)
2109 if (runstate_is_running()) {
2110 return vm_stop(state
);
2112 runstate_set(state
);
2115 /* Make sure to return an error if the flush in a previous vm_stop()
2117 return bdrv_flush_all();
2121 void list_cpus(FILE *f
, fprintf_function cpu_fprintf
, const char *optarg
)
2123 /* XXX: implement xxx_cpu_list for targets that still miss it */
2124 #if defined(cpu_list)
2125 cpu_list(f
, cpu_fprintf
);
2129 CpuInfoList
*qmp_query_cpus(Error
**errp
)
2131 MachineState
*ms
= MACHINE(qdev_get_machine());
2132 MachineClass
*mc
= MACHINE_GET_CLASS(ms
);
2133 CpuInfoList
*head
= NULL
, *cur_item
= NULL
;
2138 #if defined(TARGET_I386)
2139 X86CPU
*x86_cpu
= X86_CPU(cpu
);
2140 CPUX86State
*env
= &x86_cpu
->env
;
2141 #elif defined(TARGET_PPC)
2142 PowerPCCPU
*ppc_cpu
= POWERPC_CPU(cpu
);
2143 CPUPPCState
*env
= &ppc_cpu
->env
;
2144 #elif defined(TARGET_SPARC)
2145 SPARCCPU
*sparc_cpu
= SPARC_CPU(cpu
);
2146 CPUSPARCState
*env
= &sparc_cpu
->env
;
2147 #elif defined(TARGET_RISCV)
2148 RISCVCPU
*riscv_cpu
= RISCV_CPU(cpu
);
2149 CPURISCVState
*env
= &riscv_cpu
->env
;
2150 #elif defined(TARGET_MIPS)
2151 MIPSCPU
*mips_cpu
= MIPS_CPU(cpu
);
2152 CPUMIPSState
*env
= &mips_cpu
->env
;
2153 #elif defined(TARGET_TRICORE)
2154 TriCoreCPU
*tricore_cpu
= TRICORE_CPU(cpu
);
2155 CPUTriCoreState
*env
= &tricore_cpu
->env
;
2156 #elif defined(TARGET_S390X)
2157 S390CPU
*s390_cpu
= S390_CPU(cpu
);
2158 CPUS390XState
*env
= &s390_cpu
->env
;
2161 cpu_synchronize_state(cpu
);
2163 info
= g_malloc0(sizeof(*info
));
2164 info
->value
= g_malloc0(sizeof(*info
->value
));
2165 info
->value
->CPU
= cpu
->cpu_index
;
2166 info
->value
->current
= (cpu
== first_cpu
);
2167 info
->value
->halted
= cpu
->halted
;
2168 info
->value
->qom_path
= object_get_canonical_path(OBJECT(cpu
));
2169 info
->value
->thread_id
= cpu
->thread_id
;
2170 #if defined(TARGET_I386)
2171 info
->value
->arch
= CPU_INFO_ARCH_X86
;
2172 info
->value
->u
.x86
.pc
= env
->eip
+ env
->segs
[R_CS
].base
;
2173 #elif defined(TARGET_PPC)
2174 info
->value
->arch
= CPU_INFO_ARCH_PPC
;
2175 info
->value
->u
.ppc
.nip
= env
->nip
;
2176 #elif defined(TARGET_SPARC)
2177 info
->value
->arch
= CPU_INFO_ARCH_SPARC
;
2178 info
->value
->u
.q_sparc
.pc
= env
->pc
;
2179 info
->value
->u
.q_sparc
.npc
= env
->npc
;
2180 #elif defined(TARGET_MIPS)
2181 info
->value
->arch
= CPU_INFO_ARCH_MIPS
;
2182 info
->value
->u
.q_mips
.PC
= env
->active_tc
.PC
;
2183 #elif defined(TARGET_TRICORE)
2184 info
->value
->arch
= CPU_INFO_ARCH_TRICORE
;
2185 info
->value
->u
.tricore
.PC
= env
->PC
;
2186 #elif defined(TARGET_S390X)
2187 info
->value
->arch
= CPU_INFO_ARCH_S390
;
2188 info
->value
->u
.s390
.cpu_state
= env
->cpu_state
;
2189 #elif defined(TARGET_RISCV)
2190 info
->value
->arch
= CPU_INFO_ARCH_RISCV
;
2191 info
->value
->u
.riscv
.pc
= env
->pc
;
2193 info
->value
->arch
= CPU_INFO_ARCH_OTHER
;
2195 info
->value
->has_props
= !!mc
->cpu_index_to_instance_props
;
2196 if (info
->value
->has_props
) {
2197 CpuInstanceProperties
*props
;
2198 props
= g_malloc0(sizeof(*props
));
2199 *props
= mc
->cpu_index_to_instance_props(ms
, cpu
->cpu_index
);
2200 info
->value
->props
= props
;
2203 /* XXX: waiting for the qapi to support GSList */
2205 head
= cur_item
= info
;
2207 cur_item
->next
= info
;
2215 static CpuInfoArch
sysemu_target_to_cpuinfo_arch(SysEmuTarget target
)
2218 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2219 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2222 case SYS_EMU_TARGET_I386
:
2223 case SYS_EMU_TARGET_X86_64
:
2224 return CPU_INFO_ARCH_X86
;
2226 case SYS_EMU_TARGET_PPC
:
2227 case SYS_EMU_TARGET_PPCEMB
:
2228 case SYS_EMU_TARGET_PPC64
:
2229 return CPU_INFO_ARCH_PPC
;
2231 case SYS_EMU_TARGET_SPARC
:
2232 case SYS_EMU_TARGET_SPARC64
:
2233 return CPU_INFO_ARCH_SPARC
;
2235 case SYS_EMU_TARGET_MIPS
:
2236 case SYS_EMU_TARGET_MIPSEL
:
2237 case SYS_EMU_TARGET_MIPS64
:
2238 case SYS_EMU_TARGET_MIPS64EL
:
2239 return CPU_INFO_ARCH_MIPS
;
2241 case SYS_EMU_TARGET_TRICORE
:
2242 return CPU_INFO_ARCH_TRICORE
;
2244 case SYS_EMU_TARGET_S390X
:
2245 return CPU_INFO_ARCH_S390
;
2247 case SYS_EMU_TARGET_RISCV32
:
2248 case SYS_EMU_TARGET_RISCV64
:
2249 return CPU_INFO_ARCH_RISCV
;
2252 return CPU_INFO_ARCH_OTHER
;
2256 static void cpustate_to_cpuinfo_s390(CpuInfoS390
*info
, const CPUState
*cpu
)
2259 S390CPU
*s390_cpu
= S390_CPU(cpu
);
2260 CPUS390XState
*env
= &s390_cpu
->env
;
2262 info
->cpu_state
= env
->cpu_state
;
2269 * fast means: we NEVER interrupt vCPU threads to retrieve
2270 * information from KVM.
2272 CpuInfoFastList
*qmp_query_cpus_fast(Error
**errp
)
2274 MachineState
*ms
= MACHINE(qdev_get_machine());
2275 MachineClass
*mc
= MACHINE_GET_CLASS(ms
);
2276 CpuInfoFastList
*head
= NULL
, *cur_item
= NULL
;
2277 SysEmuTarget target
= qapi_enum_parse(&SysEmuTarget_lookup
, TARGET_NAME
,
2282 CpuInfoFastList
*info
= g_malloc0(sizeof(*info
));
2283 info
->value
= g_malloc0(sizeof(*info
->value
));
2285 info
->value
->cpu_index
= cpu
->cpu_index
;
2286 info
->value
->qom_path
= object_get_canonical_path(OBJECT(cpu
));
2287 info
->value
->thread_id
= cpu
->thread_id
;
2289 info
->value
->has_props
= !!mc
->cpu_index_to_instance_props
;
2290 if (info
->value
->has_props
) {
2291 CpuInstanceProperties
*props
;
2292 props
= g_malloc0(sizeof(*props
));
2293 *props
= mc
->cpu_index_to_instance_props(ms
, cpu
->cpu_index
);
2294 info
->value
->props
= props
;
2297 info
->value
->arch
= sysemu_target_to_cpuinfo_arch(target
);
2298 info
->value
->target
= target
;
2299 if (target
== SYS_EMU_TARGET_S390X
) {
2300 cpustate_to_cpuinfo_s390(&info
->value
->u
.s390x
, cpu
);
2304 head
= cur_item
= info
;
2306 cur_item
->next
= info
;
2314 void qmp_memsave(int64_t addr
, int64_t size
, const char *filename
,
2315 bool has_cpu
, int64_t cpu_index
, Error
**errp
)
2321 int64_t orig_addr
= addr
, orig_size
= size
;
2327 cpu
= qemu_get_cpu(cpu_index
);
2329 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cpu-index",
2334 f
= fopen(filename
, "wb");
2336 error_setg_file_open(errp
, errno
, filename
);
2344 if (cpu_memory_rw_debug(cpu
, addr
, buf
, l
, 0) != 0) {
2345 error_setg(errp
, "Invalid addr 0x%016" PRIx64
"/size %" PRId64
2346 " specified", orig_addr
, orig_size
);
2349 if (fwrite(buf
, 1, l
, f
) != l
) {
2350 error_setg(errp
, QERR_IO_ERROR
);
2361 void qmp_pmemsave(int64_t addr
, int64_t size
, const char *filename
,
2368 f
= fopen(filename
, "wb");
2370 error_setg_file_open(errp
, errno
, filename
);
2378 cpu_physical_memory_read(addr
, buf
, l
);
2379 if (fwrite(buf
, 1, l
, f
) != l
) {
2380 error_setg(errp
, QERR_IO_ERROR
);
2391 void qmp_inject_nmi(Error
**errp
)
2393 nmi_monitor_handle(monitor_get_cpu_index(), errp
);
2396 void dump_drift_info(FILE *f
, fprintf_function cpu_fprintf
)
2402 cpu_fprintf(f
, "Host - Guest clock %"PRIi64
" ms\n",
2403 (cpu_get_clock() - cpu_get_icount())/SCALE_MS
);
2404 if (icount_align_option
) {
2405 cpu_fprintf(f
, "Max guest delay %"PRIi64
" ms\n", -max_delay
/SCALE_MS
);
2406 cpu_fprintf(f
, "Max guest advance %"PRIi64
" ms\n", max_advance
/SCALE_MS
);
2408 cpu_fprintf(f
, "Max guest delay NA\n");
2409 cpu_fprintf(f
, "Max guest advance NA\n");