cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu/config-file.h"
  27 #include "cpu.h"
  28 #include "monitor/monitor.h"
  29 #include "qapi/error.h"
  30 #include "qapi/qapi-commands-misc.h"
  31 #include "qapi/qapi-events-run-state.h"
  32 #include "qapi/qmp/qerror.h"
  33 #include "qemu/error-report.h"
  34 #include "sysemu/sysemu.h"
  35 #include "sysemu/block-backend.h"
  36 #include "exec/gdbstub.h"
  37 #include "sysemu/dma.h"
  38 #include "sysemu/hw_accel.h"
  39 #include "sysemu/kvm.h"
  40 #include "sysemu/hax.h"
  41 #include "sysemu/hvf.h"
  42 #include "sysemu/whpx.h"
  43 #include "exec/exec-all.h"
  44
  45 #include "qemu/thread.h"
  46 #include "sysemu/cpus.h"
  47 #include "sysemu/qtest.h"
  48 #include "qemu/main-loop.h"
  49 #include "qemu/option.h"
  50 #include "qemu/bitmap.h"
  51 #include "qemu/seqlock.h"
  52 #include "tcg.h"
  53 #include "hw/nmi.h"
  54 #include "sysemu/replay.h"
  55 #include "hw/boards.h"
  56
  57 #ifdef CONFIG_LINUX
  58
  59 #include <sys/prctl.h>
  60
  61 #ifndef PR_MCE_KILL
  62 #define PR_MCE_KILL 33
  63 #endif
  64
  65 #ifndef PR_MCE_KILL_SET
  66 #define PR_MCE_KILL_SET 1
  67 #endif
  68
  69 #ifndef PR_MCE_KILL_EARLY
  70 #define PR_MCE_KILL_EARLY 1
  71 #endif
  72
  73 #endif /* CONFIG_LINUX */
  74
   75 int64_t max_delay;   /* NOTE(review): not referenced in the code visible here -- confirm users */
   76 int64_t max_advance; /* NOTE(review): not referenced in the code visible here -- confirm users */
   77
   78 /* vcpu throttling controls */
   79 static QEMUTimer *throttle_timer;        /* created in cpu_ticks_init(); runs cpu_throttle_timer_tick() */
   80 static unsigned int throttle_percentage; /* 0 means throttling is off; accessed via atomic_read/atomic_set */
   81
   82 #define CPU_THROTTLE_PCT_MIN 1  /* lower clamp applied by cpu_throttle_set() */
   83 #define CPU_THROTTLE_PCT_MAX 99 /* upper clamp applied by cpu_throttle_set() */
   84 #define CPU_THROTTLE_TIMESLICE_NS 10000000 /* 10 ms expressed in nanoseconds */
  85
  86 bool cpu_is_stopped(CPUState *cpu)
  87 {
  88     return cpu->stopped || !runstate_is_running();
  89 }
  90
  91 static bool cpu_thread_is_idle(CPUState *cpu)
  92 {
  93     if (cpu->stop || cpu->queued_work_first) {
  94         return false;
  95     }
  96     if (cpu_is_stopped(cpu)) {
  97         return true;
  98     }
  99     if (!cpu->halted || cpu_has_work(cpu) ||
 100         kvm_halt_in_kernel()) {
 101         return false;
 102     }
 103     return true;
 104 }
 105
 106 static bool all_cpu_threads_idle(void)
 107 {
 108     CPUState *cpu;
 109
 110     CPU_FOREACH(cpu) {
 111         if (!cpu_thread_is_idle(cpu)) {
 112             return false;
 113         }
 114     }
 115     return true;
 116 }
 117
 118 /***********************************************************/
 119 /* guest cycle counter */
 120
 121 /* Protected by TimersState seqlock */
 122
  123 static bool icount_sleep = true; /* "sleep" option read by configure_icount(); defaults to true */
  124 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
  125 #define MAX_ICOUNT_SHIFT 10 /* upper bound icount_adjust() applies to icount_time_shift */
 126
  127 typedef struct TimersState {
          /*
           * Bookkeeping for the host-tick counter, the VM clock and the
           * instruction counter used by the cpu_get_*() helpers in this file.
           */
  128     /* Protected by BQL.  */
          /* Last value handed out by cpu_get_ticks_locked(); keeps ticks monotonic. */
  129     int64_t cpu_ticks_prev;
          /* Added to cpu_get_host_ticks() while ticks are enabled. */
  130     int64_t cpu_ticks_offset;
  131
  132     /* Protect fields that can be respectively read outside the
  133      * BQL, and written from multiple threads.
  134      */
  135     QemuSeqLock vm_clock_seqlock;
  136     QemuSpin vm_clock_lock;
  137
          /* Set to 1 by cpu_enable_ticks() and to 0 by cpu_disable_ticks(). */
  138     int16_t cpu_ticks_enabled;
  139
  140     /* Conversion factor from emulated instructions to virtual clock ticks.  */
          /* Used as a left-shift count: see cpu_icount_to_ns(). */
  141     int16_t icount_time_shift;
  142
  143     /* Compensate for varying guest execution speed.  */
  144     int64_t qemu_icount_bias;
  145
          /* Start of a pending clock warp, or -1 when none (see icount_warp_rt()). */
  146     int64_t vm_clock_warp_start;
          /* Added to get_clock() while ticks are enabled (cpu_get_clock_locked()). */
  147     int64_t cpu_clock_offset;
  148
  149     /* Only written by TCG thread */
  150     int64_t qemu_icount;
  151
  152     /* for adjusting icount */
          /* rt/vm timers are created by configure_icount() for shift=auto only. */
  153     QEMUTimer *icount_rt_timer;
  154     QEMUTimer *icount_vm_timer;
          /* Created by configure_icount() only when the "sleep" option is on. */
  155     QEMUTimer *icount_warp_timer;
  156 } TimersState;
 157
  158 static TimersState timers_state; /* single instance; locks are initialised in cpu_ticks_init() */
  159 bool mttcg_enabled;              /* set by qemu_tcg_configure() from the "thread" option */
 160
 161 /*
 162  * We default to false if we know other options have been enabled
 163  * which are currently incompatible with MTTCG. Otherwise when each
 164  * guest (target) has been updated to support:
 165  *   - atomic instructions
 166  *   - memory ordering primitives (barriers)
 167  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 168  *
 169  * Once a guest architecture has been converted to the new primitives
 170  * there are two remaining limitations to check.
 171  *
 172  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 173  * - The host must have a stronger memory order than the guest
 174  *
 175  * It may be possible in future to support strong guests on weak hosts
 176  * but that will require tagging all load/stores in a guest with their
 177  * implicit memory order requirements which would likely slow things
 178  * down a lot.
 179  */
 180
 181 static bool check_tcg_memory_orders_compatible(void)
 182 {
 183 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 184     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 185 #else
 186     return false;
 187 #endif
 188 }
 189
 190 static bool default_mttcg_enabled(void)
 191 {
 192     if (use_icount || TCG_OVERSIZED_GUEST) {
 193         return false;
 194     } else {
 195 #ifdef TARGET_SUPPORTS_MTTCG
 196         return check_tcg_memory_orders_compatible();
 197 #else
 198         return false;
 199 #endif
 200     }
 201 }
 202
 203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 204 {
 205     const char *t = qemu_opt_get(opts, "thread");
 206     if (t) {
 207         if (strcmp(t, "multi") == 0) {
 208             if (TCG_OVERSIZED_GUEST) {
 209                 error_setg(errp, "No MTTCG when guest word size > hosts");
 210             } else if (use_icount) {
 211                 error_setg(errp, "No MTTCG when icount is enabled");
 212             } else {
 213 #ifndef TARGET_SUPPORTS_MTTCG
 214                 error_report("Guest not yet converted to MTTCG - "
 215                              "you may get unexpected results");
 216 #endif
 217                 if (!check_tcg_memory_orders_compatible()) {
 218                     error_report("Guest expects a stronger memory ordering "
 219                                  "than the host provides");
 220                     error_printf("This may cause strange/hard to debug errors\n");
 221                 }
 222                 mttcg_enabled = true;
 223             }
 224         } else if (strcmp(t, "single") == 0) {
 225             mttcg_enabled = false;
 226         } else {
 227             error_setg(errp, "Invalid 'thread' setting %s", t);
 228         }
 229     } else {
 230         mttcg_enabled = default_mttcg_enabled();
 231     }
 232 }
 233
 234 /* The current number of executed instructions is based on what we
 235  * originally budgeted minus the current state of the decrementing
 236  * icount counters in extra/u16.low.
 237  */
 238 static int64_t cpu_get_icount_executed(CPUState *cpu)
 239 {
 240     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
 241 }
 242
 243 /*
 244  * Update the global shared timer_state.qemu_icount to take into
 245  * account executed instructions. This is done by the TCG vCPU
 246  * thread so the main-loop can see time has moved forward.
 247  */
 248 static void cpu_update_icount_locked(CPUState *cpu)
 249 {
 250     int64_t executed = cpu_get_icount_executed(cpu);
 251     cpu->icount_budget -= executed;
 252
 253     atomic_set_i64(&timers_state.qemu_icount,
 254                    timers_state.qemu_icount + executed);
 255 }
 256
 257 /*
 258  * Update the global shared timer_state.qemu_icount to take into
 259  * account executed instructions. This is done by the TCG vCPU
 260  * thread so the main-loop can see time has moved forward.
 261  */
 262 void cpu_update_icount(CPUState *cpu)
 263 {
 264     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 265                        &timers_state.vm_clock_lock);
 266     cpu_update_icount_locked(cpu);
 267     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 268                          &timers_state.vm_clock_lock);
 269 }
 270
 271 static int64_t cpu_get_icount_raw_locked(void)
 272 {
 273     CPUState *cpu = current_cpu;
 274
 275     if (cpu && cpu->running) {
 276         if (!cpu->can_do_io) {
 277             error_report("Bad icount read");
 278             exit(1);
 279         }
 280         /* Take into account what has run */
 281         cpu_update_icount_locked(cpu);
 282     }
 283     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 284     return atomic_read_i64(&timers_state.qemu_icount);
 285 }
 286
 287 static int64_t cpu_get_icount_locked(void)
 288 {
 289     int64_t icount = cpu_get_icount_raw_locked();
 290     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 291         cpu_icount_to_ns(icount);
 292 }
 293
 294 int64_t cpu_get_icount_raw(void)
 295 {
 296     int64_t icount;
 297     unsigned start;
 298
 299     do {
 300         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 301         icount = cpu_get_icount_raw_locked();
 302     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 303
 304     return icount;
 305 }
 306
 307 /* Return the virtual CPU time, based on the instruction counter.  */
 308 int64_t cpu_get_icount(void)
 309 {
 310     int64_t icount;
 311     unsigned start;
 312
 313     do {
 314         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 315         icount = cpu_get_icount_locked();
 316     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 317
 318     return icount;
 319 }
 320
 321 int64_t cpu_icount_to_ns(int64_t icount)
 322 {
 323     return icount << atomic_read(&timers_state.icount_time_shift);
 324 }
 325
 326 static int64_t cpu_get_ticks_locked(void)
 327 {
 328     int64_t ticks = timers_state.cpu_ticks_offset;
 329     if (timers_state.cpu_ticks_enabled) {
 330         ticks += cpu_get_host_ticks();
 331     }
 332
 333     if (timers_state.cpu_ticks_prev > ticks) {
 334         /* Non increasing ticks may happen if the host uses software suspend.  */
 335         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 336         ticks = timers_state.cpu_ticks_prev;
 337     }
 338
 339     timers_state.cpu_ticks_prev = ticks;
 340     return ticks;
 341 }
 342
 343 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 344  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 345  * counter.
 346  */
 347 int64_t cpu_get_ticks(void)
 348 {
 349     int64_t ticks;
 350
 351     if (use_icount) {
 352         return cpu_get_icount();
 353     }
 354
 355     qemu_spin_lock(&timers_state.vm_clock_lock);
 356     ticks = cpu_get_ticks_locked();
 357     qemu_spin_unlock(&timers_state.vm_clock_lock);
 358     return ticks;
 359 }
 360
 361 static int64_t cpu_get_clock_locked(void)
 362 {
 363     int64_t time;
 364
 365     time = timers_state.cpu_clock_offset;
 366     if (timers_state.cpu_ticks_enabled) {
 367         time += get_clock();
 368     }
 369
 370     return time;
 371 }
 372
 373 /* Return the monotonic time elapsed in VM, i.e.,
 374  * the time between vm_start and vm_stop
 375  */
 376 int64_t cpu_get_clock(void)
 377 {
 378     int64_t ti;
 379     unsigned start;
 380
 381     do {
 382         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 383         ti = cpu_get_clock_locked();
 384     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 385
 386     return ti;
 387 }
 388
 389 /* enable cpu_get_ticks()
 390  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 391  */
 392 void cpu_enable_ticks(void)
 393 {
 394     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 395                        &timers_state.vm_clock_lock);
 396     if (!timers_state.cpu_ticks_enabled) {
 397         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 398         timers_state.cpu_clock_offset -= get_clock();
 399         timers_state.cpu_ticks_enabled = 1;
 400     }
 401     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 402                        &timers_state.vm_clock_lock);
 403 }
 404
 405 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 406  * cpu_get_ticks() after that.
 407  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 408  */
 409 void cpu_disable_ticks(void)
 410 {
 411     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 412                        &timers_state.vm_clock_lock);
 413     if (timers_state.cpu_ticks_enabled) {
 414         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 415         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 416         timers_state.cpu_ticks_enabled = 0;
 417     }
 418     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 419                          &timers_state.vm_clock_lock);
 420 }
 421
 422 /* Correlation between real and virtual time is always going to be
 423    fairly approximate, so ignore small variation.
 424    When the guest is idle real and virtual time will be aligned in
 425    the IO wait loop.  */
 426 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 427
 428 static void icount_adjust(void)
 429 {
 430     int64_t cur_time;
 431     int64_t cur_icount;
 432     int64_t delta;
 433
 434     /* Protected by TimersState mutex.  */
 435     static int64_t last_delta;
 436
 437     /* If the VM is not running, then do nothing.  */
 438     if (!runstate_is_running()) {
 439         return;
 440     }
 441
 442     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 443                        &timers_state.vm_clock_lock);
 444     cur_time = cpu_get_clock_locked();
 445     cur_icount = cpu_get_icount_locked();
 446
 447     delta = cur_icount - cur_time;
 448     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 449     if (delta > 0
 450         && last_delta + ICOUNT_WOBBLE < delta * 2
 451         && timers_state.icount_time_shift > 0) {
 452         /* The guest is getting too far ahead.  Slow time down.  */
 453         atomic_set(&timers_state.icount_time_shift,
 454                    timers_state.icount_time_shift - 1);
 455     }
 456     if (delta < 0
 457         && last_delta - ICOUNT_WOBBLE > delta * 2
 458         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 459         /* The guest is getting too far behind.  Speed time up.  */
 460         atomic_set(&timers_state.icount_time_shift,
 461                    timers_state.icount_time_shift + 1);
 462     }
 463     last_delta = delta;
 464     atomic_set_i64(&timers_state.qemu_icount_bias,
 465                    cur_icount - (timers_state.qemu_icount
 466                                  << timers_state.icount_time_shift));
 467     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 468                          &timers_state.vm_clock_lock);
 469 }
 470
 471 static void icount_adjust_rt(void *opaque)
 472 {
 473     timer_mod(timers_state.icount_rt_timer,
 474               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 475     icount_adjust();
 476 }
 477
 478 static void icount_adjust_vm(void *opaque)
 479 {
 480     timer_mod(timers_state.icount_vm_timer,
 481                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 482                    NANOSECONDS_PER_SECOND / 10);
 483     icount_adjust();
 484 }
 485
 486 static int64_t qemu_icount_round(int64_t count)
 487 {
 488     int shift = atomic_read(&timers_state.icount_time_shift);
 489     return (count + (1 << shift) - 1) >> shift;
 490 }
 491
 492 static void icount_warp_rt(void)
 493 {
 494     unsigned seq;
 495     int64_t warp_start;
 496
 497     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 498      * changes from -1 to another value, so the race here is okay.
 499      */
 500     do {
 501         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 502         warp_start = timers_state.vm_clock_warp_start;
 503     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 504
 505     if (warp_start == -1) {
 506         return;
 507     }
 508
 509     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 510                        &timers_state.vm_clock_lock);
 511     if (runstate_is_running()) {
 512         int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
 513                                      cpu_get_clock_locked());
 514         int64_t warp_delta;
 515
 516         warp_delta = clock - timers_state.vm_clock_warp_start;
 517         if (use_icount == 2) {
 518             /*
 519              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 520              * far ahead of real time.
 521              */
 522             int64_t cur_icount = cpu_get_icount_locked();
 523             int64_t delta = clock - cur_icount;
 524             warp_delta = MIN(warp_delta, delta);
 525         }
 526         atomic_set_i64(&timers_state.qemu_icount_bias,
 527                        timers_state.qemu_icount_bias + warp_delta);
 528     }
 529     timers_state.vm_clock_warp_start = -1;
 530     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 531                        &timers_state.vm_clock_lock);
 532
 533     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 534         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 535     }
 536 }
 537
 538 static void icount_timer_cb(void *opaque)
 539 {
 540     /* No need for a checkpoint because the timer already synchronizes
 541      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 542      */
 543     icount_warp_rt();
 544 }
 545
 546 void qtest_clock_warp(int64_t dest)
 547 {
 548     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 549     AioContext *aio_context;
 550     assert(qtest_enabled());
 551     aio_context = qemu_get_aio_context();
 552     while (clock < dest) {
 553         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 554         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 555
 556         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 557                            &timers_state.vm_clock_lock);
 558         atomic_set_i64(&timers_state.qemu_icount_bias,
 559                        timers_state.qemu_icount_bias + warp);
 560         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 561                              &timers_state.vm_clock_lock);
 562
 563         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 564         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 565         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 566     }
 567     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 568 }
 569
 570 void qemu_start_warp_timer(void)
 571 {
 572     int64_t clock;
 573     int64_t deadline;
 574
 575     if (!use_icount) {
 576         return;
 577     }
 578
 579     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 580      * do not fire, so computing the deadline does not make sense.
 581      */
 582     if (!runstate_is_running()) {
 583         return;
 584     }
 585
 586     if (replay_mode != REPLAY_MODE_PLAY) {
 587         if (!all_cpu_threads_idle()) {
 588             return;
 589         }
 590
 591         if (qtest_enabled()) {
 592             /* When testing, qtest commands advance icount.  */
 593             return;
 594         }
 595
 596         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 597     } else {
 598         /* warp clock deterministically in record/replay mode */
 599         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 600             /* vCPU is sleeping and warp can't be started.
 601                It is probably a race condition: notification sent
 602                to vCPU was processed in advance and vCPU went to sleep.
 603                Therefore we have to wake it up for doing someting. */
 604             if (replay_has_checkpoint()) {
 605                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 606             }
 607             return;
 608         }
 609     }
 610
 611     /* We want to use the earliest deadline from ALL vm_clocks */
 612     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 613     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 614     if (deadline < 0) {
 615         static bool notified;
 616         if (!icount_sleep && !notified) {
 617             warn_report("icount sleep disabled and no active timers");
 618             notified = true;
 619         }
 620         return;
 621     }
 622
 623     if (deadline > 0) {
 624         /*
 625          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 626          * sleep.  Otherwise, the CPU might be waiting for a future timer
 627          * interrupt to wake it up, but the interrupt never comes because
 628          * the vCPU isn't running any insns and thus doesn't advance the
 629          * QEMU_CLOCK_VIRTUAL.
 630          */
 631         if (!icount_sleep) {
 632             /*
 633              * We never let VCPUs sleep in no sleep icount mode.
 634              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 635              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 636              * It is useful when we want a deterministic execution time,
 637              * isolated from host latencies.
 638              */
 639             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 640                                &timers_state.vm_clock_lock);
 641             atomic_set_i64(&timers_state.qemu_icount_bias,
 642                            timers_state.qemu_icount_bias + deadline);
 643             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 644                                  &timers_state.vm_clock_lock);
 645             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 646         } else {
 647             /*
 648              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 649              * "real" time, (related to the time left until the next event) has
 650              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 651              * This avoids that the warps are visible externally; for example,
 652              * you will not be sending network packets continuously instead of
 653              * every 100ms.
 654              */
 655             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 656                                &timers_state.vm_clock_lock);
 657             if (timers_state.vm_clock_warp_start == -1
 658                 || timers_state.vm_clock_warp_start > clock) {
 659                 timers_state.vm_clock_warp_start = clock;
 660             }
 661             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 662                                  &timers_state.vm_clock_lock);
 663             timer_mod_anticipate(timers_state.icount_warp_timer,
 664                                  clock + deadline);
 665         }
 666     } else if (deadline == 0) {
 667         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 668     }
 669 }
 670
 671 static void qemu_account_warp_timer(void)
 672 {
 673     if (!use_icount || !icount_sleep) {
 674         return;
 675     }
 676
 677     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 678      * do not fire, so computing the deadline does not make sense.
 679      */
 680     if (!runstate_is_running()) {
 681         return;
 682     }
 683
 684     /* warp clock deterministically in record/replay mode */
 685     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 686         return;
 687     }
 688
 689     timer_del(timers_state.icount_warp_timer);
 690     icount_warp_rt();
 691 }
 692
 693 static bool icount_state_needed(void *opaque)
 694 {
 695     return use_icount;
 696 }
 697
 698 static bool warp_timer_state_needed(void *opaque)
 699 {
 700     TimersState *s = opaque;
 701     return s->icount_warp_timer != NULL;
 702 }
 703
 704 static bool adjust_timers_state_needed(void *opaque)
 705 {
 706     TimersState *s = opaque;
 707     return s->icount_rt_timer != NULL;
 708 }
 709
 710 /*
 711  * Subsection for warp timer migration is optional, because may not be created
 712  */
  713 static const VMStateDescription icount_vmstate_warp_timer = {
  714     .name = "timer/icount/warp_timer",
  715     .version_id = 1,
  716     .minimum_version_id = 1,
          /* warp_timer_state_needed() is true only once icount_warp_timer exists. */
  717     .needed = warp_timer_state_needed,
  718     .fields = (VMStateField[]) {
              /* Pending warp start time and the timer that completes the warp. */
  719         VMSTATE_INT64(vm_clock_warp_start, TimersState),
  720         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
  721         VMSTATE_END_OF_LIST()
  722     }
  723 };
 724
  725 static const VMStateDescription icount_vmstate_adjust_timers = {
  726     .name = "timer/icount/timers",
  727     .version_id = 1,
  728     .minimum_version_id = 1,
          /* adjust_timers_state_needed() is true only once icount_rt_timer exists. */
  729     .needed = adjust_timers_state_needed,
  730     .fields = (VMStateField[]) {
              /* The two timers that drive icount_adjust(). */
  731         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
  732         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
  733         VMSTATE_END_OF_LIST()
  734     }
  735 };
 736
 737 /*
 738  * This is a subsection for icount migration.
 739  */
  740 static const VMStateDescription icount_vmstate_timers = {
  741     .name = "timer/icount",
  742     .version_id = 1,
  743     .minimum_version_id = 1,
          /* icount_state_needed() returns use_icount. */
  744     .needed = icount_state_needed,
  745     .fields = (VMStateField[]) {
  746         VMSTATE_INT64(qemu_icount_bias, TimersState),
  747         VMSTATE_INT64(qemu_icount, TimersState),
  748         VMSTATE_END_OF_LIST()
  749     },
          /* Nested subsections, each guarded by its own .needed callback. */
  750     .subsections = (const VMStateDescription*[]) {
  751         &icount_vmstate_warp_timer,
  752         &icount_vmstate_adjust_timers,
  753         NULL
  754     }
  755 };
 756
      /* Top-level description of TimersState; registered by cpu_ticks_init(). */
  757 static const VMStateDescription vmstate_timers = {
  758     .name = "timer",
  759     .version_id = 2,
  760     .minimum_version_id = 1,
  761     .fields = (VMStateField[]) {
  762         VMSTATE_INT64(cpu_ticks_offset, TimersState),
              /* NOTE(review): presumably 8 bytes kept for stream compatibility -- confirm */
  763         VMSTATE_UNUSED(8),
              /* Versioned field: declared with version 2. */
  764         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
  765         VMSTATE_END_OF_LIST()
  766     },
  767     .subsections = (const VMStateDescription*[]) {
  768         &icount_vmstate_timers,
  769         NULL
  770     }
  771 };
 772
 773 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 774 {
 775     double pct;
 776     double throttle_ratio;
 777     long sleeptime_ns;
 778
 779     if (!cpu_throttle_get_percentage()) {
 780         return;
 781     }
 782
 783     pct = (double)cpu_throttle_get_percentage()/100;
 784     throttle_ratio = pct / (1 - pct);
 785     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 786
 787     qemu_mutex_unlock_iothread();
 788     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 789     qemu_mutex_lock_iothread();
 790     atomic_set(&cpu->throttle_thread_scheduled, 0);
 791 }
 792
 793 static void cpu_throttle_timer_tick(void *opaque)
 794 {
 795     CPUState *cpu;
 796     double pct;
 797
 798     /* Stop the timer if needed */
 799     if (!cpu_throttle_get_percentage()) {
 800         return;
 801     }
 802     CPU_FOREACH(cpu) {
 803         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 804             async_run_on_cpu(cpu, cpu_throttle_thread,
 805                              RUN_ON_CPU_NULL);
 806         }
 807     }
 808
 809     pct = (double)cpu_throttle_get_percentage()/100;
 810     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 811                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 812 }
 813
 814 void cpu_throttle_set(int new_throttle_pct)
 815 {
 816     /* Ensure throttle percentage is within valid range */
 817     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 818     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 819
 820     atomic_set(&throttle_percentage, new_throttle_pct);
 821
 822     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 823                                        CPU_THROTTLE_TIMESLICE_NS);
 824 }
 825
 826 void cpu_throttle_stop(void)
 827 {
 828     atomic_set(&throttle_percentage, 0);
 829 }
 830
 831 bool cpu_throttle_active(void)
 832 {
 833     return (cpu_throttle_get_percentage() != 0);
 834 }
 835
 836 int cpu_throttle_get_percentage(void)
 837 {
 838     return atomic_read(&throttle_percentage);
 839 }
 840
 841 void cpu_ticks_init(void)
 842 {
 843     seqlock_init(&timers_state.vm_clock_seqlock);
 844     qemu_spin_init(&timers_state.vm_clock_lock);
 845     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 846     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 847                                            cpu_throttle_timer_tick, NULL);
 848 }
 849
 850 void configure_icount(QemuOpts *opts, Error **errp)
 851 {
 852     const char *option;
 853     char *rem_str = NULL;
 854
 855     option = qemu_opt_get(opts, "shift");
 856     if (!option) {
 857         if (qemu_opt_get(opts, "align") != NULL) {
 858             error_setg(errp, "Please specify shift option when using align");
 859         }
 860         return;
 861     }
 862
 863     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 864     if (icount_sleep) {
 865         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 866                                          icount_timer_cb, NULL);
 867     }
 868
 869     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 870
 871     if (icount_align_option && !icount_sleep) {
 872         error_setg(errp, "align=on and sleep=off are incompatible");
 873     }
 874     if (strcmp(option, "auto") != 0) {
 875         errno = 0;
 876         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 877         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 878             error_setg(errp, "icount: Invalid shift value");
 879         }
 880         use_icount = 1;
 881         return;
 882     } else if (icount_align_option) {
 883         error_setg(errp, "shift=auto and align=on are incompatible");
 884     } else if (!icount_sleep) {
 885         error_setg(errp, "shift=auto and sleep=off are incompatible");
 886     }
 887
 888     use_icount = 2;
 889
 890     /* 125MIPS seems a reasonable initial guess at the guest speed.
 891        It will be corrected fairly quickly anyway.  */
 892     timers_state.icount_time_shift = 3;
 893
 894     /* Have both realtime and virtual time triggers for speed adjustment.
 895        The realtime trigger catches emulated time passing too slowly,
 896        the virtual time trigger catches emulated time passing too fast.
 897        Realtime triggers occur even when idle, so use them less frequently
 898        than VM triggers.  */
 899     timers_state.vm_clock_warp_start = -1;
 900     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 901                                    icount_adjust_rt, NULL);
 902     timer_mod(timers_state.icount_rt_timer,
 903                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 904     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 905                                         icount_adjust_vm, NULL);
 906     timer_mod(timers_state.icount_vm_timer,
 907                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 908                    NANOSECONDS_PER_SECOND / 10);
 909 }
 910
 911 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single-threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
 922
 923 static QEMUTimer *tcg_kick_vcpu_timer;
 924 static CPUState *tcg_current_rr_cpu;
 925
 926 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 927
 928 static inline int64_t qemu_tcg_next_kick(void)
 929 {
 930     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 931 }
 932
 933 /* Kick the currently round-robin scheduled vCPU */
 934 static void qemu_cpu_kick_rr_cpu(void)
 935 {
 936     CPUState *cpu;
 937     do {
 938         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 939         if (cpu) {
 940             cpu_exit(cpu);
 941         }
 942     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 943 }
 944
 945 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
 946 {
 947 }
 948
 949 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 950 {
 951     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 952         qemu_notify_event();
 953         return;
 954     }
 955
 956     if (qemu_in_vcpu_thread()) {
 957         /* A CPU is currently running; kick it back out to the
 958          * tcg_cpu_exec() loop so it will recalculate its
 959          * icount deadline immediately.
 960          */
 961         qemu_cpu_kick(current_cpu);
 962     } else if (first_cpu) {
 963         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 964          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 965          * causes cpu_thread_is_idle to return false.  This way,
 966          * handle_icount_deadline can run.
 967          * If we have no CPUs at all for some reason, we don't
 968          * need to do anything.
 969          */
 970         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 971     }
 972 }
 973
 974 static void kick_tcg_thread(void *opaque)
 975 {
 976     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 977     qemu_cpu_kick_rr_cpu();
 978 }
 979
 980 static void start_tcg_kick_timer(void)
 981 {
 982     assert(!mttcg_enabled);
 983     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 984         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 985                                            kick_tcg_thread, NULL);
 986         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 987     }
 988 }
 989
 990 static void stop_tcg_kick_timer(void)
 991 {
 992     assert(!mttcg_enabled);
 993     if (tcg_kick_vcpu_timer) {
 994         timer_del(tcg_kick_vcpu_timer);
 995         tcg_kick_vcpu_timer = NULL;
 996     }
 997 }
 998
 999 /***********************************************************/
1000 void hw_error(const char *fmt, ...)
1001 {
1002     va_list ap;
1003     CPUState *cpu;
1004
1005     va_start(ap, fmt);
1006     fprintf(stderr, "qemu: hardware error: ");
1007     vfprintf(stderr, fmt, ap);
1008     fprintf(stderr, "\n");
1009     CPU_FOREACH(cpu) {
1010         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1011         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
1012     }
1013     va_end(ap);
1014     abort();
1015 }
1016
1017 void cpu_synchronize_all_states(void)
1018 {
1019     CPUState *cpu;
1020
1021     CPU_FOREACH(cpu) {
1022         cpu_synchronize_state(cpu);
1023         /* TODO: move to cpu_synchronize_state() */
1024         if (hvf_enabled()) {
1025             hvf_cpu_synchronize_state(cpu);
1026         }
1027     }
1028 }
1029
1030 void cpu_synchronize_all_post_reset(void)
1031 {
1032     CPUState *cpu;
1033
1034     CPU_FOREACH(cpu) {
1035         cpu_synchronize_post_reset(cpu);
1036         /* TODO: move to cpu_synchronize_post_reset() */
1037         if (hvf_enabled()) {
1038             hvf_cpu_synchronize_post_reset(cpu);
1039         }
1040     }
1041 }
1042
1043 void cpu_synchronize_all_post_init(void)
1044 {
1045     CPUState *cpu;
1046
1047     CPU_FOREACH(cpu) {
1048         cpu_synchronize_post_init(cpu);
1049         /* TODO: move to cpu_synchronize_post_init() */
1050         if (hvf_enabled()) {
1051             hvf_cpu_synchronize_post_init(cpu);
1052         }
1053     }
1054 }
1055
1056 void cpu_synchronize_all_pre_loadvm(void)
1057 {
1058     CPUState *cpu;
1059
1060     CPU_FOREACH(cpu) {
1061         cpu_synchronize_pre_loadvm(cpu);
1062     }
1063 }
1064
1065 static int do_vm_stop(RunState state, bool send_stop)
1066 {
1067     int ret = 0;
1068
1069     if (runstate_is_running()) {
1070         cpu_disable_ticks();
1071         pause_all_vcpus();
1072         runstate_set(state);
1073         vm_state_notify(0, state);
1074         if (send_stop) {
1075             qapi_event_send_stop();
1076         }
1077     }
1078
1079     bdrv_drain_all();
1080     replay_disable_events();
1081     ret = bdrv_flush_all();
1082
1083     return ret;
1084 }
1085
1086 /* Special vm_stop() variant for terminating the process.  Historically clients
1087  * did not expect a QMP STOP event and so we need to retain compatibility.
1088  */
1089 int vm_shutdown(void)
1090 {
1091     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1092 }
1093
1094 static bool cpu_can_run(CPUState *cpu)
1095 {
1096     if (cpu->stop) {
1097         return false;
1098     }
1099     if (cpu_is_stopped(cpu)) {
1100         return false;
1101     }
1102     return true;
1103 }
1104
1105 static void cpu_handle_guest_debug(CPUState *cpu)
1106 {
1107     gdb_set_stop_cpu(cpu);
1108     qemu_system_debug_request();
1109     cpu->stopped = true;
1110 }
1111
1112 #ifdef CONFIG_LINUX
1113 static void sigbus_reraise(void)
1114 {
1115     sigset_t set;
1116     struct sigaction action;
1117
1118     memset(&action, 0, sizeof(action));
1119     action.sa_handler = SIG_DFL;
1120     if (!sigaction(SIGBUS, &action, NULL)) {
1121         raise(SIGBUS);
1122         sigemptyset(&set);
1123         sigaddset(&set, SIGBUS);
1124         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1125     }
1126     perror("Failed to re-raise SIGBUS!\n");
1127     abort();
1128 }
1129
1130 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1131 {
1132     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1133         sigbus_reraise();
1134     }
1135
1136     if (current_cpu) {
1137         /* Called asynchronously in VCPU thread.  */
1138         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1139             sigbus_reraise();
1140         }
1141     } else {
1142         /* Called synchronously (via signalfd) in main thread.  */
1143         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1144             sigbus_reraise();
1145         }
1146     }
1147 }
1148
1149 static void qemu_init_sigbus(void)
1150 {
1151     struct sigaction action;
1152
1153     memset(&action, 0, sizeof(action));
1154     action.sa_flags = SA_SIGINFO;
1155     action.sa_sigaction = sigbus_handler;
1156     sigaction(SIGBUS, &action, NULL);
1157
1158     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1159 }
1160 #else /* !CONFIG_LINUX */
/* Without CONFIG_LINUX no SIGBUS handler is installed. */
static void qemu_init_sigbus(void)
{
    /* nothing to set up */
}
1164 #endif /* !CONFIG_LINUX */
1165
1166 static QemuMutex qemu_global_mutex;
1167
1168 static QemuThread io_thread;
1169
1170 /* cpu creation */
1171 static QemuCond qemu_cpu_cond;
1172 /* system init */
1173 static QemuCond qemu_pause_cond;
1174
1175 void qemu_init_cpu_loop(void)
1176 {
1177     qemu_init_sigbus();
1178     qemu_cond_init(&qemu_cpu_cond);
1179     qemu_cond_init(&qemu_pause_cond);
1180     qemu_mutex_init(&qemu_global_mutex);
1181
1182     qemu_thread_get_self(&io_thread);
1183 }
1184
1185 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1186 {
1187     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1188 }
1189
1190 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1191 {
1192     if (kvm_destroy_vcpu(cpu) < 0) {
1193         error_report("kvm_destroy_vcpu failed");
1194         exit(EXIT_FAILURE);
1195     }
1196 }
1197
1198 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1199 {
1200 }
1201
1202 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1203 {
1204     g_assert(qemu_cpu_is_self(cpu));
1205     cpu->stop = false;
1206     cpu->stopped = true;
1207     if (exit) {
1208         cpu_exit(cpu);
1209     }
1210     qemu_cond_broadcast(&qemu_pause_cond);
1211 }
1212
1213 static void qemu_wait_io_event_common(CPUState *cpu)
1214 {
1215     atomic_mb_set(&cpu->thread_kicked, false);
1216     if (cpu->stop) {
1217         qemu_cpu_stop(cpu, false);
1218     }
1219     process_queued_cpu_work(cpu);
1220 }
1221
1222 static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
1223 {
1224     while (all_cpu_threads_idle()) {
1225         stop_tcg_kick_timer();
1226         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1227     }
1228
1229     start_tcg_kick_timer();
1230
1231     qemu_wait_io_event_common(cpu);
1232 }
1233
1234 static void qemu_wait_io_event(CPUState *cpu)
1235 {
1236     while (cpu_thread_is_idle(cpu)) {
1237         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1238     }
1239
1240 #ifdef _WIN32
1241     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1242     if (!tcg_enabled()) {
1243         SleepEx(0, TRUE);
1244     }
1245 #endif
1246     qemu_wait_io_event_common(cpu);
1247 }
1248
1249 static void *qemu_kvm_cpu_thread_fn(void *arg)
1250 {
1251     CPUState *cpu = arg;
1252     int r;
1253
1254     rcu_register_thread();
1255
1256     qemu_mutex_lock_iothread();
1257     qemu_thread_get_self(cpu->thread);
1258     cpu->thread_id = qemu_get_thread_id();
1259     cpu->can_do_io = 1;
1260     current_cpu = cpu;
1261
1262     r = kvm_init_vcpu(cpu);
1263     if (r < 0) {
1264         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1265         exit(1);
1266     }
1267
1268     kvm_init_cpu_signals(cpu);
1269
1270     /* signal CPU creation */
1271     cpu->created = true;
1272     qemu_cond_signal(&qemu_cpu_cond);
1273
1274     do {
1275         if (cpu_can_run(cpu)) {
1276             r = kvm_cpu_exec(cpu);
1277             if (r == EXCP_DEBUG) {
1278                 cpu_handle_guest_debug(cpu);
1279             }
1280         }
1281         qemu_wait_io_event(cpu);
1282     } while (!cpu->unplug || cpu_can_run(cpu));
1283
1284     qemu_kvm_destroy_vcpu(cpu);
1285     cpu->created = false;
1286     qemu_cond_signal(&qemu_cpu_cond);
1287     qemu_mutex_unlock_iothread();
1288     rcu_unregister_thread();
1289     return NULL;
1290 }
1291
1292 static void *qemu_dummy_cpu_thread_fn(void *arg)
1293 {
1294 #ifdef _WIN32
1295     error_report("qtest is not supported under Windows");
1296     exit(1);
1297 #else
1298     CPUState *cpu = arg;
1299     sigset_t waitset;
1300     int r;
1301
1302     rcu_register_thread();
1303
1304     qemu_mutex_lock_iothread();
1305     qemu_thread_get_self(cpu->thread);
1306     cpu->thread_id = qemu_get_thread_id();
1307     cpu->can_do_io = 1;
1308     current_cpu = cpu;
1309
1310     sigemptyset(&waitset);
1311     sigaddset(&waitset, SIG_IPI);
1312
1313     /* signal CPU creation */
1314     cpu->created = true;
1315     qemu_cond_signal(&qemu_cpu_cond);
1316
1317     do {
1318         qemu_mutex_unlock_iothread();
1319         do {
1320             int sig;
1321             r = sigwait(&waitset, &sig);
1322         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1323         if (r == -1) {
1324             perror("sigwait");
1325             exit(1);
1326         }
1327         qemu_mutex_lock_iothread();
1328         qemu_wait_io_event(cpu);
1329     } while (!cpu->unplug);
1330
1331     rcu_unregister_thread();
1332     return NULL;
1333 #endif
1334 }
1335
1336 static int64_t tcg_get_icount_limit(void)
1337 {
1338     int64_t deadline;
1339
1340     if (replay_mode != REPLAY_MODE_PLAY) {
1341         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1342
1343         /* Maintain prior (possibly buggy) behaviour where if no deadline
1344          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1345          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1346          * nanoseconds.
1347          */
1348         if ((deadline < 0) || (deadline > INT32_MAX)) {
1349             deadline = INT32_MAX;
1350         }
1351
1352         return qemu_icount_round(deadline);
1353     } else {
1354         return replay_get_instructions();
1355     }
1356 }
1357
1358 static void handle_icount_deadline(void)
1359 {
1360     assert(qemu_in_vcpu_thread());
1361     if (use_icount) {
1362         int64_t deadline =
1363             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1364
1365         if (deadline == 0) {
1366             /* Wake up other AioContexts.  */
1367             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1368             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1369         }
1370     }
1371 }
1372
1373 static void prepare_icount_for_run(CPUState *cpu)
1374 {
1375     if (use_icount) {
1376         int insns_left;
1377
1378         /* These should always be cleared by process_icount_data after
1379          * each vCPU execution. However u16.high can be raised
1380          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1381          */
1382         g_assert(cpu->icount_decr.u16.low == 0);
1383         g_assert(cpu->icount_extra == 0);
1384
1385         cpu->icount_budget = tcg_get_icount_limit();
1386         insns_left = MIN(0xffff, cpu->icount_budget);
1387         cpu->icount_decr.u16.low = insns_left;
1388         cpu->icount_extra = cpu->icount_budget - insns_left;
1389
1390         replay_mutex_lock();
1391     }
1392 }
1393
1394 static void process_icount_data(CPUState *cpu)
1395 {
1396     if (use_icount) {
1397         /* Account for executed instructions */
1398         cpu_update_icount(cpu);
1399
1400         /* Reset the counters */
1401         cpu->icount_decr.u16.low = 0;
1402         cpu->icount_extra = 0;
1403         cpu->icount_budget = 0;
1404
1405         replay_account_executed_instructions();
1406
1407         replay_mutex_unlock();
1408     }
1409 }
1410
1411
1412 static int tcg_cpu_exec(CPUState *cpu)
1413 {
1414     int ret;
1415 #ifdef CONFIG_PROFILER
1416     int64_t ti;
1417 #endif
1418
1419     assert(tcg_enabled());
1420 #ifdef CONFIG_PROFILER
1421     ti = profile_getclock();
1422 #endif
1423     cpu_exec_start(cpu);
1424     ret = cpu_exec(cpu);
1425     cpu_exec_end(cpu);
1426 #ifdef CONFIG_PROFILER
1427     tcg_time += profile_getclock() - ti;
1428 #endif
1429     return ret;
1430 }
1431
1432 /* Destroy any remaining vCPUs which have been unplugged and have
1433  * finished running
1434  */
1435 static void deal_with_unplugged_cpus(void)
1436 {
1437     CPUState *cpu;
1438
1439     CPU_FOREACH(cpu) {
1440         if (cpu->unplug && !cpu_can_run(cpu)) {
1441             qemu_tcg_destroy_vcpu(cpu);
1442             cpu->created = false;
1443             qemu_cond_signal(&qemu_cpu_cond);
1444             break;
1445         }
1446     }
1447 }
1448
1449 /* Single-threaded TCG
1450  *
1451  * In the single-threaded case each vCPU is simulated in turn. If
1452  * there is more than a single vCPU we create a simple timer to kick
1453  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1454  * This is done explicitly rather than relying on side-effects
1455  * elsewhere.
1456  */
1457
1458 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1459 {
1460     CPUState *cpu = arg;
1461
1462     assert(tcg_enabled());
1463     rcu_register_thread();
1464     tcg_register_thread();
1465
1466     qemu_mutex_lock_iothread();
1467     qemu_thread_get_self(cpu->thread);
1468
1469     cpu->thread_id = qemu_get_thread_id();
1470     cpu->created = true;
1471     cpu->can_do_io = 1;
1472     qemu_cond_signal(&qemu_cpu_cond);
1473
1474     /* wait for initial kick-off after machine start */
1475     while (first_cpu->stopped) {
1476         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1477
1478         /* process any pending work */
1479         CPU_FOREACH(cpu) {
1480             current_cpu = cpu;
1481             qemu_wait_io_event_common(cpu);
1482         }
1483     }
1484
1485     start_tcg_kick_timer();
1486
1487     cpu = first_cpu;
1488
1489     /* process any pending work */
1490     cpu->exit_request = 1;
1491
1492     while (1) {
1493         qemu_mutex_unlock_iothread();
1494         replay_mutex_lock();
1495         qemu_mutex_lock_iothread();
1496         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1497         qemu_account_warp_timer();
1498
1499         /* Run the timers here.  This is much more efficient than
1500          * waking up the I/O thread and waiting for completion.
1501          */
1502         handle_icount_deadline();
1503
1504         replay_mutex_unlock();
1505
1506         if (!cpu) {
1507             cpu = first_cpu;
1508         }
1509
1510         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1511
1512             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1513             current_cpu = cpu;
1514
1515             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1516                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1517
1518             if (cpu_can_run(cpu)) {
1519                 int r;
1520
1521                 qemu_mutex_unlock_iothread();
1522                 prepare_icount_for_run(cpu);
1523
1524                 r = tcg_cpu_exec(cpu);
1525
1526                 process_icount_data(cpu);
1527                 qemu_mutex_lock_iothread();
1528
1529                 if (r == EXCP_DEBUG) {
1530                     cpu_handle_guest_debug(cpu);
1531                     break;
1532                 } else if (r == EXCP_ATOMIC) {
1533                     qemu_mutex_unlock_iothread();
1534                     cpu_exec_step_atomic(cpu);
1535                     qemu_mutex_lock_iothread();
1536                     break;
1537                 }
1538             } else if (cpu->stop) {
1539                 if (cpu->unplug) {
1540                     cpu = CPU_NEXT(cpu);
1541                 }
1542                 break;
1543             }
1544
1545             cpu = CPU_NEXT(cpu);
1546         } /* while (cpu && !cpu->exit_request).. */
1547
1548         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1549         atomic_set(&tcg_current_rr_cpu, NULL);
1550
1551         if (cpu && cpu->exit_request) {
1552             atomic_mb_set(&cpu->exit_request, 0);
1553         }
1554
1555         qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
1556         deal_with_unplugged_cpus();
1557     }
1558
1559     rcu_unregister_thread();
1560     return NULL;
1561 }
1562
1563 static void *qemu_hax_cpu_thread_fn(void *arg)
1564 {
1565     CPUState *cpu = arg;
1566     int r;
1567
1568     rcu_register_thread();
1569     qemu_mutex_lock_iothread();
1570     qemu_thread_get_self(cpu->thread);
1571
1572     cpu->thread_id = qemu_get_thread_id();
1573     cpu->created = true;
1574     cpu->halted = 0;
1575     current_cpu = cpu;
1576
1577     hax_init_vcpu(cpu);
1578     qemu_cond_signal(&qemu_cpu_cond);
1579
1580     do {
1581         if (cpu_can_run(cpu)) {
1582             r = hax_smp_cpu_exec(cpu);
1583             if (r == EXCP_DEBUG) {
1584                 cpu_handle_guest_debug(cpu);
1585             }
1586         }
1587
1588         qemu_wait_io_event(cpu);
1589     } while (!cpu->unplug || cpu_can_run(cpu));
1590     rcu_unregister_thread();
1591     return NULL;
1592 }
1593
1594 /* The HVF-specific vCPU thread function. This one should only run when the host
1595  * CPU supports the VMX "unrestricted guest" feature. */
1596 static void *qemu_hvf_cpu_thread_fn(void *arg)
1597 {
1598     CPUState *cpu = arg;
1599
1600     int r;
1601
1602     assert(hvf_enabled());
1603
1604     rcu_register_thread();
1605
1606     qemu_mutex_lock_iothread();
1607     qemu_thread_get_self(cpu->thread);
1608
1609     cpu->thread_id = qemu_get_thread_id();
1610     cpu->can_do_io = 1;
1611     current_cpu = cpu;
1612
1613     hvf_init_vcpu(cpu);
1614
1615     /* signal CPU creation */
1616     cpu->created = true;
1617     qemu_cond_signal(&qemu_cpu_cond);
1618
1619     do {
1620         if (cpu_can_run(cpu)) {
1621             r = hvf_vcpu_exec(cpu);
1622             if (r == EXCP_DEBUG) {
1623                 cpu_handle_guest_debug(cpu);
1624             }
1625         }
1626         qemu_wait_io_event(cpu);
1627     } while (!cpu->unplug || cpu_can_run(cpu));
1628
1629     hvf_vcpu_destroy(cpu);
1630     cpu->created = false;
1631     qemu_cond_signal(&qemu_cpu_cond);
1632     qemu_mutex_unlock_iothread();
1633     rcu_unregister_thread();
1634     return NULL;
1635 }
1636
1637 static void *qemu_whpx_cpu_thread_fn(void *arg)
1638 {
1639     CPUState *cpu = arg;
1640     int r;
1641
1642     rcu_register_thread();
1643
1644     qemu_mutex_lock_iothread();
1645     qemu_thread_get_self(cpu->thread);
1646     cpu->thread_id = qemu_get_thread_id();
1647     current_cpu = cpu;
1648
1649     r = whpx_init_vcpu(cpu);
1650     if (r < 0) {
1651         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1652         exit(1);
1653     }
1654
1655     /* signal CPU creation */
1656     cpu->created = true;
1657     qemu_cond_signal(&qemu_cpu_cond);
1658
1659     do {
1660         if (cpu_can_run(cpu)) {
1661             r = whpx_vcpu_exec(cpu);
1662             if (r == EXCP_DEBUG) {
1663                 cpu_handle_guest_debug(cpu);
1664             }
1665         }
1666         while (cpu_thread_is_idle(cpu)) {
1667             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1668         }
1669         qemu_wait_io_event_common(cpu);
1670     } while (!cpu->unplug || cpu_can_run(cpu));
1671
1672     whpx_destroy_vcpu(cpu);
1673     cpu->created = false;
1674     qemu_cond_signal(&qemu_cpu_cond);
1675     qemu_mutex_unlock_iothread();
1676     rcu_unregister_thread();
1677     return NULL;
1678 }
1679
#ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it on a vCPU thread. */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
    /* intentionally empty */
}
#endif
1685
1686 /* Multi-threaded TCG
1687  *
1688  * In the multi-threaded case each vCPU has its own thread. The TLS
1689  * variable current_cpu can be used deep in the code to find the
1690  * current CPUState for a given thread.
1691  */
1692
1693 static void *qemu_tcg_cpu_thread_fn(void *arg)
1694 {
1695     CPUState *cpu = arg;
1696
1697     assert(tcg_enabled());
1698     g_assert(!use_icount);
1699
1700     rcu_register_thread();
1701     tcg_register_thread();
1702
1703     qemu_mutex_lock_iothread();
1704     qemu_thread_get_self(cpu->thread);
1705
1706     cpu->thread_id = qemu_get_thread_id();
1707     cpu->created = true;
1708     cpu->can_do_io = 1;
1709     current_cpu = cpu;
1710     qemu_cond_signal(&qemu_cpu_cond);
1711
1712     /* process any pending work */
1713     cpu->exit_request = 1;
1714
1715     do {
1716         if (cpu_can_run(cpu)) {
1717             int r;
1718             qemu_mutex_unlock_iothread();
1719             r = tcg_cpu_exec(cpu);
1720             qemu_mutex_lock_iothread();
1721             switch (r) {
1722             case EXCP_DEBUG:
1723                 cpu_handle_guest_debug(cpu);
1724                 break;
1725             case EXCP_HALTED:
1726                 /* during start-up the vCPU is reset and the thread is
1727                  * kicked several times. If we don't ensure we go back
1728                  * to sleep in the halted state we won't cleanly
1729                  * start-up when the vCPU is enabled.
1730                  *
1731                  * cpu->halted should ensure we sleep in wait_io_event
1732                  */
1733                 g_assert(cpu->halted);
1734                 break;
1735             case EXCP_ATOMIC:
1736                 qemu_mutex_unlock_iothread();
1737                 cpu_exec_step_atomic(cpu);
1738                 qemu_mutex_lock_iothread();
1739             default:
1740                 /* Ignore everything else? */
1741                 break;
1742             }
1743         }
1744
1745         atomic_mb_set(&cpu->exit_request, 0);
1746         qemu_wait_io_event(cpu);
1747     } while (!cpu->unplug || cpu_can_run(cpu));
1748
1749     qemu_tcg_destroy_vcpu(cpu);
1750     cpu->created = false;
1751     qemu_cond_signal(&qemu_cpu_cond);
1752     qemu_mutex_unlock_iothread();
1753     rcu_unregister_thread();
1754     return NULL;
1755 }
1756
1757 static void qemu_cpu_kick_thread(CPUState *cpu)
1758 {
1759 #ifndef _WIN32
1760     int err;
1761
1762     if (cpu->thread_kicked) {
1763         return;
1764     }
1765     cpu->thread_kicked = true;
1766     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1767     if (err) {
1768         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1769         exit(1);
1770     }
1771 #else /* _WIN32 */
1772     if (!qemu_cpu_is_self(cpu)) {
1773         if (whpx_enabled()) {
1774             whpx_vcpu_kick(cpu);
1775         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1776             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1777                     __func__, GetLastError());
1778             exit(1);
1779         }
1780     }
1781 #endif
1782 }
1783
1784 void qemu_cpu_kick(CPUState *cpu)
1785 {
1786     qemu_cond_broadcast(cpu->halt_cond);
1787     if (tcg_enabled()) {
1788         cpu_exit(cpu);
1789         /* NOP unless doing single-thread RR */
1790         qemu_cpu_kick_rr_cpu();
1791     } else {
1792         if (hax_enabled()) {
1793             /*
1794              * FIXME: race condition with the exit_request check in
1795              * hax_vcpu_hax_exec
1796              */
1797             cpu->exit_request = 1;
1798         }
1799         qemu_cpu_kick_thread(cpu);
1800     }
1801 }
1802
1803 void qemu_cpu_kick_self(void)
1804 {
1805     assert(current_cpu);
1806     qemu_cpu_kick_thread(current_cpu);
1807 }
1808
1809 bool qemu_cpu_is_self(CPUState *cpu)
1810 {
1811     return qemu_thread_is_self(cpu->thread);
1812 }
1813
1814 bool qemu_in_vcpu_thread(void)
1815 {
1816     return current_cpu && qemu_cpu_is_self(current_cpu);
1817 }
1818
/* Per-thread flag: true while this thread holds the iothread lock. */
static __thread bool iothread_locked;

/* Return whether the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1825
1826 /*
1827  * The BQL is taken from so many places that it is worth profiling the
1828  * callers directly, instead of funneling them all through a single function.
1829  */
1830 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1831 {
1832     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1833
1834     g_assert(!qemu_mutex_iothread_locked());
1835     bql_lock(&qemu_global_mutex, file, line);
1836     iothread_locked = true;
1837 }
1838
1839 void qemu_mutex_unlock_iothread(void)
1840 {
1841     g_assert(qemu_mutex_iothread_locked());
1842     iothread_locked = false;
1843     qemu_mutex_unlock(&qemu_global_mutex);
1844 }
1845
1846 static bool all_vcpus_paused(void)
1847 {
1848     CPUState *cpu;
1849
1850     CPU_FOREACH(cpu) {
1851         if (!cpu->stopped) {
1852             return false;
1853         }
1854     }
1855
1856     return true;
1857 }
1858
1859 void pause_all_vcpus(void)
1860 {
1861     CPUState *cpu;
1862
1863     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1864     CPU_FOREACH(cpu) {
1865         if (qemu_cpu_is_self(cpu)) {
1866             qemu_cpu_stop(cpu, true);
1867         } else {
1868             cpu->stop = true;
1869             qemu_cpu_kick(cpu);
1870         }
1871     }
1872
1873     /* We need to drop the replay_lock so any vCPU threads woken up
1874      * can finish their replay tasks
1875      */
1876     replay_mutex_unlock();
1877
1878     while (!all_vcpus_paused()) {
1879         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1880         CPU_FOREACH(cpu) {
1881             qemu_cpu_kick(cpu);
1882         }
1883     }
1884
1885     qemu_mutex_unlock_iothread();
1886     replay_mutex_lock();
1887     qemu_mutex_lock_iothread();
1888 }
1889
1890 void cpu_resume(CPUState *cpu)
1891 {
1892     cpu->stop = false;
1893     cpu->stopped = false;
1894     qemu_cpu_kick(cpu);
1895 }
1896
1897 void resume_all_vcpus(void)
1898 {
1899     CPUState *cpu;
1900
1901     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1902     CPU_FOREACH(cpu) {
1903         cpu_resume(cpu);
1904     }
1905 }
1906
1907 void cpu_remove_sync(CPUState *cpu)
1908 {
1909     cpu->stop = true;
1910     cpu->unplug = true;
1911     qemu_cpu_kick(cpu);
1912     qemu_mutex_unlock_iothread();
1913     qemu_thread_join(cpu->thread);
1914     qemu_mutex_lock_iothread();
1915 }
1916
1917 /* For temporary buffers for forming a name */
1918 #define VCPU_THREAD_NAME_SIZE 16
1919
1920 static void qemu_tcg_init_vcpu(CPUState *cpu)
1921 {
1922     char thread_name[VCPU_THREAD_NAME_SIZE];
1923     static QemuCond *single_tcg_halt_cond;
1924     static QemuThread *single_tcg_cpu_thread;
1925     static int tcg_region_inited;
1926
1927     assert(tcg_enabled());
1928     /*
1929      * Initialize TCG regions--once. Now is a good time, because:
1930      * (1) TCG's init context, prologue and target globals have been set up.
1931      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1932      *     -accel flag is processed, so the check doesn't work then).
1933      */
1934     if (!tcg_region_inited) {
1935         tcg_region_inited = 1;
1936         tcg_region_init();
1937     }
1938
1939     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1940         cpu->thread = g_malloc0(sizeof(QemuThread));
1941         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1942         qemu_cond_init(cpu->halt_cond);
1943
1944         if (qemu_tcg_mttcg_enabled()) {
1945             /* create a thread per vCPU with TCG (MTTCG) */
1946             parallel_cpus = true;
1947             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1948                  cpu->cpu_index);
1949
1950             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1951                                cpu, QEMU_THREAD_JOINABLE);
1952
1953         } else {
1954             /* share a single thread for all cpus with TCG */
1955             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1956             qemu_thread_create(cpu->thread, thread_name,
1957                                qemu_tcg_rr_cpu_thread_fn,
1958                                cpu, QEMU_THREAD_JOINABLE);
1959
1960             single_tcg_halt_cond = cpu->halt_cond;
1961             single_tcg_cpu_thread = cpu->thread;
1962         }
1963 #ifdef _WIN32
1964         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1965 #endif
1966     } else {
1967         /* For non-MTTCG cases we share the thread */
1968         cpu->thread = single_tcg_cpu_thread;
1969         cpu->halt_cond = single_tcg_halt_cond;
1970         cpu->thread_id = first_cpu->thread_id;
1971         cpu->can_do_io = 1;
1972         cpu->created = true;
1973     }
1974 }
1975
1976 static void qemu_hax_start_vcpu(CPUState *cpu)
1977 {
1978     char thread_name[VCPU_THREAD_NAME_SIZE];
1979
1980     cpu->thread = g_malloc0(sizeof(QemuThread));
1981     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1982     qemu_cond_init(cpu->halt_cond);
1983
1984     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1985              cpu->cpu_index);
1986     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1987                        cpu, QEMU_THREAD_JOINABLE);
1988 #ifdef _WIN32
1989     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1990 #endif
1991 }
1992
1993 static void qemu_kvm_start_vcpu(CPUState *cpu)
1994 {
1995     char thread_name[VCPU_THREAD_NAME_SIZE];
1996
1997     cpu->thread = g_malloc0(sizeof(QemuThread));
1998     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1999     qemu_cond_init(cpu->halt_cond);
2000     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2001              cpu->cpu_index);
2002     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2003                        cpu, QEMU_THREAD_JOINABLE);
2004 }
2005
2006 static void qemu_hvf_start_vcpu(CPUState *cpu)
2007 {
2008     char thread_name[VCPU_THREAD_NAME_SIZE];
2009
2010     /* HVF currently does not support TCG, and only runs in
2011      * unrestricted-guest mode. */
2012     assert(hvf_enabled());
2013
2014     cpu->thread = g_malloc0(sizeof(QemuThread));
2015     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2016     qemu_cond_init(cpu->halt_cond);
2017
2018     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2019              cpu->cpu_index);
2020     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2021                        cpu, QEMU_THREAD_JOINABLE);
2022 }
2023
2024 static void qemu_whpx_start_vcpu(CPUState *cpu)
2025 {
2026     char thread_name[VCPU_THREAD_NAME_SIZE];
2027
2028     cpu->thread = g_malloc0(sizeof(QemuThread));
2029     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2030     qemu_cond_init(cpu->halt_cond);
2031     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2032              cpu->cpu_index);
2033     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2034                        cpu, QEMU_THREAD_JOINABLE);
2035 #ifdef _WIN32
2036     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2037 #endif
2038 }
2039
2040 static void qemu_dummy_start_vcpu(CPUState *cpu)
2041 {
2042     char thread_name[VCPU_THREAD_NAME_SIZE];
2043
2044     cpu->thread = g_malloc0(sizeof(QemuThread));
2045     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2046     qemu_cond_init(cpu->halt_cond);
2047     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2048              cpu->cpu_index);
2049     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2050                        QEMU_THREAD_JOINABLE);
2051 }
2052
2053 void qemu_init_vcpu(CPUState *cpu)
2054 {
2055     cpu->nr_cores = smp_cores;
2056     cpu->nr_threads = smp_threads;
2057     cpu->stopped = true;
2058
2059     if (!cpu->as) {
2060         /* If the target cpu hasn't set up any address spaces itself,
2061          * give it the default one.
2062          */
2063         cpu->num_ases = 1;
2064         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2065     }
2066
2067     if (kvm_enabled()) {
2068         qemu_kvm_start_vcpu(cpu);
2069     } else if (hax_enabled()) {
2070         qemu_hax_start_vcpu(cpu);
2071     } else if (hvf_enabled()) {
2072         qemu_hvf_start_vcpu(cpu);
2073     } else if (tcg_enabled()) {
2074         qemu_tcg_init_vcpu(cpu);
2075     } else if (whpx_enabled()) {
2076         qemu_whpx_start_vcpu(cpu);
2077     } else {
2078         qemu_dummy_start_vcpu(cpu);
2079     }
2080
2081     while (!cpu->created) {
2082         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2083     }
2084 }
2085
2086 void cpu_stop_current(void)
2087 {
2088     if (current_cpu) {
2089         qemu_cpu_stop(current_cpu, true);
2090     }
2091 }
2092
2093 int vm_stop(RunState state)
2094 {
2095     if (qemu_in_vcpu_thread()) {
2096         qemu_system_vmstop_request_prepare();
2097         qemu_system_vmstop_request(state);
2098         /*
2099          * FIXME: should not return to device code in case
2100          * vm_stop() has been requested.
2101          */
2102         cpu_stop_current();
2103         return 0;
2104     }
2105
2106     return do_vm_stop(state, true);
2107 }
2108
2109 /**
2110  * Prepare for (re)starting the VM.
2111  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2112  * running or in case of an error condition), 0 otherwise.
2113  */
2114 int vm_prepare_start(void)
2115 {
2116     RunState requested;
2117
2118     qemu_vmstop_requested(&requested);
2119     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2120         return -1;
2121     }
2122
2123     /* Ensure that a STOP/RESUME pair of events is emitted if a
2124      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2125      * example, according to documentation is always followed by
2126      * the STOP event.
2127      */
2128     if (runstate_is_running()) {
2129         qapi_event_send_stop();
2130         qapi_event_send_resume();
2131         return -1;
2132     }
2133
2134     /* We are sending this now, but the CPUs will be resumed shortly later */
2135     qapi_event_send_resume();
2136
2137     replay_enable_events();
2138     cpu_enable_ticks();
2139     runstate_set(RUN_STATE_RUNNING);
2140     vm_state_notify(1, RUN_STATE_RUNNING);
2141     return 0;
2142 }
2143
/*
 * Start (or resume) the VM: resume all vCPUs, but only when
 * vm_prepare_start() returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }

    resume_all_vcpus();
}
2150
2151 /* does a state transition even if the VM is already stopped,
2152    current state is forgotten forever */
2153 int vm_stop_force_state(RunState state)
2154 {
2155     if (runstate_is_running()) {
2156         return vm_stop(state);
2157     } else {
2158         runstate_set(state);
2159
2160         bdrv_drain_all();
2161         /* Make sure to return an error if the flush in a previous vm_stop()
2162          * failed. */
2163         return bdrv_flush_all();
2164     }
2165 }
2166
2167 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2168 {
2169     /* XXX: implement xxx_cpu_list for targets that still miss it */
2170 #if defined(cpu_list)
2171     cpu_list(f, cpu_fprintf);
2172 #endif
2173 }
2174
2175 CpuInfoList *qmp_query_cpus(Error **errp)
2176 {
2177     MachineState *ms = MACHINE(qdev_get_machine());
2178     MachineClass *mc = MACHINE_GET_CLASS(ms);
2179     CpuInfoList *head = NULL, *cur_item = NULL;
2180     CPUState *cpu;
2181
2182     CPU_FOREACH(cpu) {
2183         CpuInfoList *info;
2184 #if defined(TARGET_I386)
2185         X86CPU *x86_cpu = X86_CPU(cpu);
2186         CPUX86State *env = &x86_cpu->env;
2187 #elif defined(TARGET_PPC)
2188         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2189         CPUPPCState *env = &ppc_cpu->env;
2190 #elif defined(TARGET_SPARC)
2191         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2192         CPUSPARCState *env = &sparc_cpu->env;
2193 #elif defined(TARGET_RISCV)
2194         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2195         CPURISCVState *env = &riscv_cpu->env;
2196 #elif defined(TARGET_MIPS)
2197         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2198         CPUMIPSState *env = &mips_cpu->env;
2199 #elif defined(TARGET_TRICORE)
2200         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2201         CPUTriCoreState *env = &tricore_cpu->env;
2202 #elif defined(TARGET_S390X)
2203         S390CPU *s390_cpu = S390_CPU(cpu);
2204         CPUS390XState *env = &s390_cpu->env;
2205 #endif
2206
2207         cpu_synchronize_state(cpu);
2208
2209         info = g_malloc0(sizeof(*info));
2210         info->value = g_malloc0(sizeof(*info->value));
2211         info->value->CPU = cpu->cpu_index;
2212         info->value->current = (cpu == first_cpu);
2213         info->value->halted = cpu->halted;
2214         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2215         info->value->thread_id = cpu->thread_id;
2216 #if defined(TARGET_I386)
2217         info->value->arch = CPU_INFO_ARCH_X86;
2218         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2219 #elif defined(TARGET_PPC)
2220         info->value->arch = CPU_INFO_ARCH_PPC;
2221         info->value->u.ppc.nip = env->nip;
2222 #elif defined(TARGET_SPARC)
2223         info->value->arch = CPU_INFO_ARCH_SPARC;
2224         info->value->u.q_sparc.pc = env->pc;
2225         info->value->u.q_sparc.npc = env->npc;
2226 #elif defined(TARGET_MIPS)
2227         info->value->arch = CPU_INFO_ARCH_MIPS;
2228         info->value->u.q_mips.PC = env->active_tc.PC;
2229 #elif defined(TARGET_TRICORE)
2230         info->value->arch = CPU_INFO_ARCH_TRICORE;
2231         info->value->u.tricore.PC = env->PC;
2232 #elif defined(TARGET_S390X)
2233         info->value->arch = CPU_INFO_ARCH_S390;
2234         info->value->u.s390.cpu_state = env->cpu_state;
2235 #elif defined(TARGET_RISCV)
2236         info->value->arch = CPU_INFO_ARCH_RISCV;
2237         info->value->u.riscv.pc = env->pc;
2238 #else
2239         info->value->arch = CPU_INFO_ARCH_OTHER;
2240 #endif
2241         info->value->has_props = !!mc->cpu_index_to_instance_props;
2242         if (info->value->has_props) {
2243             CpuInstanceProperties *props;
2244             props = g_malloc0(sizeof(*props));
2245             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2246             info->value->props = props;
2247         }
2248
2249         /* XXX: waiting for the qapi to support GSList */
2250         if (!cur_item) {
2251             head = cur_item = info;
2252         } else {
2253             cur_item->next = info;
2254             cur_item = info;
2255         }
2256     }
2257
2258     return head;
2259 }
2260
2261 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2262 {
2263     /*
2264      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2265      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2266      */
2267     switch (target) {
2268     case SYS_EMU_TARGET_I386:
2269     case SYS_EMU_TARGET_X86_64:
2270         return CPU_INFO_ARCH_X86;
2271
2272     case SYS_EMU_TARGET_PPC:
2273     case SYS_EMU_TARGET_PPC64:
2274         return CPU_INFO_ARCH_PPC;
2275
2276     case SYS_EMU_TARGET_SPARC:
2277     case SYS_EMU_TARGET_SPARC64:
2278         return CPU_INFO_ARCH_SPARC;
2279
2280     case SYS_EMU_TARGET_MIPS:
2281     case SYS_EMU_TARGET_MIPSEL:
2282     case SYS_EMU_TARGET_MIPS64:
2283     case SYS_EMU_TARGET_MIPS64EL:
2284         return CPU_INFO_ARCH_MIPS;
2285
2286     case SYS_EMU_TARGET_TRICORE:
2287         return CPU_INFO_ARCH_TRICORE;
2288
2289     case SYS_EMU_TARGET_S390X:
2290         return CPU_INFO_ARCH_S390;
2291
2292     case SYS_EMU_TARGET_RISCV32:
2293     case SYS_EMU_TARGET_RISCV64:
2294         return CPU_INFO_ARCH_RISCV;
2295
2296     default:
2297         return CPU_INFO_ARCH_OTHER;
2298     }
2299 }
2300
2301 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2302 {
2303 #ifdef TARGET_S390X
2304     S390CPU *s390_cpu = S390_CPU(cpu);
2305     CPUS390XState *env = &s390_cpu->env;
2306
2307     info->cpu_state = env->cpu_state;
2308 #else
2309     abort();
2310 #endif
2311 }
2312
2313 /*
2314  * fast means: we NEVER interrupt vCPU threads to retrieve
2315  * information from KVM.
2316  */
2317 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2318 {
2319     MachineState *ms = MACHINE(qdev_get_machine());
2320     MachineClass *mc = MACHINE_GET_CLASS(ms);
2321     CpuInfoFastList *head = NULL, *cur_item = NULL;
2322     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2323                                           -1, &error_abort);
2324     CPUState *cpu;
2325
2326     CPU_FOREACH(cpu) {
2327         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2328         info->value = g_malloc0(sizeof(*info->value));
2329
2330         info->value->cpu_index = cpu->cpu_index;
2331         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2332         info->value->thread_id = cpu->thread_id;
2333
2334         info->value->has_props = !!mc->cpu_index_to_instance_props;
2335         if (info->value->has_props) {
2336             CpuInstanceProperties *props;
2337             props = g_malloc0(sizeof(*props));
2338             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2339             info->value->props = props;
2340         }
2341
2342         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2343         info->value->target = target;
2344         if (target == SYS_EMU_TARGET_S390X) {
2345             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2346         }
2347
2348         if (!cur_item) {
2349             head = cur_item = info;
2350         } else {
2351             cur_item->next = info;
2352             cur_item = info;
2353         }
2354     }
2355
2356     return head;
2357 }
2358
2359 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2360                  bool has_cpu, int64_t cpu_index, Error **errp)
2361 {
2362     FILE *f;
2363     uint32_t l;
2364     CPUState *cpu;
2365     uint8_t buf[1024];
2366     int64_t orig_addr = addr, orig_size = size;
2367
2368     if (!has_cpu) {
2369         cpu_index = 0;
2370     }
2371
2372     cpu = qemu_get_cpu(cpu_index);
2373     if (cpu == NULL) {
2374         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2375                    "a CPU number");
2376         return;
2377     }
2378
2379     f = fopen(filename, "wb");
2380     if (!f) {
2381         error_setg_file_open(errp, errno, filename);
2382         return;
2383     }
2384
2385     while (size != 0) {
2386         l = sizeof(buf);
2387         if (l > size)
2388             l = size;
2389         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2390             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2391                              " specified", orig_addr, orig_size);
2392             goto exit;
2393         }
2394         if (fwrite(buf, 1, l, f) != l) {
2395             error_setg(errp, QERR_IO_ERROR);
2396             goto exit;
2397         }
2398         addr += l;
2399         size -= l;
2400     }
2401
2402 exit:
2403     fclose(f);
2404 }
2405
2406 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2407                   Error **errp)
2408 {
2409     FILE *f;
2410     uint32_t l;
2411     uint8_t buf[1024];
2412
2413     f = fopen(filename, "wb");
2414     if (!f) {
2415         error_setg_file_open(errp, errno, filename);
2416         return;
2417     }
2418
2419     while (size != 0) {
2420         l = sizeof(buf);
2421         if (l > size)
2422             l = size;
2423         cpu_physical_memory_read(addr, buf, l);
2424         if (fwrite(buf, 1, l, f) != l) {
2425             error_setg(errp, QERR_IO_ERROR);
2426             goto exit;
2427         }
2428         addr += l;
2429         size -= l;
2430     }
2431
2432 exit:
2433     fclose(f);
2434 }
2435
2436 void qmp_inject_nmi(Error **errp)
2437 {
2438     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2439 }
2440
2441 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2442 {
2443     if (!use_icount) {
2444         return;
2445     }
2446
2447     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2448                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2449     if (icount_align_option) {
2450         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2451         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2452     } else {
2453         cpu_fprintf(f, "Max guest delay     NA\n");
2454         cpu_fprintf(f, "Max guest advance   NA\n");
2455     }
2456 }