cpus: take seqlock across qemu_icount updates
[qemu/ericb.git] / cpus.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "sysemu/sysemu.h"
35 #include "sysemu/block-backend.h"
36 #include "exec/gdbstub.h"
37 #include "sysemu/dma.h"
38 #include "sysemu/hw_accel.h"
39 #include "sysemu/kvm.h"
40 #include "sysemu/hax.h"
41 #include "sysemu/hvf.h"
42 #include "sysemu/whpx.h"
43 #include "exec/exec-all.h"
45 #include "qemu/thread.h"
46 #include "sysemu/cpus.h"
47 #include "sysemu/qtest.h"
48 #include "qemu/main-loop.h"
49 #include "qemu/option.h"
50 #include "qemu/bitmap.h"
51 #include "qemu/seqlock.h"
52 #include "tcg.h"
53 #include "hw/nmi.h"
54 #include "sysemu/replay.h"
55 #include "hw/boards.h"
57 #ifdef CONFIG_LINUX
59 #include <sys/prctl.h>
61 #ifndef PR_MCE_KILL
62 #define PR_MCE_KILL 33
63 #endif
65 #ifndef PR_MCE_KILL_SET
66 #define PR_MCE_KILL_SET 1
67 #endif
69 #ifndef PR_MCE_KILL_EARLY
70 #define PR_MCE_KILL_EARLY 1
71 #endif
73 #endif /* CONFIG_LINUX */
75 int64_t max_delay;
76 int64_t max_advance;
78 /* vcpu throttling controls */
79 static QEMUTimer *throttle_timer;
80 static unsigned int throttle_percentage;
82 #define CPU_THROTTLE_PCT_MIN 1
83 #define CPU_THROTTLE_PCT_MAX 99
84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86 bool cpu_is_stopped(CPUState *cpu)
88 return cpu->stopped || !runstate_is_running();
91 static bool cpu_thread_is_idle(CPUState *cpu)
93 if (cpu->stop || cpu->queued_work_first) {
94 return false;
96 if (cpu_is_stopped(cpu)) {
97 return true;
99 if (!cpu->halted || cpu_has_work(cpu) ||
100 kvm_halt_in_kernel()) {
101 return false;
103 return true;
106 static bool all_cpu_threads_idle(void)
108 CPUState *cpu;
110 CPU_FOREACH(cpu) {
111 if (!cpu_thread_is_idle(cpu)) {
112 return false;
115 return true;
118 /***********************************************************/
119 /* guest cycle counter */
121 /* Protected by TimersState seqlock */
123 static bool icount_sleep = true;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
127 typedef struct TimersState {
128 /* Protected by BQL. */
129 int64_t cpu_ticks_prev;
130 int64_t cpu_ticks_offset;
132 /* Protect fields that can be read outside the BQL (via the seqlock) and
133 * that may be written from multiple threads (via the spin lock).
135 QemuSeqLock vm_clock_seqlock;
136 QemuSpin vm_clock_lock;
138 int16_t cpu_ticks_enabled;
140 /* Conversion factor from emulated instructions to virtual clock ticks. */
141 int16_t icount_time_shift;
143 /* Compensate for varying guest execution speed. */
144 int64_t qemu_icount_bias;
146 int64_t vm_clock_warp_start;
147 int64_t cpu_clock_offset;
149 /* Only written by TCG thread */
150 int64_t qemu_icount;
152 /* for adjusting icount */
153 QEMUTimer *icount_rt_timer;
154 QEMUTimer *icount_vm_timer;
155 QEMUTimer *icount_warp_timer;
156 } TimersState;
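/* Rough locking sketch for the fields above: writers go through
 * seqlock_write_lock(&timers_state.vm_clock_seqlock,
 *                    &timers_state.vm_clock_lock),
 * so the spin lock serializes concurrent writers while the seqlock
 * sequence count tells lock-free readers to retry; readers loop on
 * seqlock_read_begin()/seqlock_read_retry() as in cpu_get_icount()
 * below.
 */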
158 static TimersState timers_state;
159 bool mttcg_enabled;
162 * We default to false if we know other options have been enabled
163 * which are currently incompatible with MTTCG. Otherwise when each
164 * guest (target) has been updated to support:
165 * - atomic instructions
166 * - memory ordering primitives (barriers)
167 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
169 * Once a guest architecture has been converted to the new primitives
170 * there are two remaining limitations to check.
172 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
173 * - The host must have a stronger memory order than the guest
175 * It may be possible in future to support strong guests on weak hosts
176 * but that will require tagging all load/stores in a guest with their
177 * implicit memory order requirements which would likely slow things
178 * down a lot.
181 static bool check_tcg_memory_orders_compatible(void)
183 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
184 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
185 #else
186 return false;
187 #endif
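/* In other words, the orderings the guest requires must be a subset of
 * what the host TCG backend guarantees. With illustrative flag sets only:
 * a guest needing {LD_LD, ST_ST} on a host providing {LD_LD, LD_ST, ST_ST}
 * gives guest & ~host == 0, so the check passes and MTTCG may be allowed.
 */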
190 static bool default_mttcg_enabled(void)
192 if (use_icount || TCG_OVERSIZED_GUEST) {
193 return false;
194 } else {
195 #ifdef TARGET_SUPPORTS_MTTCG
196 return check_tcg_memory_orders_compatible();
197 #else
198 return false;
199 #endif
203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
205 const char *t = qemu_opt_get(opts, "thread");
206 if (t) {
207 if (strcmp(t, "multi") == 0) {
208 if (TCG_OVERSIZED_GUEST) {
209 error_setg(errp, "No MTTCG when guest word size > host's");
210 } else if (use_icount) {
211 error_setg(errp, "No MTTCG when icount is enabled");
212 } else {
213 #ifndef TARGET_SUPPORTS_MTTCG
214 error_report("Guest not yet converted to MTTCG - "
215 "you may get unexpected results");
216 #endif
217 if (!check_tcg_memory_orders_compatible()) {
218 error_report("Guest expects a stronger memory ordering "
219 "than the host provides");
220 error_printf("This may cause strange/hard to debug errors\n");
222 mttcg_enabled = true;
224 } else if (strcmp(t, "single") == 0) {
225 mttcg_enabled = false;
226 } else {
227 error_setg(errp, "Invalid 'thread' setting %s", t);
229 } else {
230 mttcg_enabled = default_mttcg_enabled();
234 /* The current number of executed instructions is based on what we
235 * originally budgeted minus the current state of the decrementing
236 * icount counters in extra/u16.low.
238 static int64_t cpu_get_icount_executed(CPUState *cpu)
240 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
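/* Hypothetical numbers: with icount_budget = 1000, u16.low = 200 and
 * icount_extra = 300, there are still 500 instructions of budget left,
 * so 1000 - (200 + 300) = 500 instructions have been executed so far.
 */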
244 * Update the global shared timer_state.qemu_icount to take into
245 * account executed instructions. This is done by the TCG vCPU
246 * thread so the main-loop can see time has moved forward.
248 static void cpu_update_icount_locked(CPUState *cpu)
250 int64_t executed = cpu_get_icount_executed(cpu);
251 cpu->icount_budget -= executed;
253 atomic_set__nocheck(&timers_state.qemu_icount,
254 timers_state.qemu_icount + executed);
258 * Update the global shared timer_state.qemu_icount to take into
259 * account executed instructions. This is done by the TCG vCPU
260 * thread so the main-loop can see time has moved forward.
262 void cpu_update_icount(CPUState *cpu)
264 seqlock_write_lock(&timers_state.vm_clock_seqlock,
265 &timers_state.vm_clock_lock);
266 cpu_update_icount_locked(cpu);
267 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
268 &timers_state.vm_clock_lock);
271 static int64_t cpu_get_icount_raw_locked(void)
273 CPUState *cpu = current_cpu;
275 if (cpu && cpu->running) {
276 if (!cpu->can_do_io) {
277 error_report("Bad icount read");
278 exit(1);
280 /* Take into account what has run */
281 cpu_update_icount_locked(cpu);
283 /* The read is protected by the seqlock, so __nocheck is okay. */
284 return atomic_read__nocheck(&timers_state.qemu_icount);
287 static int64_t cpu_get_icount_locked(void)
289 int64_t icount = cpu_get_icount_raw_locked();
290 return atomic_read__nocheck(&timers_state.qemu_icount_bias) + cpu_icount_to_ns(icount);
293 int64_t cpu_get_icount_raw(void)
295 int64_t icount;
296 unsigned start;
298 do {
299 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
300 icount = cpu_get_icount_raw_locked();
301 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
303 return icount;
306 /* Return the virtual CPU time, based on the instruction counter. */
307 int64_t cpu_get_icount(void)
309 int64_t icount;
310 unsigned start;
312 do {
313 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
314 icount = cpu_get_icount_locked();
315 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
317 return icount;
320 int64_t cpu_icount_to_ns(int64_t icount)
322 return icount << atomic_read(&timers_state.icount_time_shift);
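/* For example, with icount_time_shift == 3 each instruction accounts for
 * 1 << 3 = 8 ns of virtual time, i.e. a nominal 125 MIPS guest (the
 * default initial guess used by configure_icount() below).
 */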
325 static int64_t cpu_get_ticks_locked(void)
327 int64_t ticks = timers_state.cpu_ticks_offset;
328 if (timers_state.cpu_ticks_enabled) {
329 ticks += cpu_get_host_ticks();
332 if (timers_state.cpu_ticks_prev > ticks) {
333 /* Non-increasing ticks may happen if the host uses software suspend. */
334 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
335 ticks = timers_state.cpu_ticks_prev;
338 timers_state.cpu_ticks_prev = ticks;
339 return ticks;
342 /* return the time elapsed in VM between vm_start and vm_stop. Unless
343 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
344 * counter.
346 int64_t cpu_get_ticks(void)
348 int64_t ticks;
350 if (use_icount) {
351 return cpu_get_icount();
354 qemu_spin_lock(&timers_state.vm_clock_lock);
355 ticks = cpu_get_ticks_locked();
356 qemu_spin_unlock(&timers_state.vm_clock_lock);
357 return ticks;
360 static int64_t cpu_get_clock_locked(void)
362 int64_t time;
364 time = timers_state.cpu_clock_offset;
365 if (timers_state.cpu_ticks_enabled) {
366 time += get_clock();
369 return time;
372 /* Return the monotonic time elapsed in VM, i.e.,
373 * the time between vm_start and vm_stop
375 int64_t cpu_get_clock(void)
377 int64_t ti;
378 unsigned start;
380 do {
381 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
382 ti = cpu_get_clock_locked();
383 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
385 return ti;
388 /* enable cpu_get_ticks()
389 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
391 void cpu_enable_ticks(void)
393 seqlock_write_lock(&timers_state.vm_clock_seqlock,
394 &timers_state.vm_clock_lock);
395 if (!timers_state.cpu_ticks_enabled) {
396 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
397 timers_state.cpu_clock_offset -= get_clock();
398 timers_state.cpu_ticks_enabled = 1;
400 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
401 &timers_state.vm_clock_lock);
404 /* disable cpu_get_ticks() : the clock is stopped. You must not call
405 * cpu_get_ticks() after that.
406 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
408 void cpu_disable_ticks(void)
410 seqlock_write_lock(&timers_state.vm_clock_seqlock,
411 &timers_state.vm_clock_lock);
412 if (timers_state.cpu_ticks_enabled) {
413 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
414 timers_state.cpu_clock_offset = cpu_get_clock_locked();
415 timers_state.cpu_ticks_enabled = 0;
417 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
418 &timers_state.vm_clock_lock);
421 /* Correlation between real and virtual time is always going to be
422 fairly approximate, so ignore small variation.
423 When the guest is idle real and virtual time will be aligned in
424 the IO wait loop. */
425 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
427 static void icount_adjust(void)
429 int64_t cur_time;
430 int64_t cur_icount;
431 int64_t delta;
433 /* Protected by TimersState mutex. */
434 static int64_t last_delta;
436 /* If the VM is not running, then do nothing. */
437 if (!runstate_is_running()) {
438 return;
441 seqlock_write_lock(&timers_state.vm_clock_seqlock,
442 &timers_state.vm_clock_lock);
443 cur_time = cpu_get_clock_locked();
444 cur_icount = cpu_get_icount_locked();
446 delta = cur_icount - cur_time;
447 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
448 if (delta > 0
449 && last_delta + ICOUNT_WOBBLE < delta * 2
450 && timers_state.icount_time_shift > 0) {
451 /* The guest is getting too far ahead. Slow time down. */
452 atomic_set(&timers_state.icount_time_shift,
453 timers_state.icount_time_shift - 1);
455 if (delta < 0
456 && last_delta - ICOUNT_WOBBLE > delta * 2
457 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
458 /* The guest is getting too far behind. Speed time up. */
459 atomic_set(&timers_state.icount_time_shift,
460 timers_state.icount_time_shift + 1);
462 last_delta = delta;
463 atomic_set__nocheck(&timers_state.qemu_icount_bias,
464 cur_icount - (timers_state.qemu_icount
465 << timers_state.icount_time_shift));
466 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
467 &timers_state.vm_clock_lock);
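/* Note that the bias is recomputed so that
 * qemu_icount_bias + (qemu_icount << icount_time_shift) still equals
 * cur_icount: changing the shift only alters the future rate, and
 * cpu_get_icount_locked() stays continuous across the adjustment.
 */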
470 static void icount_adjust_rt(void *opaque)
472 timer_mod(timers_state.icount_rt_timer,
473 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
474 icount_adjust();
477 static void icount_adjust_vm(void *opaque)
479 timer_mod(timers_state.icount_vm_timer,
480 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
481 NANOSECONDS_PER_SECOND / 10);
482 icount_adjust();
485 static int64_t qemu_icount_round(int64_t count)
487 int shift = atomic_read(&timers_state.icount_time_shift);
488 return (count + (1 << shift) - 1) >> shift;
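/* e.g. a 9 ns deadline with shift == 3 becomes (9 + 7) >> 3 = 2
 * instructions (worth 16 ns), so the rounded budget never falls short of
 * the requested deadline.
 */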
491 static void icount_warp_rt(void)
493 unsigned seq;
494 int64_t warp_start;
496 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
497 * changes from -1 to another value, so the race here is okay.
499 do {
500 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
501 warp_start = timers_state.vm_clock_warp_start;
502 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
504 if (warp_start == -1) {
505 return;
508 seqlock_write_lock(&timers_state.vm_clock_seqlock,
509 &timers_state.vm_clock_lock);
510 if (runstate_is_running()) {
511 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
512 cpu_get_clock_locked());
513 int64_t warp_delta;
515 warp_delta = clock - timers_state.vm_clock_warp_start;
516 if (use_icount == 2) {
518 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
519 * far ahead of real time.
521 int64_t cur_icount = cpu_get_icount_locked();
522 int64_t delta = clock - cur_icount;
523 warp_delta = MIN(warp_delta, delta);
525 atomic_set__nocheck(&timers_state.qemu_icount_bias,
526 timers_state.qemu_icount_bias + warp_delta);
528 timers_state.vm_clock_warp_start = -1;
529 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
530 &timers_state.vm_clock_lock);
532 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
533 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
537 static void icount_timer_cb(void *opaque)
539 /* No need for a checkpoint because the timer already synchronizes
540 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
542 icount_warp_rt();
545 void qtest_clock_warp(int64_t dest)
547 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
548 AioContext *aio_context;
549 assert(qtest_enabled());
550 aio_context = qemu_get_aio_context();
551 while (clock < dest) {
552 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
553 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
555 seqlock_write_lock(&timers_state.vm_clock_seqlock,
556 &timers_state.vm_clock_lock);
557 atomic_set__nocheck(&timers_state.qemu_icount_bias,
558 timers_state.qemu_icount_bias + warp);
559 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
560 &timers_state.vm_clock_lock);
562 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
563 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
564 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
566 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
569 void qemu_start_warp_timer(void)
571 int64_t clock;
572 int64_t deadline;
574 if (!use_icount) {
575 return;
578 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
579 * do not fire, so computing the deadline does not make sense.
581 if (!runstate_is_running()) {
582 return;
585 /* warp clock deterministically in record/replay mode */
586 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
587 return;
590 if (!all_cpu_threads_idle()) {
591 return;
594 if (qtest_enabled()) {
595 /* When testing, qtest commands advance icount. */
596 return;
599 /* We want to use the earliest deadline from ALL vm_clocks */
600 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
601 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
602 if (deadline < 0) {
603 static bool notified;
604 if (!icount_sleep && !notified) {
605 warn_report("icount sleep disabled and no active timers");
606 notified = true;
608 return;
611 if (deadline > 0) {
613 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
614 * sleep. Otherwise, the CPU might be waiting for a future timer
615 * interrupt to wake it up, but the interrupt never comes because
616 * the vCPU isn't running any insns and thus doesn't advance the
617 * QEMU_CLOCK_VIRTUAL.
619 if (!icount_sleep) {
621 * We never let VCPUs sleep in no sleep icount mode.
622 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
623 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
624 * It is useful when we want a deterministic execution time,
625 * isolated from host latencies.
627 seqlock_write_lock(&timers_state.vm_clock_seqlock,
628 &timers_state.vm_clock_lock);
629 atomic_set__nocheck(&timers_state.qemu_icount_bias,
630 timers_state.qemu_icount_bias + deadline);
631 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
632 &timers_state.vm_clock_lock);
633 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
634 } else {
636 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
637 * "real" time, (related to the time left until the next event) has
638 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
639 * This avoids that the warps are visible externally; for example,
640 * you will not be sending network packets continuously instead of
641 * every 100ms.
643 seqlock_write_lock(&timers_state.vm_clock_seqlock,
644 &timers_state.vm_clock_lock);
645 if (timers_state.vm_clock_warp_start == -1
646 || timers_state.vm_clock_warp_start > clock) {
647 timers_state.vm_clock_warp_start = clock;
649 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
650 &timers_state.vm_clock_lock);
651 timer_mod_anticipate(timers_state.icount_warp_timer,
652 clock + deadline);
654 } else if (deadline == 0) {
655 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
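/* Summary of the cases above: with sleep=off the bias is bumped by the
 * whole deadline right away; with sleep=on the warp timer is armed at
 * clock + deadline and icount_warp_rt() later credits only the real time
 * that actually elapsed; a zero deadline means a timer is already due,
 * so it is enough to notify QEMU_CLOCK_VIRTUAL.
 */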
659 static void qemu_account_warp_timer(void)
661 if (!use_icount || !icount_sleep) {
662 return;
665 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
666 * do not fire, so computing the deadline does not make sense.
668 if (!runstate_is_running()) {
669 return;
672 /* warp clock deterministically in record/replay mode */
673 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
674 return;
677 timer_del(timers_state.icount_warp_timer);
678 icount_warp_rt();
681 static bool icount_state_needed(void *opaque)
683 return use_icount;
686 static bool warp_timer_state_needed(void *opaque)
688 TimersState *s = opaque;
689 return s->icount_warp_timer != NULL;
692 static bool adjust_timers_state_needed(void *opaque)
694 TimersState *s = opaque;
695 return s->icount_rt_timer != NULL;
699 * Subsection for warp timer migration is optional, because it may not be created
701 static const VMStateDescription icount_vmstate_warp_timer = {
702 .name = "timer/icount/warp_timer",
703 .version_id = 1,
704 .minimum_version_id = 1,
705 .needed = warp_timer_state_needed,
706 .fields = (VMStateField[]) {
707 VMSTATE_INT64(vm_clock_warp_start, TimersState),
708 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
709 VMSTATE_END_OF_LIST()
713 static const VMStateDescription icount_vmstate_adjust_timers = {
714 .name = "timer/icount/timers",
715 .version_id = 1,
716 .minimum_version_id = 1,
717 .needed = adjust_timers_state_needed,
718 .fields = (VMStateField[]) {
719 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
720 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
721 VMSTATE_END_OF_LIST()
726 * This is a subsection for icount migration.
728 static const VMStateDescription icount_vmstate_timers = {
729 .name = "timer/icount",
730 .version_id = 1,
731 .minimum_version_id = 1,
732 .needed = icount_state_needed,
733 .fields = (VMStateField[]) {
734 VMSTATE_INT64(qemu_icount_bias, TimersState),
735 VMSTATE_INT64(qemu_icount, TimersState),
736 VMSTATE_END_OF_LIST()
738 .subsections = (const VMStateDescription*[]) {
739 &icount_vmstate_warp_timer,
740 &icount_vmstate_adjust_timers,
741 NULL
745 static const VMStateDescription vmstate_timers = {
746 .name = "timer",
747 .version_id = 2,
748 .minimum_version_id = 1,
749 .fields = (VMStateField[]) {
750 VMSTATE_INT64(cpu_ticks_offset, TimersState),
751 VMSTATE_UNUSED(8),
752 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
753 VMSTATE_END_OF_LIST()
755 .subsections = (const VMStateDescription*[]) {
756 &icount_vmstate_timers,
757 NULL
761 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
763 double pct;
764 double throttle_ratio;
765 long sleeptime_ns;
767 if (!cpu_throttle_get_percentage()) {
768 return;
771 pct = (double)cpu_throttle_get_percentage()/100;
772 throttle_ratio = pct / (1 - pct);
773 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
775 qemu_mutex_unlock_iothread();
776 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
777 qemu_mutex_lock_iothread();
778 atomic_set(&cpu->throttle_thread_scheduled, 0);
781 static void cpu_throttle_timer_tick(void *opaque)
783 CPUState *cpu;
784 double pct;
786 /* Stop the timer if needed */
787 if (!cpu_throttle_get_percentage()) {
788 return;
790 CPU_FOREACH(cpu) {
791 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
792 async_run_on_cpu(cpu, cpu_throttle_thread,
793 RUN_ON_CPU_NULL);
797 pct = (double)cpu_throttle_get_percentage()/100;
798 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
799 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
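/* Worked example: at 50% throttle pct = 0.5, so cpu_throttle_thread()
 * above sleeps for throttle_ratio * 10 ms = 10 ms and the timer re-arms
 * 10 ms / (1 - 0.5) = 20 ms out, i.e. each vCPU spends roughly half of
 * every 20 ms period asleep.
 */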
802 void cpu_throttle_set(int new_throttle_pct)
804 /* Ensure throttle percentage is within valid range */
805 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
806 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
808 atomic_set(&throttle_percentage, new_throttle_pct);
810 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
811 CPU_THROTTLE_TIMESLICE_NS);
814 void cpu_throttle_stop(void)
816 atomic_set(&throttle_percentage, 0);
819 bool cpu_throttle_active(void)
821 return (cpu_throttle_get_percentage() != 0);
824 int cpu_throttle_get_percentage(void)
826 return atomic_read(&throttle_percentage);
829 void cpu_ticks_init(void)
831 seqlock_init(&timers_state.vm_clock_seqlock);
832 qemu_spin_init(&timers_state.vm_clock_lock);
833 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
834 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
835 cpu_throttle_timer_tick, NULL);
838 void configure_icount(QemuOpts *opts, Error **errp)
840 const char *option;
841 char *rem_str = NULL;
843 option = qemu_opt_get(opts, "shift");
844 if (!option) {
845 if (qemu_opt_get(opts, "align") != NULL) {
846 error_setg(errp, "Please specify shift option when using align");
848 return;
851 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
852 if (icount_sleep) {
853 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
854 icount_timer_cb, NULL);
857 icount_align_option = qemu_opt_get_bool(opts, "align", false);
859 if (icount_align_option && !icount_sleep) {
860 error_setg(errp, "align=on and sleep=off are incompatible");
862 if (strcmp(option, "auto") != 0) {
863 errno = 0;
864 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
865 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
866 error_setg(errp, "icount: Invalid shift value");
868 use_icount = 1;
869 return;
870 } else if (icount_align_option) {
871 error_setg(errp, "shift=auto and align=on are incompatible");
872 } else if (!icount_sleep) {
873 error_setg(errp, "shift=auto and sleep=off are incompatible");
876 use_icount = 2;
878 /* 125MIPS seems a reasonable initial guess at the guest speed.
879 It will be corrected fairly quickly anyway. */
880 timers_state.icount_time_shift = 3;
882 /* Have both realtime and virtual time triggers for speed adjustment.
883 The realtime trigger catches emulated time passing too slowly,
884 the virtual time trigger catches emulated time passing too fast.
885 Realtime triggers occur even when idle, so use them less frequently
886 than VM triggers. */
887 timers_state.vm_clock_warp_start = -1;
888 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
889 icount_adjust_rt, NULL);
890 timer_mod(timers_state.icount_rt_timer,
891 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
892 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
893 icount_adjust_vm, NULL);
894 timer_mod(timers_state.icount_vm_timer,
895 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
896 NANOSECONDS_PER_SECOND / 10);
899 /***********************************************************/
900 /* TCG vCPU kick timer
902 * The kick timer is responsible for moving single threaded vCPU
903 * emulation on to the next vCPU. If more than one vCPU is running a
904 * timer event will force a cpu->exit so the next vCPU can get
905 * scheduled.
907 * The timer is removed if all vCPUs are idle and restarted again once
908 * the vCPUs stop being idle.
911 static QEMUTimer *tcg_kick_vcpu_timer;
912 static CPUState *tcg_current_rr_cpu;
914 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
916 static inline int64_t qemu_tcg_next_kick(void)
918 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
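/* TCG_KICK_PERIOD is NANOSECONDS_PER_SECOND / 10 = 100 ms, so in
 * round-robin mode a vCPU is kicked off the shared thread roughly every
 * 100 ms at most when more than one vCPU exists.
 */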
921 /* Kick the currently round-robin scheduled vCPU */
922 static void qemu_cpu_kick_rr_cpu(void)
924 CPUState *cpu;
925 do {
926 cpu = atomic_mb_read(&tcg_current_rr_cpu);
927 if (cpu) {
928 cpu_exit(cpu);
930 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
933 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
937 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
939 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
940 qemu_notify_event();
941 return;
944 if (qemu_in_vcpu_thread()) {
945 /* A CPU is currently running; kick it back out to the
946 * tcg_cpu_exec() loop so it will recalculate its
947 * icount deadline immediately.
949 qemu_cpu_kick(current_cpu);
950 } else if (first_cpu) {
951 /* qemu_cpu_kick is not enough to kick a halted CPU out of
952 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
953 * causes cpu_thread_is_idle to return false. This way,
954 * handle_icount_deadline can run.
955 * If we have no CPUs at all for some reason, we don't
956 * need to do anything.
958 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
962 static void kick_tcg_thread(void *opaque)
964 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
965 qemu_cpu_kick_rr_cpu();
968 static void start_tcg_kick_timer(void)
970 assert(!mttcg_enabled);
971 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
972 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
973 kick_tcg_thread, NULL);
974 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
978 static void stop_tcg_kick_timer(void)
980 assert(!mttcg_enabled);
981 if (tcg_kick_vcpu_timer) {
982 timer_del(tcg_kick_vcpu_timer);
983 tcg_kick_vcpu_timer = NULL;
987 /***********************************************************/
988 void hw_error(const char *fmt, ...)
990 va_list ap;
991 CPUState *cpu;
993 va_start(ap, fmt);
994 fprintf(stderr, "qemu: hardware error: ");
995 vfprintf(stderr, fmt, ap);
996 fprintf(stderr, "\n");
997 CPU_FOREACH(cpu) {
998 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
999 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
1001 va_end(ap);
1002 abort();
1005 void cpu_synchronize_all_states(void)
1007 CPUState *cpu;
1009 CPU_FOREACH(cpu) {
1010 cpu_synchronize_state(cpu);
1011 /* TODO: move to cpu_synchronize_state() */
1012 if (hvf_enabled()) {
1013 hvf_cpu_synchronize_state(cpu);
1018 void cpu_synchronize_all_post_reset(void)
1020 CPUState *cpu;
1022 CPU_FOREACH(cpu) {
1023 cpu_synchronize_post_reset(cpu);
1024 /* TODO: move to cpu_synchronize_post_reset() */
1025 if (hvf_enabled()) {
1026 hvf_cpu_synchronize_post_reset(cpu);
1031 void cpu_synchronize_all_post_init(void)
1033 CPUState *cpu;
1035 CPU_FOREACH(cpu) {
1036 cpu_synchronize_post_init(cpu);
1037 /* TODO: move to cpu_synchronize_post_init() */
1038 if (hvf_enabled()) {
1039 hvf_cpu_synchronize_post_init(cpu);
1044 void cpu_synchronize_all_pre_loadvm(void)
1046 CPUState *cpu;
1048 CPU_FOREACH(cpu) {
1049 cpu_synchronize_pre_loadvm(cpu);
1053 static int do_vm_stop(RunState state, bool send_stop)
1055 int ret = 0;
1057 if (runstate_is_running()) {
1058 cpu_disable_ticks();
1059 pause_all_vcpus();
1060 runstate_set(state);
1061 vm_state_notify(0, state);
1062 if (send_stop) {
1063 qapi_event_send_stop();
1067 bdrv_drain_all();
1068 replay_disable_events();
1069 ret = bdrv_flush_all();
1071 return ret;
1074 /* Special vm_stop() variant for terminating the process. Historically clients
1075 * did not expect a QMP STOP event and so we need to retain compatibility.
1077 int vm_shutdown(void)
1079 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1082 static bool cpu_can_run(CPUState *cpu)
1084 if (cpu->stop) {
1085 return false;
1087 if (cpu_is_stopped(cpu)) {
1088 return false;
1090 return true;
1093 static void cpu_handle_guest_debug(CPUState *cpu)
1095 gdb_set_stop_cpu(cpu);
1096 qemu_system_debug_request();
1097 cpu->stopped = true;
1100 #ifdef CONFIG_LINUX
1101 static void sigbus_reraise(void)
1103 sigset_t set;
1104 struct sigaction action;
1106 memset(&action, 0, sizeof(action));
1107 action.sa_handler = SIG_DFL;
1108 if (!sigaction(SIGBUS, &action, NULL)) {
1109 raise(SIGBUS);
1110 sigemptyset(&set);
1111 sigaddset(&set, SIGBUS);
1112 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1114 perror("Failed to re-raise SIGBUS!\n");
1115 abort();
1118 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1120 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1121 sigbus_reraise();
1124 if (current_cpu) {
1125 /* Called asynchronously in VCPU thread. */
1126 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1127 sigbus_reraise();
1129 } else {
1130 /* Called synchronously (via signalfd) in main thread. */
1131 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1132 sigbus_reraise();
1137 static void qemu_init_sigbus(void)
1139 struct sigaction action;
1141 memset(&action, 0, sizeof(action));
1142 action.sa_flags = SA_SIGINFO;
1143 action.sa_sigaction = sigbus_handler;
1144 sigaction(SIGBUS, &action, NULL);
1146 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1148 #else /* !CONFIG_LINUX */
1149 static void qemu_init_sigbus(void)
1152 #endif /* !CONFIG_LINUX */
1154 static QemuMutex qemu_global_mutex;
1156 static QemuThread io_thread;
1158 /* cpu creation */
1159 static QemuCond qemu_cpu_cond;
1160 /* system init */
1161 static QemuCond qemu_pause_cond;
1163 void qemu_init_cpu_loop(void)
1165 qemu_init_sigbus();
1166 qemu_cond_init(&qemu_cpu_cond);
1167 qemu_cond_init(&qemu_pause_cond);
1168 qemu_mutex_init(&qemu_global_mutex);
1170 qemu_thread_get_self(&io_thread);
1173 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1175 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1178 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1180 if (kvm_destroy_vcpu(cpu) < 0) {
1181 error_report("kvm_destroy_vcpu failed");
1182 exit(EXIT_FAILURE);
1186 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1190 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1192 g_assert(qemu_cpu_is_self(cpu));
1193 cpu->stop = false;
1194 cpu->stopped = true;
1195 if (exit) {
1196 cpu_exit(cpu);
1198 qemu_cond_broadcast(&qemu_pause_cond);
1201 static void qemu_wait_io_event_common(CPUState *cpu)
1203 atomic_mb_set(&cpu->thread_kicked, false);
1204 if (cpu->stop) {
1205 qemu_cpu_stop(cpu, false);
1207 process_queued_cpu_work(cpu);
1210 static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
1212 while (all_cpu_threads_idle()) {
1213 stop_tcg_kick_timer();
1214 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1217 start_tcg_kick_timer();
1219 qemu_wait_io_event_common(cpu);
1222 static void qemu_wait_io_event(CPUState *cpu)
1224 while (cpu_thread_is_idle(cpu)) {
1225 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1228 #ifdef _WIN32
1229 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1230 if (!tcg_enabled()) {
1231 SleepEx(0, TRUE);
1233 #endif
1234 qemu_wait_io_event_common(cpu);
1237 static void *qemu_kvm_cpu_thread_fn(void *arg)
1239 CPUState *cpu = arg;
1240 int r;
1242 rcu_register_thread();
1244 qemu_mutex_lock_iothread();
1245 qemu_thread_get_self(cpu->thread);
1246 cpu->thread_id = qemu_get_thread_id();
1247 cpu->can_do_io = 1;
1248 current_cpu = cpu;
1250 r = kvm_init_vcpu(cpu);
1251 if (r < 0) {
1252 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1253 exit(1);
1256 kvm_init_cpu_signals(cpu);
1258 /* signal CPU creation */
1259 cpu->created = true;
1260 qemu_cond_signal(&qemu_cpu_cond);
1262 do {
1263 if (cpu_can_run(cpu)) {
1264 r = kvm_cpu_exec(cpu);
1265 if (r == EXCP_DEBUG) {
1266 cpu_handle_guest_debug(cpu);
1269 qemu_wait_io_event(cpu);
1270 } while (!cpu->unplug || cpu_can_run(cpu));
1272 qemu_kvm_destroy_vcpu(cpu);
1273 cpu->created = false;
1274 qemu_cond_signal(&qemu_cpu_cond);
1275 qemu_mutex_unlock_iothread();
1276 rcu_unregister_thread();
1277 return NULL;
1280 static void *qemu_dummy_cpu_thread_fn(void *arg)
1282 #ifdef _WIN32
1283 error_report("qtest is not supported under Windows");
1284 exit(1);
1285 #else
1286 CPUState *cpu = arg;
1287 sigset_t waitset;
1288 int r;
1290 rcu_register_thread();
1292 qemu_mutex_lock_iothread();
1293 qemu_thread_get_self(cpu->thread);
1294 cpu->thread_id = qemu_get_thread_id();
1295 cpu->can_do_io = 1;
1296 current_cpu = cpu;
1298 sigemptyset(&waitset);
1299 sigaddset(&waitset, SIG_IPI);
1301 /* signal CPU creation */
1302 cpu->created = true;
1303 qemu_cond_signal(&qemu_cpu_cond);
1305 do {
1306 qemu_mutex_unlock_iothread();
1307 do {
1308 int sig;
1309 r = sigwait(&waitset, &sig);
1310 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1311 if (r == -1) {
1312 perror("sigwait");
1313 exit(1);
1315 qemu_mutex_lock_iothread();
1316 qemu_wait_io_event(cpu);
1317 } while (!cpu->unplug);
1319 rcu_unregister_thread();
1320 return NULL;
1321 #endif
1324 static int64_t tcg_get_icount_limit(void)
1326 int64_t deadline;
1328 if (replay_mode != REPLAY_MODE_PLAY) {
1329 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1331 /* Maintain prior (possibly buggy) behaviour where if no deadline
1332 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1333 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1334 * nanoseconds.
1336 if ((deadline < 0) || (deadline > INT32_MAX)) {
1337 deadline = INT32_MAX;
1340 return qemu_icount_round(deadline);
1341 } else {
1342 return replay_get_instructions();
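/* So during record/replay playback the budget comes from the recorded
 * instruction stream, while otherwise it is the time until the next
 * QEMU_CLOCK_VIRTUAL timer (capped at INT32_MAX ns) converted into an
 * instruction count by qemu_icount_round().
 */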
1346 static void handle_icount_deadline(void)
1348 assert(qemu_in_vcpu_thread());
1349 if (use_icount) {
1350 int64_t deadline =
1351 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1353 if (deadline == 0) {
1354 /* Wake up other AioContexts. */
1355 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1356 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1361 static void prepare_icount_for_run(CPUState *cpu)
1363 if (use_icount) {
1364 int insns_left;
1366 /* These should always be cleared by process_icount_data after
1367 * each vCPU execution. However u16.high can be raised
1368 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1370 g_assert(cpu->icount_decr.u16.low == 0);
1371 g_assert(cpu->icount_extra == 0);
1373 cpu->icount_budget = tcg_get_icount_limit();
1374 insns_left = MIN(0xffff, cpu->icount_budget);
1375 cpu->icount_decr.u16.low = insns_left;
1376 cpu->icount_extra = cpu->icount_budget - insns_left;
1378 replay_mutex_lock();
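/* Hypothetical split: a budget of 100000 instructions becomes
 * u16.low = 0xffff (65535) and icount_extra = 34465; the 16-bit
 * decrementer is what the generated code counts down, icount_extra
 * holds the remainder.
 */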
1382 static void process_icount_data(CPUState *cpu)
1384 if (use_icount) {
1385 /* Account for executed instructions */
1386 cpu_update_icount(cpu);
1388 /* Reset the counters */
1389 cpu->icount_decr.u16.low = 0;
1390 cpu->icount_extra = 0;
1391 cpu->icount_budget = 0;
1393 replay_account_executed_instructions();
1395 replay_mutex_unlock();
1400 static int tcg_cpu_exec(CPUState *cpu)
1402 int ret;
1403 #ifdef CONFIG_PROFILER
1404 int64_t ti;
1405 #endif
1407 assert(tcg_enabled());
1408 #ifdef CONFIG_PROFILER
1409 ti = profile_getclock();
1410 #endif
1411 cpu_exec_start(cpu);
1412 ret = cpu_exec(cpu);
1413 cpu_exec_end(cpu);
1414 #ifdef CONFIG_PROFILER
1415 tcg_time += profile_getclock() - ti;
1416 #endif
1417 return ret;
1420 /* Destroy any remaining vCPUs which have been unplugged and have
1421 * finished running
1423 static void deal_with_unplugged_cpus(void)
1425 CPUState *cpu;
1427 CPU_FOREACH(cpu) {
1428 if (cpu->unplug && !cpu_can_run(cpu)) {
1429 qemu_tcg_destroy_vcpu(cpu);
1430 cpu->created = false;
1431 qemu_cond_signal(&qemu_cpu_cond);
1432 break;
1437 /* Single-threaded TCG
1439 * In the single-threaded case each vCPU is simulated in turn. If
1440 * there is more than a single vCPU we create a simple timer to kick
1441 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1442 * This is done explicitly rather than relying on side-effects
1443 * elsewhere.
1446 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1448 CPUState *cpu = arg;
1450 assert(tcg_enabled());
1451 rcu_register_thread();
1452 tcg_register_thread();
1454 qemu_mutex_lock_iothread();
1455 qemu_thread_get_self(cpu->thread);
1457 cpu->thread_id = qemu_get_thread_id();
1458 cpu->created = true;
1459 cpu->can_do_io = 1;
1460 qemu_cond_signal(&qemu_cpu_cond);
1462 /* wait for initial kick-off after machine start */
1463 while (first_cpu->stopped) {
1464 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1466 /* process any pending work */
1467 CPU_FOREACH(cpu) {
1468 current_cpu = cpu;
1469 qemu_wait_io_event_common(cpu);
1473 start_tcg_kick_timer();
1475 cpu = first_cpu;
1477 /* process any pending work */
1478 cpu->exit_request = 1;
1480 while (1) {
1481 qemu_mutex_unlock_iothread();
1482 replay_mutex_lock();
1483 qemu_mutex_lock_iothread();
1484 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1485 qemu_account_warp_timer();
1487 /* Run the timers here. This is much more efficient than
1488 * waking up the I/O thread and waiting for completion.
1490 handle_icount_deadline();
1492 replay_mutex_unlock();
1494 if (!cpu) {
1495 cpu = first_cpu;
1498 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1500 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1501 current_cpu = cpu;
1503 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1504 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1506 if (cpu_can_run(cpu)) {
1507 int r;
1509 qemu_mutex_unlock_iothread();
1510 prepare_icount_for_run(cpu);
1512 r = tcg_cpu_exec(cpu);
1514 process_icount_data(cpu);
1515 qemu_mutex_lock_iothread();
1517 if (r == EXCP_DEBUG) {
1518 cpu_handle_guest_debug(cpu);
1519 break;
1520 } else if (r == EXCP_ATOMIC) {
1521 qemu_mutex_unlock_iothread();
1522 cpu_exec_step_atomic(cpu);
1523 qemu_mutex_lock_iothread();
1524 break;
1526 } else if (cpu->stop) {
1527 if (cpu->unplug) {
1528 cpu = CPU_NEXT(cpu);
1530 break;
1533 cpu = CPU_NEXT(cpu);
1534 } /* while (cpu && !cpu->exit_request).. */
1536 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1537 atomic_set(&tcg_current_rr_cpu, NULL);
1539 if (cpu && cpu->exit_request) {
1540 atomic_mb_set(&cpu->exit_request, 0);
1543 qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
1544 deal_with_unplugged_cpus();
1547 rcu_unregister_thread();
1548 return NULL;
1551 static void *qemu_hax_cpu_thread_fn(void *arg)
1553 CPUState *cpu = arg;
1554 int r;
1556 rcu_register_thread();
1557 qemu_mutex_lock_iothread();
1558 qemu_thread_get_self(cpu->thread);
1560 cpu->thread_id = qemu_get_thread_id();
1561 cpu->created = true;
1562 cpu->halted = 0;
1563 current_cpu = cpu;
1565 hax_init_vcpu(cpu);
1566 qemu_cond_signal(&qemu_cpu_cond);
1568 do {
1569 if (cpu_can_run(cpu)) {
1570 r = hax_smp_cpu_exec(cpu);
1571 if (r == EXCP_DEBUG) {
1572 cpu_handle_guest_debug(cpu);
1576 qemu_wait_io_event(cpu);
1577 } while (!cpu->unplug || cpu_can_run(cpu));
1578 rcu_unregister_thread();
1579 return NULL;
1582 /* The HVF-specific vCPU thread function. This one should only run when the host
1583 * CPU supports the VMX "unrestricted guest" feature. */
1584 static void *qemu_hvf_cpu_thread_fn(void *arg)
1586 CPUState *cpu = arg;
1588 int r;
1590 assert(hvf_enabled());
1592 rcu_register_thread();
1594 qemu_mutex_lock_iothread();
1595 qemu_thread_get_self(cpu->thread);
1597 cpu->thread_id = qemu_get_thread_id();
1598 cpu->can_do_io = 1;
1599 current_cpu = cpu;
1601 hvf_init_vcpu(cpu);
1603 /* signal CPU creation */
1604 cpu->created = true;
1605 qemu_cond_signal(&qemu_cpu_cond);
1607 do {
1608 if (cpu_can_run(cpu)) {
1609 r = hvf_vcpu_exec(cpu);
1610 if (r == EXCP_DEBUG) {
1611 cpu_handle_guest_debug(cpu);
1614 qemu_wait_io_event(cpu);
1615 } while (!cpu->unplug || cpu_can_run(cpu));
1617 hvf_vcpu_destroy(cpu);
1618 cpu->created = false;
1619 qemu_cond_signal(&qemu_cpu_cond);
1620 qemu_mutex_unlock_iothread();
1621 rcu_unregister_thread();
1622 return NULL;
1625 static void *qemu_whpx_cpu_thread_fn(void *arg)
1627 CPUState *cpu = arg;
1628 int r;
1630 rcu_register_thread();
1632 qemu_mutex_lock_iothread();
1633 qemu_thread_get_self(cpu->thread);
1634 cpu->thread_id = qemu_get_thread_id();
1635 current_cpu = cpu;
1637 r = whpx_init_vcpu(cpu);
1638 if (r < 0) {
1639 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1640 exit(1);
1643 /* signal CPU creation */
1644 cpu->created = true;
1645 qemu_cond_signal(&qemu_cpu_cond);
1647 do {
1648 if (cpu_can_run(cpu)) {
1649 r = whpx_vcpu_exec(cpu);
1650 if (r == EXCP_DEBUG) {
1651 cpu_handle_guest_debug(cpu);
1654 while (cpu_thread_is_idle(cpu)) {
1655 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1657 qemu_wait_io_event_common(cpu);
1658 } while (!cpu->unplug || cpu_can_run(cpu));
1660 whpx_destroy_vcpu(cpu);
1661 cpu->created = false;
1662 qemu_cond_signal(&qemu_cpu_cond);
1663 qemu_mutex_unlock_iothread();
1664 rcu_unregister_thread();
1665 return NULL;
1668 #ifdef _WIN32
1669 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1672 #endif
1674 /* Multi-threaded TCG
1676 * In the multi-threaded case each vCPU has its own thread. The TLS
1677 * variable current_cpu can be used deep in the code to find the
1678 * current CPUState for a given thread.
1681 static void *qemu_tcg_cpu_thread_fn(void *arg)
1683 CPUState *cpu = arg;
1685 assert(tcg_enabled());
1686 g_assert(!use_icount);
1688 rcu_register_thread();
1689 tcg_register_thread();
1691 qemu_mutex_lock_iothread();
1692 qemu_thread_get_self(cpu->thread);
1694 cpu->thread_id = qemu_get_thread_id();
1695 cpu->created = true;
1696 cpu->can_do_io = 1;
1697 current_cpu = cpu;
1698 qemu_cond_signal(&qemu_cpu_cond);
1700 /* process any pending work */
1701 cpu->exit_request = 1;
1703 do {
1704 if (cpu_can_run(cpu)) {
1705 int r;
1706 qemu_mutex_unlock_iothread();
1707 r = tcg_cpu_exec(cpu);
1708 qemu_mutex_lock_iothread();
1709 switch (r) {
1710 case EXCP_DEBUG:
1711 cpu_handle_guest_debug(cpu);
1712 break;
1713 case EXCP_HALTED:
1714 /* during start-up the vCPU is reset and the thread is
1715 * kicked several times. If we don't ensure we go back
1716 * to sleep in the halted state we won't cleanly
1717 * start up when the vCPU is enabled.
1719 * cpu->halted should ensure we sleep in wait_io_event
1721 g_assert(cpu->halted);
1722 break;
1723 case EXCP_ATOMIC:
1724 qemu_mutex_unlock_iothread();
1725 cpu_exec_step_atomic(cpu);
1726 qemu_mutex_lock_iothread();
1727 default:
1728 /* Ignore everything else? */
1729 break;
1733 atomic_mb_set(&cpu->exit_request, 0);
1734 qemu_wait_io_event(cpu);
1735 } while (!cpu->unplug || cpu_can_run(cpu));
1737 qemu_tcg_destroy_vcpu(cpu);
1738 cpu->created = false;
1739 qemu_cond_signal(&qemu_cpu_cond);
1740 qemu_mutex_unlock_iothread();
1741 rcu_unregister_thread();
1742 return NULL;
1745 static void qemu_cpu_kick_thread(CPUState *cpu)
1747 #ifndef _WIN32
1748 int err;
1750 if (cpu->thread_kicked) {
1751 return;
1753 cpu->thread_kicked = true;
1754 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1755 if (err) {
1756 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1757 exit(1);
1759 #else /* _WIN32 */
1760 if (!qemu_cpu_is_self(cpu)) {
1761 if (whpx_enabled()) {
1762 whpx_vcpu_kick(cpu);
1763 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1764 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1765 __func__, GetLastError());
1766 exit(1);
1769 #endif
1772 void qemu_cpu_kick(CPUState *cpu)
1774 qemu_cond_broadcast(cpu->halt_cond);
1775 if (tcg_enabled()) {
1776 cpu_exit(cpu);
1777 /* NOP unless doing single-thread RR */
1778 qemu_cpu_kick_rr_cpu();
1779 } else {
1780 if (hax_enabled()) {
1782 * FIXME: race condition with the exit_request check in
1783 * hax_vcpu_hax_exec
1785 cpu->exit_request = 1;
1787 qemu_cpu_kick_thread(cpu);
1791 void qemu_cpu_kick_self(void)
1793 assert(current_cpu);
1794 qemu_cpu_kick_thread(current_cpu);
1797 bool qemu_cpu_is_self(CPUState *cpu)
1799 return qemu_thread_is_self(cpu->thread);
1802 bool qemu_in_vcpu_thread(void)
1804 return current_cpu && qemu_cpu_is_self(current_cpu);
1807 static __thread bool iothread_locked = false;
1809 bool qemu_mutex_iothread_locked(void)
1811 return iothread_locked;
1815 * The BQL is taken from so many places that it is worth profiling the
1816 * callers directly, instead of funneling them all through a single function.
1818 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1820 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1822 g_assert(!qemu_mutex_iothread_locked());
1823 bql_lock(&qemu_global_mutex, file, line);
1824 iothread_locked = true;
1827 void qemu_mutex_unlock_iothread(void)
1829 g_assert(qemu_mutex_iothread_locked());
1830 iothread_locked = false;
1831 qemu_mutex_unlock(&qemu_global_mutex);
1834 static bool all_vcpus_paused(void)
1836 CPUState *cpu;
1838 CPU_FOREACH(cpu) {
1839 if (!cpu->stopped) {
1840 return false;
1844 return true;
1847 void pause_all_vcpus(void)
1849 CPUState *cpu;
1851 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1852 CPU_FOREACH(cpu) {
1853 if (qemu_cpu_is_self(cpu)) {
1854 qemu_cpu_stop(cpu, true);
1855 } else {
1856 cpu->stop = true;
1857 qemu_cpu_kick(cpu);
1861 /* We need to drop the replay_lock so any vCPU threads woken up
1862 * can finish their replay tasks
1864 replay_mutex_unlock();
1866 while (!all_vcpus_paused()) {
1867 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1868 CPU_FOREACH(cpu) {
1869 qemu_cpu_kick(cpu);
1873 qemu_mutex_unlock_iothread();
1874 replay_mutex_lock();
1875 qemu_mutex_lock_iothread();
1878 void cpu_resume(CPUState *cpu)
1880 cpu->stop = false;
1881 cpu->stopped = false;
1882 qemu_cpu_kick(cpu);
1885 void resume_all_vcpus(void)
1887 CPUState *cpu;
1889 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1890 CPU_FOREACH(cpu) {
1891 cpu_resume(cpu);
1895 void cpu_remove_sync(CPUState *cpu)
1897 cpu->stop = true;
1898 cpu->unplug = true;
1899 qemu_cpu_kick(cpu);
1900 qemu_mutex_unlock_iothread();
1901 qemu_thread_join(cpu->thread);
1902 qemu_mutex_lock_iothread();
1905 /* For temporary buffers for forming a name */
1906 #define VCPU_THREAD_NAME_SIZE 16
1908 static void qemu_tcg_init_vcpu(CPUState *cpu)
1910 char thread_name[VCPU_THREAD_NAME_SIZE];
1911 static QemuCond *single_tcg_halt_cond;
1912 static QemuThread *single_tcg_cpu_thread;
1913 static int tcg_region_inited;
1915 assert(tcg_enabled());
1917 * Initialize TCG regions--once. Now is a good time, because:
1918 * (1) TCG's init context, prologue and target globals have been set up.
1919 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1920 * -accel flag is processed, so the check doesn't work then).
1922 if (!tcg_region_inited) {
1923 tcg_region_inited = 1;
1924 tcg_region_init();
1927 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1928 cpu->thread = g_malloc0(sizeof(QemuThread));
1929 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1930 qemu_cond_init(cpu->halt_cond);
1932 if (qemu_tcg_mttcg_enabled()) {
1933 /* create a thread per vCPU with TCG (MTTCG) */
1934 parallel_cpus = true;
1935 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1936 cpu->cpu_index);
1938 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1939 cpu, QEMU_THREAD_JOINABLE);
1941 } else {
1942 /* share a single thread for all cpus with TCG */
1943 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1944 qemu_thread_create(cpu->thread, thread_name,
1945 qemu_tcg_rr_cpu_thread_fn,
1946 cpu, QEMU_THREAD_JOINABLE);
1948 single_tcg_halt_cond = cpu->halt_cond;
1949 single_tcg_cpu_thread = cpu->thread;
1951 #ifdef _WIN32
1952 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1953 #endif
1954 } else {
1955 /* For non-MTTCG cases we share the thread */
1956 cpu->thread = single_tcg_cpu_thread;
1957 cpu->halt_cond = single_tcg_halt_cond;
1958 cpu->thread_id = first_cpu->thread_id;
1959 cpu->can_do_io = 1;
1960 cpu->created = true;
1964 static void qemu_hax_start_vcpu(CPUState *cpu)
1966 char thread_name[VCPU_THREAD_NAME_SIZE];
1968 cpu->thread = g_malloc0(sizeof(QemuThread));
1969 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1970 qemu_cond_init(cpu->halt_cond);
1972 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1973 cpu->cpu_index);
1974 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1975 cpu, QEMU_THREAD_JOINABLE);
1976 #ifdef _WIN32
1977 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1978 #endif
1981 static void qemu_kvm_start_vcpu(CPUState *cpu)
1983 char thread_name[VCPU_THREAD_NAME_SIZE];
1985 cpu->thread = g_malloc0(sizeof(QemuThread));
1986 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1987 qemu_cond_init(cpu->halt_cond);
1988 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1989 cpu->cpu_index);
1990 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1991 cpu, QEMU_THREAD_JOINABLE);
1994 static void qemu_hvf_start_vcpu(CPUState *cpu)
1996 char thread_name[VCPU_THREAD_NAME_SIZE];
1998 /* HVF currently does not support TCG, and only runs in
1999 * unrestricted-guest mode. */
2000 assert(hvf_enabled());
2002 cpu->thread = g_malloc0(sizeof(QemuThread));
2003 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2004 qemu_cond_init(cpu->halt_cond);
2006 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2007 cpu->cpu_index);
2008 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2009 cpu, QEMU_THREAD_JOINABLE);
2012 static void qemu_whpx_start_vcpu(CPUState *cpu)
2014 char thread_name[VCPU_THREAD_NAME_SIZE];
2016 cpu->thread = g_malloc0(sizeof(QemuThread));
2017 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2018 qemu_cond_init(cpu->halt_cond);
2019 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2020 cpu->cpu_index);
2021 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2022 cpu, QEMU_THREAD_JOINABLE);
2023 #ifdef _WIN32
2024 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2025 #endif
2028 static void qemu_dummy_start_vcpu(CPUState *cpu)
2030 char thread_name[VCPU_THREAD_NAME_SIZE];
2032 cpu->thread = g_malloc0(sizeof(QemuThread));
2033 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2034 qemu_cond_init(cpu->halt_cond);
2035 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2036 cpu->cpu_index);
2037 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2038 QEMU_THREAD_JOINABLE);
2041 void qemu_init_vcpu(CPUState *cpu)
2043 cpu->nr_cores = smp_cores;
2044 cpu->nr_threads = smp_threads;
2045 cpu->stopped = true;
2047 if (!cpu->as) {
2048 /* If the target cpu hasn't set up any address spaces itself,
2049 * give it the default one.
2051 cpu->num_ases = 1;
2052 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2055 if (kvm_enabled()) {
2056 qemu_kvm_start_vcpu(cpu);
2057 } else if (hax_enabled()) {
2058 qemu_hax_start_vcpu(cpu);
2059 } else if (hvf_enabled()) {
2060 qemu_hvf_start_vcpu(cpu);
2061 } else if (tcg_enabled()) {
2062 qemu_tcg_init_vcpu(cpu);
2063 } else if (whpx_enabled()) {
2064 qemu_whpx_start_vcpu(cpu);
2065 } else {
2066 qemu_dummy_start_vcpu(cpu);
2069 while (!cpu->created) {
2070 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2074 void cpu_stop_current(void)
2076 if (current_cpu) {
2077 qemu_cpu_stop(current_cpu, true);
2081 int vm_stop(RunState state)
2083 if (qemu_in_vcpu_thread()) {
2084 qemu_system_vmstop_request_prepare();
2085 qemu_system_vmstop_request(state);
2087 * FIXME: should not return to device code in case
2088 * vm_stop() has been requested.
2090 cpu_stop_current();
2091 return 0;
2094 return do_vm_stop(state, true);
2098 * Prepare for (re)starting the VM.
2099 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2100 * running or in case of an error condition), 0 otherwise.
2102 int vm_prepare_start(void)
2104 RunState requested;
2106 qemu_vmstop_requested(&requested);
2107 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2108 return -1;
2111 /* Ensure that a STOP/RESUME pair of events is emitted if a
2112 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2113 * example, according to documentation is always followed by
2114 * the STOP event.
2116 if (runstate_is_running()) {
2117 qapi_event_send_stop();
2118 qapi_event_send_resume();
2119 return -1;
2122 /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2123 qapi_event_send_resume();
2125 replay_enable_events();
2126 cpu_enable_ticks();
2127 runstate_set(RUN_STATE_RUNNING);
2128 vm_state_notify(1, RUN_STATE_RUNNING);
2129 return 0;
2132 void vm_start(void)
2134 if (!vm_prepare_start()) {
2135 resume_all_vcpus();
2139 /* does a state transition even if the VM is already stopped;
2140 the current state is forgotten forever */
2141 int vm_stop_force_state(RunState state)
2143 if (runstate_is_running()) {
2144 return vm_stop(state);
2145 } else {
2146 runstate_set(state);
2148 bdrv_drain_all();
2149 /* Make sure to return an error if the flush in a previous vm_stop()
2150 * failed. */
2151 return bdrv_flush_all();
2155 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2157 /* XXX: implement xxx_cpu_list for targets that still miss it */
2158 #if defined(cpu_list)
2159 cpu_list(f, cpu_fprintf);
2160 #endif
2163 CpuInfoList *qmp_query_cpus(Error **errp)
2165 MachineState *ms = MACHINE(qdev_get_machine());
2166 MachineClass *mc = MACHINE_GET_CLASS(ms);
2167 CpuInfoList *head = NULL, *cur_item = NULL;
2168 CPUState *cpu;
2170 CPU_FOREACH(cpu) {
2171 CpuInfoList *info;
2172 #if defined(TARGET_I386)
2173 X86CPU *x86_cpu = X86_CPU(cpu);
2174 CPUX86State *env = &x86_cpu->env;
2175 #elif defined(TARGET_PPC)
2176 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2177 CPUPPCState *env = &ppc_cpu->env;
2178 #elif defined(TARGET_SPARC)
2179 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2180 CPUSPARCState *env = &sparc_cpu->env;
2181 #elif defined(TARGET_RISCV)
2182 RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2183 CPURISCVState *env = &riscv_cpu->env;
2184 #elif defined(TARGET_MIPS)
2185 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2186 CPUMIPSState *env = &mips_cpu->env;
2187 #elif defined(TARGET_TRICORE)
2188 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2189 CPUTriCoreState *env = &tricore_cpu->env;
2190 #elif defined(TARGET_S390X)
2191 S390CPU *s390_cpu = S390_CPU(cpu);
2192 CPUS390XState *env = &s390_cpu->env;
2193 #endif
2195 cpu_synchronize_state(cpu);
2197 info = g_malloc0(sizeof(*info));
2198 info->value = g_malloc0(sizeof(*info->value));
2199 info->value->CPU = cpu->cpu_index;
2200 info->value->current = (cpu == first_cpu);
2201 info->value->halted = cpu->halted;
2202 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2203 info->value->thread_id = cpu->thread_id;
2204 #if defined(TARGET_I386)
2205 info->value->arch = CPU_INFO_ARCH_X86;
2206 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2207 #elif defined(TARGET_PPC)
2208 info->value->arch = CPU_INFO_ARCH_PPC;
2209 info->value->u.ppc.nip = env->nip;
2210 #elif defined(TARGET_SPARC)
2211 info->value->arch = CPU_INFO_ARCH_SPARC;
2212 info->value->u.q_sparc.pc = env->pc;
2213 info->value->u.q_sparc.npc = env->npc;
2214 #elif defined(TARGET_MIPS)
2215 info->value->arch = CPU_INFO_ARCH_MIPS;
2216 info->value->u.q_mips.PC = env->active_tc.PC;
2217 #elif defined(TARGET_TRICORE)
2218 info->value->arch = CPU_INFO_ARCH_TRICORE;
2219 info->value->u.tricore.PC = env->PC;
2220 #elif defined(TARGET_S390X)
2221 info->value->arch = CPU_INFO_ARCH_S390;
2222 info->value->u.s390.cpu_state = env->cpu_state;
2223 #elif defined(TARGET_RISCV)
2224 info->value->arch = CPU_INFO_ARCH_RISCV;
2225 info->value->u.riscv.pc = env->pc;
2226 #else
2227 info->value->arch = CPU_INFO_ARCH_OTHER;
2228 #endif
2229 info->value->has_props = !!mc->cpu_index_to_instance_props;
2230 if (info->value->has_props) {
2231 CpuInstanceProperties *props;
2232 props = g_malloc0(sizeof(*props));
2233 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2234 info->value->props = props;
2237 /* XXX: waiting for the qapi to support GSList */
2238 if (!cur_item) {
2239 head = cur_item = info;
2240 } else {
2241 cur_item->next = info;
2242 cur_item = info;
2246 return head;
2249 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2252 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2253 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2255 switch (target) {
2256 case SYS_EMU_TARGET_I386:
2257 case SYS_EMU_TARGET_X86_64:
2258 return CPU_INFO_ARCH_X86;
2260 case SYS_EMU_TARGET_PPC:
2261 case SYS_EMU_TARGET_PPC64:
2262 return CPU_INFO_ARCH_PPC;
2264 case SYS_EMU_TARGET_SPARC:
2265 case SYS_EMU_TARGET_SPARC64:
2266 return CPU_INFO_ARCH_SPARC;
2268 case SYS_EMU_TARGET_MIPS:
2269 case SYS_EMU_TARGET_MIPSEL:
2270 case SYS_EMU_TARGET_MIPS64:
2271 case SYS_EMU_TARGET_MIPS64EL:
2272 return CPU_INFO_ARCH_MIPS;
2274 case SYS_EMU_TARGET_TRICORE:
2275 return CPU_INFO_ARCH_TRICORE;
2277 case SYS_EMU_TARGET_S390X:
2278 return CPU_INFO_ARCH_S390;
2280 case SYS_EMU_TARGET_RISCV32:
2281 case SYS_EMU_TARGET_RISCV64:
2282 return CPU_INFO_ARCH_RISCV;
2284 default:
2285 return CPU_INFO_ARCH_OTHER;
2289 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2291 #ifdef TARGET_S390X
2292 S390CPU *s390_cpu = S390_CPU(cpu);
2293 CPUS390XState *env = &s390_cpu->env;
2295 info->cpu_state = env->cpu_state;
2296 #else
2297 abort();
2298 #endif
2302 * fast means: we NEVER interrupt vCPU threads to retrieve
2303 * information from KVM.
2305 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2307 MachineState *ms = MACHINE(qdev_get_machine());
2308 MachineClass *mc = MACHINE_GET_CLASS(ms);
2309 CpuInfoFastList *head = NULL, *cur_item = NULL;
2310 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2311 -1, &error_abort);
2312 CPUState *cpu;
2314 CPU_FOREACH(cpu) {
2315 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2316 info->value = g_malloc0(sizeof(*info->value));
2318 info->value->cpu_index = cpu->cpu_index;
2319 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2320 info->value->thread_id = cpu->thread_id;
2322 info->value->has_props = !!mc->cpu_index_to_instance_props;
2323 if (info->value->has_props) {
2324 CpuInstanceProperties *props;
2325 props = g_malloc0(sizeof(*props));
2326 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2327 info->value->props = props;
2330 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2331 info->value->target = target;
2332 if (target == SYS_EMU_TARGET_S390X) {
2333 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2336 if (!cur_item) {
2337 head = cur_item = info;
2338 } else {
2339 cur_item->next = info;
2340 cur_item = info;
2344 return head;
2347 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2348 bool has_cpu, int64_t cpu_index, Error **errp)
2350 FILE *f;
2351 uint32_t l;
2352 CPUState *cpu;
2353 uint8_t buf[1024];
2354 int64_t orig_addr = addr, orig_size = size;
2356 if (!has_cpu) {
2357 cpu_index = 0;
2360 cpu = qemu_get_cpu(cpu_index);
2361 if (cpu == NULL) {
2362 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2363 "a CPU number");
2364 return;
2367 f = fopen(filename, "wb");
2368 if (!f) {
2369 error_setg_file_open(errp, errno, filename);
2370 return;
2373 while (size != 0) {
2374 l = sizeof(buf);
2375 if (l > size)
2376 l = size;
2377 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2378 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2379 " specified", orig_addr, orig_size);
2380 goto exit;
2382 if (fwrite(buf, 1, l, f) != l) {
2383 error_setg(errp, QERR_IO_ERROR);
2384 goto exit;
2386 addr += l;
2387 size -= l;
2390 exit:
2391 fclose(f);
2394 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2395 Error **errp)
2397 FILE *f;
2398 uint32_t l;
2399 uint8_t buf[1024];
2401 f = fopen(filename, "wb");
2402 if (!f) {
2403 error_setg_file_open(errp, errno, filename);
2404 return;
2407 while (size != 0) {
2408 l = sizeof(buf);
2409 if (l > size)
2410 l = size;
2411 cpu_physical_memory_read(addr, buf, l);
2412 if (fwrite(buf, 1, l, f) != l) {
2413 error_setg(errp, QERR_IO_ERROR);
2414 goto exit;
2416 addr += l;
2417 size -= l;
2420 exit:
2421 fclose(f);
2424 void qmp_inject_nmi(Error **errp)
2426 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2429 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2431 if (!use_icount) {
2432 return;
2435 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2436 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2437 if (icount_align_option) {
2438 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2439 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2440 } else {
2441 cpu_fprintf(f, "Max guest delay NA\n");
2442 cpu_fprintf(f, "Max guest advance NA\n");