cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu-common.h"
  27 #include "qemu/config-file.h"
  28 #include "monitor/monitor.h"
  29 #include "qapi/error.h"
  30 #include "qapi/qapi-commands-misc.h"
  31 #include "qapi/qapi-events-run-state.h"
  32 #include "qapi/qmp/qerror.h"
  33 #include "qemu/error-report.h"
  34 #include "qemu/qemu-print.h"
  35 #include "sysemu/tcg.h"
  36 #include "sysemu/block-backend.h"
  37 #include "exec/gdbstub.h"
  38 #include "sysemu/dma.h"
  39 #include "sysemu/hw_accel.h"
  40 #include "sysemu/kvm.h"
  41 #include "sysemu/hax.h"
  42 #include "sysemu/hvf.h"
  43 #include "sysemu/whpx.h"
  44 #include "exec/exec-all.h"
  45
  46 #include "qemu/thread.h"
  47 #include "sysemu/cpus.h"
  48 #include "sysemu/qtest.h"
  49 #include "qemu/main-loop.h"
  50 #include "qemu/option.h"
  51 #include "qemu/bitmap.h"
  52 #include "qemu/seqlock.h"
  53 #include "qemu/guest-random.h"
  54 #include "tcg.h"
  55 #include "hw/nmi.h"
  56 #include "sysemu/replay.h"
  57
  58 #ifdef CONFIG_LINUX
  59
  60 #include <sys/prctl.h>
  61
  62 #ifndef PR_MCE_KILL
  63 #define PR_MCE_KILL 33
  64 #endif
  65
  66 #ifndef PR_MCE_KILL_SET
  67 #define PR_MCE_KILL_SET 1
  68 #endif
  69
  70 #ifndef PR_MCE_KILL_EARLY
  71 #define PR_MCE_KILL_EARLY 1
  72 #endif
  73
  74 #endif /* CONFIG_LINUX */
  75
/*
 * Exported limits; nothing in this part of the file reads or writes them.
 * NOTE(review): presumably the largest observed guest delay/advance for
 * icount alignment -- confirm against the users of these symbols.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer that runs cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage, accessed atomically; 0 means "not throttling". */
static unsigned int throttle_percentage;

/* cpu_throttle_set() clamps the requested percentage into [MIN, MAX]. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base throttle timeslice: 10,000,000 ns, i.e. 10 ms. */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  86
  87 bool cpu_is_stopped(CPUState *cpu)
  88 {
  89     return cpu->stopped || !runstate_is_running();
  90 }
  91
  92 static bool cpu_thread_is_idle(CPUState *cpu)
  93 {
  94     if (cpu->stop || cpu->queued_work_first) {
  95         return false;
  96     }
  97     if (cpu_is_stopped(cpu)) {
  98         return true;
  99     }
 100     if (!cpu->halted || cpu_has_work(cpu) ||
 101         kvm_halt_in_kernel()) {
 102         return false;
 103     }
 104     return true;
 105 }
 106
 107 static bool all_cpu_threads_idle(void)
 108 {
 109     CPUState *cpu;
 110
 111     CPU_FOREACH(cpu) {
 112         if (!cpu_thread_is_idle(cpu)) {
 113             return false;
 114         }
 115     }
 116     return true;
 117 }
 118
 119 /***********************************************************/
 120 /* guest cycle counter */
 121
 122 /* Protected by TimersState seqlock */
 123
/*
 * Whether vCPUs may sleep in icount mode; configure_icount() sets it from
 * the "sleep" option.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10

/* Bookkeeping for the guest tick counter, the VM clock and icount. */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value handed out by cpu_get_ticks_locked(). */
    int64_t cpu_ticks_prev;
    /* Added to the host tick counter to form the guest tick count. */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /*
     * QEMU_CLOCK_VIRTUAL_RT value at which a pending warp began;
     * icount_warp_rt() treats -1 as "no warp pending".
     */
    int64_t vm_clock_warp_start;
    /* Added to get_clock() to form the VM clock, see cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;

/* The single instance every function in this file operates on. */
static TimersState timers_state;
/* Whether multi-threaded TCG is in use; assigned by qemu_tcg_configure(). */
bool mttcg_enabled;
 161
 162 /*
 163  * We default to false if we know other options have been enabled
 164  * which are currently incompatible with MTTCG. Otherwise when each
 165  * guest (target) has been updated to support:
 166  *   - atomic instructions
 167  *   - memory ordering primitives (barriers)
 168  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 169  *
 170  * Once a guest architecture has been converted to the new primitives
 171  * there are two remaining limitations to check.
 172  *
 173  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 174  * - The host must have a stronger memory order than the guest
 175  *
 176  * It may be possible in future to support strong guests on weak hosts
 177  * but that will require tagging all load/stores in a guest with their
 178  * implicit memory order requirements which would likely slow things
 179  * down a lot.
 180  */
 181
 182 static bool check_tcg_memory_orders_compatible(void)
 183 {
 184 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 185     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 186 #else
 187     return false;
 188 #endif
 189 }
 190
 191 static bool default_mttcg_enabled(void)
 192 {
 193     if (use_icount || TCG_OVERSIZED_GUEST) {
 194         return false;
 195     } else {
 196 #ifdef TARGET_SUPPORTS_MTTCG
 197         return check_tcg_memory_orders_compatible();
 198 #else
 199         return false;
 200 #endif
 201     }
 202 }
 203
 204 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 205 {
 206     const char *t = qemu_opt_get(opts, "thread");
 207     if (t) {
 208         if (strcmp(t, "multi") == 0) {
 209             if (TCG_OVERSIZED_GUEST) {
 210                 error_setg(errp, "No MTTCG when guest word size > hosts");
 211             } else if (use_icount) {
 212                 error_setg(errp, "No MTTCG when icount is enabled");
 213             } else {
 214 #ifndef TARGET_SUPPORTS_MTTCG
 215                 warn_report("Guest not yet converted to MTTCG - "
 216                             "you may get unexpected results");
 217 #endif
 218                 if (!check_tcg_memory_orders_compatible()) {
 219                     warn_report("Guest expects a stronger memory ordering "
 220                                 "than the host provides");
 221                     error_printf("This may cause strange/hard to debug errors\n");
 222                 }
 223                 mttcg_enabled = true;
 224             }
 225         } else if (strcmp(t, "single") == 0) {
 226             mttcg_enabled = false;
 227         } else {
 228             error_setg(errp, "Invalid 'thread' setting %s", t);
 229         }
 230     } else {
 231         mttcg_enabled = default_mttcg_enabled();
 232     }
 233 }
 234
 235 /* The current number of executed instructions is based on what we
 236  * originally budgeted minus the current state of the decrementing
 237  * icount counters in extra/u16.low.
 238  */
 239 static int64_t cpu_get_icount_executed(CPUState *cpu)
 240 {
 241     return (cpu->icount_budget -
 242             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 243 }
 244
 245 /*
 246  * Update the global shared timer_state.qemu_icount to take into
 247  * account executed instructions. This is done by the TCG vCPU
 248  * thread so the main-loop can see time has moved forward.
 249  */
 250 static void cpu_update_icount_locked(CPUState *cpu)
 251 {
 252     int64_t executed = cpu_get_icount_executed(cpu);
 253     cpu->icount_budget -= executed;
 254
 255     atomic_set_i64(&timers_state.qemu_icount,
 256                    timers_state.qemu_icount + executed);
 257 }
 258
 259 /*
 260  * Update the global shared timer_state.qemu_icount to take into
 261  * account executed instructions. This is done by the TCG vCPU
 262  * thread so the main-loop can see time has moved forward.
 263  */
 264 void cpu_update_icount(CPUState *cpu)
 265 {
 266     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 267                        &timers_state.vm_clock_lock);
 268     cpu_update_icount_locked(cpu);
 269     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 270                          &timers_state.vm_clock_lock);
 271 }
 272
 273 static int64_t cpu_get_icount_raw_locked(void)
 274 {
 275     CPUState *cpu = current_cpu;
 276
 277     if (cpu && cpu->running) {
 278         if (!cpu->can_do_io) {
 279             error_report("Bad icount read");
 280             exit(1);
 281         }
 282         /* Take into account what has run */
 283         cpu_update_icount_locked(cpu);
 284     }
 285     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 286     return atomic_read_i64(&timers_state.qemu_icount);
 287 }
 288
 289 static int64_t cpu_get_icount_locked(void)
 290 {
 291     int64_t icount = cpu_get_icount_raw_locked();
 292     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 293         cpu_icount_to_ns(icount);
 294 }
 295
 296 int64_t cpu_get_icount_raw(void)
 297 {
 298     int64_t icount;
 299     unsigned start;
 300
 301     do {
 302         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 303         icount = cpu_get_icount_raw_locked();
 304     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 305
 306     return icount;
 307 }
 308
 309 /* Return the virtual CPU time, based on the instruction counter.  */
 310 int64_t cpu_get_icount(void)
 311 {
 312     int64_t icount;
 313     unsigned start;
 314
 315     do {
 316         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 317         icount = cpu_get_icount_locked();
 318     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 319
 320     return icount;
 321 }
 322
 323 int64_t cpu_icount_to_ns(int64_t icount)
 324 {
 325     return icount << atomic_read(&timers_state.icount_time_shift);
 326 }
 327
 328 static int64_t cpu_get_ticks_locked(void)
 329 {
 330     int64_t ticks = timers_state.cpu_ticks_offset;
 331     if (timers_state.cpu_ticks_enabled) {
 332         ticks += cpu_get_host_ticks();
 333     }
 334
 335     if (timers_state.cpu_ticks_prev > ticks) {
 336         /* Non increasing ticks may happen if the host uses software suspend.  */
 337         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 338         ticks = timers_state.cpu_ticks_prev;
 339     }
 340
 341     timers_state.cpu_ticks_prev = ticks;
 342     return ticks;
 343 }
 344
 345 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 346  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 347  * counter.
 348  */
 349 int64_t cpu_get_ticks(void)
 350 {
 351     int64_t ticks;
 352
 353     if (use_icount) {
 354         return cpu_get_icount();
 355     }
 356
 357     qemu_spin_lock(&timers_state.vm_clock_lock);
 358     ticks = cpu_get_ticks_locked();
 359     qemu_spin_unlock(&timers_state.vm_clock_lock);
 360     return ticks;
 361 }
 362
 363 static int64_t cpu_get_clock_locked(void)
 364 {
 365     int64_t time;
 366
 367     time = timers_state.cpu_clock_offset;
 368     if (timers_state.cpu_ticks_enabled) {
 369         time += get_clock();
 370     }
 371
 372     return time;
 373 }
 374
 375 /* Return the monotonic time elapsed in VM, i.e.,
 376  * the time between vm_start and vm_stop
 377  */
 378 int64_t cpu_get_clock(void)
 379 {
 380     int64_t ti;
 381     unsigned start;
 382
 383     do {
 384         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 385         ti = cpu_get_clock_locked();
 386     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 387
 388     return ti;
 389 }
 390
 391 /* enable cpu_get_ticks()
 392  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 393  */
 394 void cpu_enable_ticks(void)
 395 {
 396     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 397                        &timers_state.vm_clock_lock);
 398     if (!timers_state.cpu_ticks_enabled) {
 399         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 400         timers_state.cpu_clock_offset -= get_clock();
 401         timers_state.cpu_ticks_enabled = 1;
 402     }
 403     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 404                        &timers_state.vm_clock_lock);
 405 }
 406
 407 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 408  * cpu_get_ticks() after that.
 409  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 410  */
 411 void cpu_disable_ticks(void)
 412 {
 413     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 414                        &timers_state.vm_clock_lock);
 415     if (timers_state.cpu_ticks_enabled) {
 416         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 417         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 418         timers_state.cpu_ticks_enabled = 0;
 419     }
 420     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 421                          &timers_state.vm_clock_lock);
 422 }
 423
 424 /* Correlation between real and virtual time is always going to be
 425    fairly approximate, so ignore small variation.
 426    When the guest is idle real and virtual time will be aligned in
 427    the IO wait loop.  */
 428 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 429
 430 static void icount_adjust(void)
 431 {
 432     int64_t cur_time;
 433     int64_t cur_icount;
 434     int64_t delta;
 435
 436     /* Protected by TimersState mutex.  */
 437     static int64_t last_delta;
 438
 439     /* If the VM is not running, then do nothing.  */
 440     if (!runstate_is_running()) {
 441         return;
 442     }
 443
 444     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 445                        &timers_state.vm_clock_lock);
 446     cur_time = cpu_get_clock_locked();
 447     cur_icount = cpu_get_icount_locked();
 448
 449     delta = cur_icount - cur_time;
 450     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 451     if (delta > 0
 452         && last_delta + ICOUNT_WOBBLE < delta * 2
 453         && timers_state.icount_time_shift > 0) {
 454         /* The guest is getting too far ahead.  Slow time down.  */
 455         atomic_set(&timers_state.icount_time_shift,
 456                    timers_state.icount_time_shift - 1);
 457     }
 458     if (delta < 0
 459         && last_delta - ICOUNT_WOBBLE > delta * 2
 460         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 461         /* The guest is getting too far behind.  Speed time up.  */
 462         atomic_set(&timers_state.icount_time_shift,
 463                    timers_state.icount_time_shift + 1);
 464     }
 465     last_delta = delta;
 466     atomic_set_i64(&timers_state.qemu_icount_bias,
 467                    cur_icount - (timers_state.qemu_icount
 468                                  << timers_state.icount_time_shift));
 469     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 470                          &timers_state.vm_clock_lock);
 471 }
 472
 473 static void icount_adjust_rt(void *opaque)
 474 {
 475     timer_mod(timers_state.icount_rt_timer,
 476               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 477     icount_adjust();
 478 }
 479
 480 static void icount_adjust_vm(void *opaque)
 481 {
 482     timer_mod(timers_state.icount_vm_timer,
 483                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 484                    NANOSECONDS_PER_SECOND / 10);
 485     icount_adjust();
 486 }
 487
 488 static int64_t qemu_icount_round(int64_t count)
 489 {
 490     int shift = atomic_read(&timers_state.icount_time_shift);
 491     return (count + (1 << shift) - 1) >> shift;
 492 }
 493
 494 static void icount_warp_rt(void)
 495 {
 496     unsigned seq;
 497     int64_t warp_start;
 498
 499     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 500      * changes from -1 to another value, so the race here is okay.
 501      */
 502     do {
 503         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 504         warp_start = timers_state.vm_clock_warp_start;
 505     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 506
 507     if (warp_start == -1) {
 508         return;
 509     }
 510
 511     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 512                        &timers_state.vm_clock_lock);
 513     if (runstate_is_running()) {
 514         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 515                                             cpu_get_clock_locked());
 516         int64_t warp_delta;
 517
 518         warp_delta = clock - timers_state.vm_clock_warp_start;
 519         if (use_icount == 2) {
 520             /*
 521              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 522              * far ahead of real time.
 523              */
 524             int64_t cur_icount = cpu_get_icount_locked();
 525             int64_t delta = clock - cur_icount;
 526             warp_delta = MIN(warp_delta, delta);
 527         }
 528         atomic_set_i64(&timers_state.qemu_icount_bias,
 529                        timers_state.qemu_icount_bias + warp_delta);
 530     }
 531     timers_state.vm_clock_warp_start = -1;
 532     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 533                        &timers_state.vm_clock_lock);
 534
 535     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 536         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 537     }
 538 }
 539
 540 static void icount_timer_cb(void *opaque)
 541 {
 542     /* No need for a checkpoint because the timer already synchronizes
 543      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 544      */
 545     icount_warp_rt();
 546 }
 547
 548 void qtest_clock_warp(int64_t dest)
 549 {
 550     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 551     AioContext *aio_context;
 552     assert(qtest_enabled());
 553     aio_context = qemu_get_aio_context();
 554     while (clock < dest) {
 555         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 556         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 557
 558         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 559                            &timers_state.vm_clock_lock);
 560         atomic_set_i64(&timers_state.qemu_icount_bias,
 561                        timers_state.qemu_icount_bias + warp);
 562         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 563                              &timers_state.vm_clock_lock);
 564
 565         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 566         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 567         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 568     }
 569     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 570 }
 571
 572 void qemu_start_warp_timer(void)
 573 {
 574     int64_t clock;
 575     int64_t deadline;
 576
 577     if (!use_icount) {
 578         return;
 579     }
 580
 581     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 582      * do not fire, so computing the deadline does not make sense.
 583      */
 584     if (!runstate_is_running()) {
 585         return;
 586     }
 587
 588     if (replay_mode != REPLAY_MODE_PLAY) {
 589         if (!all_cpu_threads_idle()) {
 590             return;
 591         }
 592
 593         if (qtest_enabled()) {
 594             /* When testing, qtest commands advance icount.  */
 595             return;
 596         }
 597
 598         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 599     } else {
 600         /* warp clock deterministically in record/replay mode */
 601         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 602             /* vCPU is sleeping and warp can't be started.
 603                It is probably a race condition: notification sent
 604                to vCPU was processed in advance and vCPU went to sleep.
 605                Therefore we have to wake it up for doing someting. */
 606             if (replay_has_checkpoint()) {
 607                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 608             }
 609             return;
 610         }
 611     }
 612
 613     /* We want to use the earliest deadline from ALL vm_clocks */
 614     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 615     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 616     if (deadline < 0) {
 617         static bool notified;
 618         if (!icount_sleep && !notified) {
 619             warn_report("icount sleep disabled and no active timers");
 620             notified = true;
 621         }
 622         return;
 623     }
 624
 625     if (deadline > 0) {
 626         /*
 627          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 628          * sleep.  Otherwise, the CPU might be waiting for a future timer
 629          * interrupt to wake it up, but the interrupt never comes because
 630          * the vCPU isn't running any insns and thus doesn't advance the
 631          * QEMU_CLOCK_VIRTUAL.
 632          */
 633         if (!icount_sleep) {
 634             /*
 635              * We never let VCPUs sleep in no sleep icount mode.
 636              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 637              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 638              * It is useful when we want a deterministic execution time,
 639              * isolated from host latencies.
 640              */
 641             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 642                                &timers_state.vm_clock_lock);
 643             atomic_set_i64(&timers_state.qemu_icount_bias,
 644                            timers_state.qemu_icount_bias + deadline);
 645             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 646                                  &timers_state.vm_clock_lock);
 647             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 648         } else {
 649             /*
 650              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 651              * "real" time, (related to the time left until the next event) has
 652              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 653              * This avoids that the warps are visible externally; for example,
 654              * you will not be sending network packets continuously instead of
 655              * every 100ms.
 656              */
 657             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 658                                &timers_state.vm_clock_lock);
 659             if (timers_state.vm_clock_warp_start == -1
 660                 || timers_state.vm_clock_warp_start > clock) {
 661                 timers_state.vm_clock_warp_start = clock;
 662             }
 663             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 664                                  &timers_state.vm_clock_lock);
 665             timer_mod_anticipate(timers_state.icount_warp_timer,
 666                                  clock + deadline);
 667         }
 668     } else if (deadline == 0) {
 669         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 670     }
 671 }
 672
 673 static void qemu_account_warp_timer(void)
 674 {
 675     if (!use_icount || !icount_sleep) {
 676         return;
 677     }
 678
 679     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 680      * do not fire, so computing the deadline does not make sense.
 681      */
 682     if (!runstate_is_running()) {
 683         return;
 684     }
 685
 686     /* warp clock deterministically in record/replay mode */
 687     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 688         return;
 689     }
 690
 691     timer_del(timers_state.icount_warp_timer);
 692     icount_warp_rt();
 693 }
 694
 695 static bool icount_state_needed(void *opaque)
 696 {
 697     return use_icount;
 698 }
 699
 700 static bool warp_timer_state_needed(void *opaque)
 701 {
 702     TimersState *s = opaque;
 703     return s->icount_warp_timer != NULL;
 704 }
 705
 706 static bool adjust_timers_state_needed(void *opaque)
 707 {
 708     TimersState *s = opaque;
 709     return s->icount_rt_timer != NULL;
 710 }
 711
/*
 * Migration subsection for the warp timer.  It is optional because the
 * timer may never have been created; warp_timer_state_needed() decides
 * whether it is included.  Carries the warp start time and the timer.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 726
/*
 * Migration subsection for the two icount adjustment timers (realtime
 * and virtual).  Included only when adjust_timers_state_needed() says
 * the realtime timer exists.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 738
/*
 * This is a subsection for icount migration.
 * It is included only when icount_state_needed() is true and carries the
 * icount bias and instruction count; the warp timer and the adjustment
 * timers travel in their own nested subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
 758
/*
 * Top-level migration description for TimersState, registered by
 * cpu_ticks_init().  The icount state is attached as a subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* NOTE(review): 8 bytes kept in the stream but mapped to no field,
         * presumably a slot left by a removed member -- confirm. */
        VMSTATE_UNUSED(8),
        /* NOTE(review): the trailing 2 presumably marks this field as
         * present from stream version 2 onwards -- confirm. */
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 774
 775 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 776 {
 777     double pct;
 778     double throttle_ratio;
 779     long sleeptime_ns;
 780
 781     if (!cpu_throttle_get_percentage()) {
 782         return;
 783     }
 784
 785     pct = (double)cpu_throttle_get_percentage()/100;
 786     throttle_ratio = pct / (1 - pct);
 787     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 788
 789     qemu_mutex_unlock_iothread();
 790     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 791     qemu_mutex_lock_iothread();
 792     atomic_set(&cpu->throttle_thread_scheduled, 0);
 793 }
 794
 795 static void cpu_throttle_timer_tick(void *opaque)
 796 {
 797     CPUState *cpu;
 798     double pct;
 799
 800     /* Stop the timer if needed */
 801     if (!cpu_throttle_get_percentage()) {
 802         return;
 803     }
 804     CPU_FOREACH(cpu) {
 805         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 806             async_run_on_cpu(cpu, cpu_throttle_thread,
 807                              RUN_ON_CPU_NULL);
 808         }
 809     }
 810
 811     pct = (double)cpu_throttle_get_percentage()/100;
 812     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 813                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 814 }
 815
 816 void cpu_throttle_set(int new_throttle_pct)
 817 {
 818     /* Ensure throttle percentage is within valid range */
 819     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 820     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 821
 822     atomic_set(&throttle_percentage, new_throttle_pct);
 823
 824     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 825                                        CPU_THROTTLE_TIMESLICE_NS);
 826 }
 827
 828 void cpu_throttle_stop(void)
 829 {
 830     atomic_set(&throttle_percentage, 0);
 831 }
 832
 833 bool cpu_throttle_active(void)
 834 {
 835     return (cpu_throttle_get_percentage() != 0);
 836 }
 837
 838 int cpu_throttle_get_percentage(void)
 839 {
 840     return atomic_read(&throttle_percentage);
 841 }
 842
 843 void cpu_ticks_init(void)
 844 {
 845     seqlock_init(&timers_state.vm_clock_seqlock);
 846     qemu_spin_init(&timers_state.vm_clock_lock);
 847     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 848     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 849                                            cpu_throttle_timer_tick, NULL);
 850 }
 851
 852 void configure_icount(QemuOpts *opts, Error **errp)
 853 {
 854     const char *option;
 855     char *rem_str = NULL;
 856
 857     option = qemu_opt_get(opts, "shift");
 858     if (!option) {
 859         if (qemu_opt_get(opts, "align") != NULL) {
 860             error_setg(errp, "Please specify shift option when using align");
 861         }
 862         return;
 863     }
 864
 865     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 866     if (icount_sleep) {
 867         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 868                                          icount_timer_cb, NULL);
 869     }
 870
 871     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 872
 873     if (icount_align_option && !icount_sleep) {
 874         error_setg(errp, "align=on and sleep=off are incompatible");
 875     }
 876     if (strcmp(option, "auto") != 0) {
 877         errno = 0;
 878         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 879         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 880             error_setg(errp, "icount: Invalid shift value");
 881         }
 882         use_icount = 1;
 883         return;
 884     } else if (icount_align_option) {
 885         error_setg(errp, "shift=auto and align=on are incompatible");
 886     } else if (!icount_sleep) {
 887         error_setg(errp, "shift=auto and sleep=off are incompatible");
 888     }
 889
 890     use_icount = 2;
 891
 892     /* 125MIPS seems a reasonable initial guess at the guest speed.
 893        It will be corrected fairly quickly anyway.  */
 894     timers_state.icount_time_shift = 3;
 895
 896     /* Have both realtime and virtual time triggers for speed adjustment.
 897        The realtime trigger catches emulated time passing too slowly,
 898        the virtual time trigger catches emulated time passing too fast.
 899        Realtime triggers occur even when idle, so use them less frequently
 900        than VM triggers.  */
 901     timers_state.vm_clock_warp_start = -1;
 902     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 903                                    icount_adjust_rt, NULL);
 904     timer_mod(timers_state.icount_rt_timer,
 905                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 906     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 907                                         icount_adjust_vm, NULL);
 908     timer_mod(timers_state.icount_vm_timer,
 909                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 910                    NANOSECONDS_PER_SECOND / 10);
 911 }
 912
 913 /***********************************************************/
 914 /* TCG vCPU kick timer
 915  *
 916  * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 920  *
 921  * The timer is removed if all vCPUs are idle and restarted again once
 922  * idleness is complete.
 923  */
 924
/* Timer whose callback is kick_tcg_thread(); created on demand by
 * start_tcg_kick_timer() and left NULL until then. */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU the round-robin loop is currently executing, or NULL when it is
 * not executing any; read and written through the atomic helpers. */
static CPUState *tcg_current_rr_cpu;

/* Delay between two kicks, in nanoseconds (one tenth of a second). */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 929
 930 static inline int64_t qemu_tcg_next_kick(void)
 931 {
 932     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 933 }
 934
 935 /* Kick the currently round-robin scheduled vCPU */
 936 static void qemu_cpu_kick_rr_cpu(void)
 937 {
 938     CPUState *cpu;
 939     do {
 940         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 941         if (cpu) {
 942             cpu_exit(cpu);
 943         }
 944     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 945 }
 946
/* Work item that intentionally does nothing; qemu_timer_notify_cb() queues
 * it with async_run_on_cpu() only so that the target vCPU has pending work. */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 950
 951 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 952 {
 953     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 954         qemu_notify_event();
 955         return;
 956     }
 957
 958     if (qemu_in_vcpu_thread()) {
 959         /* A CPU is currently running; kick it back out to the
 960          * tcg_cpu_exec() loop so it will recalculate its
 961          * icount deadline immediately.
 962          */
 963         qemu_cpu_kick(current_cpu);
 964     } else if (first_cpu) {
 965         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 966          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 967          * causes cpu_thread_is_idle to return false.  This way,
 968          * handle_icount_deadline can run.
 969          * If we have no CPUs at all for some reason, we don't
 970          * need to do anything.
 971          */
 972         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 973     }
 974 }
 975
 976 static void kick_tcg_thread(void *opaque)
 977 {
 978     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 979     qemu_cpu_kick_rr_cpu();
 980 }
 981
 982 static void start_tcg_kick_timer(void)
 983 {
 984     assert(!mttcg_enabled);
 985     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 986         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 987                                            kick_tcg_thread, NULL);
 988     }
 989     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 990         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 991     }
 992 }
 993
 994 static void stop_tcg_kick_timer(void)
 995 {
 996     assert(!mttcg_enabled);
 997     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 998         timer_del(tcg_kick_vcpu_timer);
 999     }
1000 }
1001
1002 /***********************************************************/
1003 void hw_error(const char *fmt, ...)
1004 {
1005     va_list ap;
1006     CPUState *cpu;
1007
1008     va_start(ap, fmt);
1009     fprintf(stderr, "qemu: hardware error: ");
1010     vfprintf(stderr, fmt, ap);
1011     fprintf(stderr, "\n");
1012     CPU_FOREACH(cpu) {
1013         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1014         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1015     }
1016     va_end(ap);
1017     abort();
1018 }
1019
1020 void cpu_synchronize_all_states(void)
1021 {
1022     CPUState *cpu;
1023
1024     CPU_FOREACH(cpu) {
1025         cpu_synchronize_state(cpu);
1026         /* TODO: move to cpu_synchronize_state() */
1027         if (hvf_enabled()) {
1028             hvf_cpu_synchronize_state(cpu);
1029         }
1030     }
1031 }
1032
1033 void cpu_synchronize_all_post_reset(void)
1034 {
1035     CPUState *cpu;
1036
1037     CPU_FOREACH(cpu) {
1038         cpu_synchronize_post_reset(cpu);
1039         /* TODO: move to cpu_synchronize_post_reset() */
1040         if (hvf_enabled()) {
1041             hvf_cpu_synchronize_post_reset(cpu);
1042         }
1043     }
1044 }
1045
1046 void cpu_synchronize_all_post_init(void)
1047 {
1048     CPUState *cpu;
1049
1050     CPU_FOREACH(cpu) {
1051         cpu_synchronize_post_init(cpu);
1052         /* TODO: move to cpu_synchronize_post_init() */
1053         if (hvf_enabled()) {
1054             hvf_cpu_synchronize_post_init(cpu);
1055         }
1056     }
1057 }
1058
1059 void cpu_synchronize_all_pre_loadvm(void)
1060 {
1061     CPUState *cpu;
1062
1063     CPU_FOREACH(cpu) {
1064         cpu_synchronize_pre_loadvm(cpu);
1065     }
1066 }
1067
1068 static int do_vm_stop(RunState state, bool send_stop)
1069 {
1070     int ret = 0;
1071
1072     if (runstate_is_running()) {
1073         cpu_disable_ticks();
1074         pause_all_vcpus();
1075         runstate_set(state);
1076         vm_state_notify(0, state);
1077         if (send_stop) {
1078             qapi_event_send_stop();
1079         }
1080     }
1081
1082     bdrv_drain_all();
1083     replay_disable_events();
1084     ret = bdrv_flush_all();
1085
1086     return ret;
1087 }
1088
/* Special vm_stop() variant for terminating the process.  Historically clients
 * did not expect a QMP STOP event and so we need to retain compatibility.
 *
 * Enters RUN_STATE_SHUTDOWN without sending the STOP event and returns
 * whatever do_vm_stop() returns.
 */
int vm_shutdown(void)
{
    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
}
1096
1097 static bool cpu_can_run(CPUState *cpu)
1098 {
1099     if (cpu->stop) {
1100         return false;
1101     }
1102     if (cpu_is_stopped(cpu)) {
1103         return false;
1104     }
1105     return true;
1106 }
1107
/* Handle an EXCP_DEBUG exit from a vCPU's execution loop: tell the gdbstub
 * which vCPU stopped, raise a system debug request, and mark @cpu stopped. */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1114
1115 #ifdef CONFIG_LINUX
1116 static void sigbus_reraise(void)
1117 {
1118     sigset_t set;
1119     struct sigaction action;
1120
1121     memset(&action, 0, sizeof(action));
1122     action.sa_handler = SIG_DFL;
1123     if (!sigaction(SIGBUS, &action, NULL)) {
1124         raise(SIGBUS);
1125         sigemptyset(&set);
1126         sigaddset(&set, SIGBUS);
1127         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1128     }
1129     perror("Failed to re-raise SIGBUS!\n");
1130     abort();
1131 }
1132
1133 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1134 {
1135     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1136         sigbus_reraise();
1137     }
1138
1139     if (current_cpu) {
1140         /* Called asynchronously in VCPU thread.  */
1141         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1142             sigbus_reraise();
1143         }
1144     } else {
1145         /* Called synchronously (via signalfd) in main thread.  */
1146         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1147             sigbus_reraise();
1148         }
1149     }
1150 }
1151
/*
 * Linux only: install sigbus_handler() as the SA_SIGINFO handler for
 * SIGBUS and set the PR_MCE_KILL policy to PR_MCE_KILL_EARLY.
 *
 * The results of sigaction() and prctl() are not checked.
 */
static void qemu_init_sigbus(void)
{
    struct sigaction action;

    /* Start from an all-zero action so unset fields have known values. */
    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
1163 #else /* !CONFIG_LINUX */
/* Without CONFIG_LINUX no SIGBUS handling is set up; this is a no-op. */
static void qemu_init_sigbus(void)
{
}
1167 #endif /* !CONFIG_LINUX */
1168
/* The global mutex taken and released by qemu_mutex_lock_iothread_impl() /
 * qemu_mutex_unlock_iothread(), and used with the condition variables
 * below and with the per-vCPU halt conditions. */
static QemuMutex qemu_global_mutex;

/* Identity of the thread that ran qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads when cpu->created changes */
static QemuCond qemu_cpu_cond;
/* system init: broadcast by qemu_cpu_stop(), waited on by pause_all_vcpus() */
static QemuCond qemu_pause_cond;
1177
1178 void qemu_init_cpu_loop(void)
1179 {
1180     qemu_init_sigbus();
1181     qemu_cond_init(&qemu_cpu_cond);
1182     qemu_cond_init(&qemu_pause_cond);
1183     qemu_mutex_init(&qemu_global_mutex);
1184
1185     qemu_thread_get_self(&io_thread);
1186 }
1187
/* Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing
 * the global mutex as the lock argument. */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1192
1193 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1194 {
1195     if (kvm_destroy_vcpu(cpu) < 0) {
1196         error_report("kvm_destroy_vcpu failed");
1197         exit(EXIT_FAILURE);
1198     }
1199 }
1200
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently has nothing to do. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1204
/*
 * Move @cpu into the stopped state: clear the pending stop request, set
 * the stopped flag and wake every waiter on qemu_pause_cond.
 *
 * Must be called on @cpu's own thread (asserted).
 *
 * @exit: when true, also call cpu_exit() on @cpu.  Note that this
 *        parameter name shadows the C library exit() inside this function.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1215
/*
 * Housekeeping shared by the wait-for-event paths: reset the kicked flag
 * (so qemu_cpu_kick_thread() will signal again), act on a pending stop
 * request, then run any work queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        /* false: do not additionally call cpu_exit() */
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1224
/*
 * Wait step of the round-robin TCG loop.
 *
 * While every vCPU thread is idle, keep the kick timer stopped and sleep
 * on first_cpu's halt condition (releasing qemu_global_mutex while
 * asleep).  Once there is something to do, re-arm the kick timer and run
 * the common per-vCPU housekeeping for every vCPU.
 */
static void qemu_tcg_rr_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}
1240
/*
 * Block on @cpu's halt condition for as long as its thread is idle
 * (qemu_global_mutex is released while waiting), then run the common
 * post-wait housekeeping for @cpu.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1255
/*
 * Thread function for a KVM vCPU.
 *
 * @arg: the CPUState this thread drives.
 *
 * Initialises the vCPU under the iothread lock, announces creation on
 * qemu_cpu_cond, then alternates between executing the guest and waiting
 * for events until the vCPU is unplugged and can no longer run.  On the
 * way out the vCPU is destroyed and creation is un-announced.
 * Always returns NULL; exits the process if kvm_init_vcpu() fails.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged and stopped: tear down and report that the vCPU is gone. */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1299
1300 static void *qemu_dummy_cpu_thread_fn(void *arg)
1301 {
1302 #ifdef _WIN32
1303     error_report("qtest is not supported under Windows");
1304     exit(1);
1305 #else
1306     CPUState *cpu = arg;
1307     sigset_t waitset;
1308     int r;
1309
1310     rcu_register_thread();
1311
1312     qemu_mutex_lock_iothread();
1313     qemu_thread_get_self(cpu->thread);
1314     cpu->thread_id = qemu_get_thread_id();
1315     cpu->can_do_io = 1;
1316     current_cpu = cpu;
1317
1318     sigemptyset(&waitset);
1319     sigaddset(&waitset, SIG_IPI);
1320
1321     /* signal CPU creation */
1322     cpu->created = true;
1323     qemu_cond_signal(&qemu_cpu_cond);
1324     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1325
1326     do {
1327         qemu_mutex_unlock_iothread();
1328         do {
1329             int sig;
1330             r = sigwait(&waitset, &sig);
1331         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1332         if (r == -1) {
1333             perror("sigwait");
1334             exit(1);
1335         }
1336         qemu_mutex_lock_iothread();
1337         qemu_wait_io_event(cpu);
1338     } while (!cpu->unplug);
1339
1340     qemu_mutex_unlock_iothread();
1341     rcu_unregister_thread();
1342     return NULL;
1343 #endif
1344 }
1345
1346 static int64_t tcg_get_icount_limit(void)
1347 {
1348     int64_t deadline;
1349
1350     if (replay_mode != REPLAY_MODE_PLAY) {
1351         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1352
1353         /* Maintain prior (possibly buggy) behaviour where if no deadline
1354          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1355          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1356          * nanoseconds.
1357          */
1358         if ((deadline < 0) || (deadline > INT32_MAX)) {
1359             deadline = INT32_MAX;
1360         }
1361
1362         return qemu_icount_round(deadline);
1363     } else {
1364         return replay_get_instructions();
1365     }
1366 }
1367
1368 static void handle_icount_deadline(void)
1369 {
1370     assert(qemu_in_vcpu_thread());
1371     if (use_icount) {
1372         int64_t deadline =
1373             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1374
1375         if (deadline == 0) {
1376             /* Wake up other AioContexts.  */
1377             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1378             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1379         }
1380     }
1381 }
1382
1383 static void prepare_icount_for_run(CPUState *cpu)
1384 {
1385     if (use_icount) {
1386         int insns_left;
1387
1388         /* These should always be cleared by process_icount_data after
1389          * each vCPU execution. However u16.high can be raised
1390          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1391          */
1392         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1393         g_assert(cpu->icount_extra == 0);
1394
1395         cpu->icount_budget = tcg_get_icount_limit();
1396         insns_left = MIN(0xffff, cpu->icount_budget);
1397         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1398         cpu->icount_extra = cpu->icount_budget - insns_left;
1399
1400         replay_mutex_lock();
1401     }
1402 }
1403
1404 static void process_icount_data(CPUState *cpu)
1405 {
1406     if (use_icount) {
1407         /* Account for executed instructions */
1408         cpu_update_icount(cpu);
1409
1410         /* Reset the counters */
1411         cpu_neg(cpu)->icount_decr.u16.low = 0;
1412         cpu->icount_extra = 0;
1413         cpu->icount_budget = 0;
1414
1415         replay_account_executed_instructions();
1416
1417         replay_mutex_unlock();
1418     }
1419 }
1420
1421
/*
 * Execute guest code on @cpu with TCG, bracketed by
 * cpu_exec_start()/cpu_exec_end().
 *
 * With CONFIG_PROFILER the time spent in here is added to the TCG
 * context's cpu_exec_time.
 *
 * Returns the value returned by cpu_exec().
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    atomic_set(&tcg_ctx->prof.cpu_exec_time,
               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
#endif
    return ret;
}
1442
1443 /* Destroy any remaining vCPUs which have been unplugged and have
1444  * finished running
1445  */
1446 static void deal_with_unplugged_cpus(void)
1447 {
1448     CPUState *cpu;
1449
1450     CPU_FOREACH(cpu) {
1451         if (cpu->unplug && !cpu_can_run(cpu)) {
1452             qemu_tcg_destroy_vcpu(cpu);
1453             cpu->created = false;
1454             qemu_cond_signal(&qemu_cpu_cond);
1455             break;
1456         }
1457     }
1458 }
1459
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 *
 * @arg is the CPUState the thread was created for; after start-up the
 * local "cpu" variable is reused as the round-robin cursor over all vCPUs.
 * The main loop has no exit, so the code after it is not reached.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* Drop the iothread lock before taking the replay mutex, then
         * retake it, so the two are always acquired in the same order. */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around once the cursor has run off the end of the list. */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            /* Publish the running vCPU so qemu_cpu_kick_rr_cpu() can find it. */
            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past a vCPU that is being unplugged so that the
                 * next pass starts from its successor. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    /* Not reached: the loop above never terminates. */
    rcu_unregister_thread();
    return NULL;
}
1582
/*
 * Thread function for a HAX vCPU.
 *
 * @arg: the CPUState this thread drives.
 *
 * Initialises the vCPU under the iothread lock, signals qemu_cpu_cond,
 * then alternates between executing the guest and waiting for events
 * until the vCPU is unplugged and can no longer run.  Returns NULL.
 *
 * NOTE(review): unlike the KVM, HVF and WHPX thread functions in this
 * file, the exit path here neither clears cpu->created nor releases the
 * iothread lock taken at the top -- confirm whether a HAX vCPU thread is
 * ever expected to leave the loop.
 */
static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();
    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));
    rcu_unregister_thread();
    return NULL;
}
1613
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg is the CPUState this thread drives.  The thread initialises the
 * vCPU under the iothread lock, announces creation on qemu_cpu_cond and
 * then alternates between executing the guest and waiting for events
 * until the vCPU is unplugged and can no longer run; it then destroys the
 * vCPU and un-announces it.  Always returns NULL. */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged and stopped: tear down and report that the vCPU is gone. */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1657
/*
 * Thread function for a WHPX vCPU.
 *
 * @arg: the CPUState this thread drives.
 *
 * Initialises the vCPU under the iothread lock, announces creation on
 * qemu_cpu_cond, then alternates between executing the guest and waiting
 * for events until the vCPU is unplugged and can no longer run; it then
 * destroys the vCPU and un-announces it.
 * Always returns NULL; exits the process if whpx_init_vcpu() fails.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        /* Same wait as qemu_wait_io_event(), written out inline here and
         * therefore without that function's _WIN32 SleepEx() step. */
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1701
1702 #ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it with QueueUserAPC()
 * on the target vCPU thread. */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1706 #endif
1707
/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread. The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 *
 * @arg is the CPUState this thread drives.  Not usable together with
 * icount (asserted).  The thread loops executing guest code and waiting
 * for events until the vCPU is unplugged and can no longer run, then
 * destroys it and un-announces it.  Always returns NULL.
 */

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            /* Guest code runs without the iothread lock held. */
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1779
1780 static void qemu_cpu_kick_thread(CPUState *cpu)
1781 {
1782 #ifndef _WIN32
1783     int err;
1784
1785     if (cpu->thread_kicked) {
1786         return;
1787     }
1788     cpu->thread_kicked = true;
1789     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1790     if (err && err != ESRCH) {
1791         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1792         exit(1);
1793     }
1794 #else /* _WIN32 */
1795     if (!qemu_cpu_is_self(cpu)) {
1796         if (whpx_enabled()) {
1797             whpx_vcpu_kick(cpu);
1798         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1799             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1800                     __func__, GetLastError());
1801             exit(1);
1802         }
1803     }
1804 #endif
1805 }
1806
1807 void qemu_cpu_kick(CPUState *cpu)
1808 {
1809     qemu_cond_broadcast(cpu->halt_cond);
1810     if (tcg_enabled()) {
1811         cpu_exit(cpu);
1812         /* NOP unless doing single-thread RR */
1813         qemu_cpu_kick_rr_cpu();
1814     } else {
1815         if (hax_enabled()) {
1816             /*
1817              * FIXME: race condition with the exit_request check in
1818              * hax_vcpu_hax_exec
1819              */
1820             cpu->exit_request = 1;
1821         }
1822         qemu_cpu_kick_thread(cpu);
1823     }
1824 }
1825
/* Kick the host thread of the vCPU bound to the calling thread.  Only
 * valid where current_cpu is set (asserted). */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1831
/* Return true when the calling thread is the thread recorded in
 * @cpu->thread. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1836
1837 bool qemu_in_vcpu_thread(void)
1838 {
1839     return current_cpu && qemu_cpu_is_self(current_cpu);
1840 }
1841
/* Per-thread flag: true between this thread's qemu_mutex_lock_iothread_impl()
 * and its matching qemu_mutex_unlock_iothread(). */
static __thread bool iothread_locked = false;

/* Report whether the calling thread has taken the iothread lock through
 * the lock/unlock helpers below. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1848
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * Acquire the global mutex on behalf of the call site at @file:@line and
 * record that this thread now holds it.  The calling thread must not
 * already hold it (asserted).
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    /* The lock function is read atomically from qemu_bql_mutex_lock_func. */
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1861
/* Release the global mutex.  The calling thread must currently hold it
 * according to its per-thread flag (asserted); the flag is cleared before
 * the mutex itself is released. */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1868
1869 static bool all_vcpus_paused(void)
1870 {
1871     CPUState *cpu;
1872
1873     CPU_FOREACH(cpu) {
1874         if (!cpu->stopped) {
1875             return false;
1876         }
1877     }
1878
1879     return true;
1880 }
1881
/*
 * Stop every vCPU and wait until all of them report being stopped.
 *
 * The virtual clock is disabled first.  A vCPU running on the calling
 * thread is stopped directly; every other vCPU gets a stop request and a
 * kick.  The caller's replay mutex is dropped while waiting and retaken
 * before returning.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        /* Kick everyone again after each wake-up. */
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* Release the iothread lock before retaking the replay mutex, then
     * take the iothread lock again. */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1912
/* Clear both the stop request and the stopped flag of @cpu, then kick it. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1919
1920 void resume_all_vcpus(void)
1921 {
1922     CPUState *cpu;
1923
1924     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1925     CPU_FOREACH(cpu) {
1926         cpu_resume(cpu);
1927     }
1928 }
1929
/*
 * Ask @cpu's thread to stop and unplug, then wait for that thread to
 * finish.  The iothread lock is released around the join and retaken
 * before returning, so the caller must hold it on entry.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1939
/* For temporary buffers for forming a name: size in bytes, including the
 * terminating NUL, of the buffers the vCPU thread names are formatted into */
#define VCPU_THREAD_NAME_SIZE 16
1942
/*
 * Set up the host thread for a TCG vCPU.
 *
 * With MTTCG every vCPU gets its own thread and halt condition.  Without
 * it, the first call creates the single round-robin thread and every
 * later vCPU shares that thread and its halt condition.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    /* Thread and halt condition of the shared round-robin thread,
     * remembered across calls. */
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        /* No new thread will announce this vCPU, so mark it created here. */
        cpu->created = true;
    }
}
1998
1999 static void qemu_hax_start_vcpu(CPUState *cpu)
2000 {
2001     char thread_name[VCPU_THREAD_NAME_SIZE];
2002
2003     cpu->thread = g_malloc0(sizeof(QemuThread));
2004     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2005     qemu_cond_init(cpu->halt_cond);
2006
2007     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2008              cpu->cpu_index);
2009     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2010                        cpu, QEMU_THREAD_JOINABLE);
2011 #ifdef _WIN32
2012     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2013 #endif
2014 }
2015
2016 static void qemu_kvm_start_vcpu(CPUState *cpu)
2017 {
2018     char thread_name[VCPU_THREAD_NAME_SIZE];
2019
2020     cpu->thread = g_malloc0(sizeof(QemuThread));
2021     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2022     qemu_cond_init(cpu->halt_cond);
2023     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2024              cpu->cpu_index);
2025     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2026                        cpu, QEMU_THREAD_JOINABLE);
2027 }
2028
2029 static void qemu_hvf_start_vcpu(CPUState *cpu)
2030 {
2031     char thread_name[VCPU_THREAD_NAME_SIZE];
2032
2033     /* HVF currently does not support TCG, and only runs in
2034      * unrestricted-guest mode. */
2035     assert(hvf_enabled());
2036
2037     cpu->thread = g_malloc0(sizeof(QemuThread));
2038     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2039     qemu_cond_init(cpu->halt_cond);
2040
2041     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2042              cpu->cpu_index);
2043     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2044                        cpu, QEMU_THREAD_JOINABLE);
2045 }
2046
2047 static void qemu_whpx_start_vcpu(CPUState *cpu)
2048 {
2049     char thread_name[VCPU_THREAD_NAME_SIZE];
2050
2051     cpu->thread = g_malloc0(sizeof(QemuThread));
2052     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2053     qemu_cond_init(cpu->halt_cond);
2054     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2055              cpu->cpu_index);
2056     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2057                        cpu, QEMU_THREAD_JOINABLE);
2058 #ifdef _WIN32
2059     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2060 #endif
2061 }
2062
2063 static void qemu_dummy_start_vcpu(CPUState *cpu)
2064 {
2065     char thread_name[VCPU_THREAD_NAME_SIZE];
2066
2067     cpu->thread = g_malloc0(sizeof(QemuThread));
2068     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2069     qemu_cond_init(cpu->halt_cond);
2070     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2071              cpu->cpu_index);
2072     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2073                        QEMU_THREAD_JOINABLE);
2074 }
2075
2076 void qemu_init_vcpu(CPUState *cpu)
2077 {
2078     cpu->nr_cores = smp_cores;
2079     cpu->nr_threads = smp_threads;
2080     cpu->stopped = true;
2081     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2082
2083     if (!cpu->as) {
2084         /* If the target cpu hasn't set up any address spaces itself,
2085          * give it the default one.
2086          */
2087         cpu->num_ases = 1;
2088         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2089     }
2090
2091     if (kvm_enabled()) {
2092         qemu_kvm_start_vcpu(cpu);
2093     } else if (hax_enabled()) {
2094         qemu_hax_start_vcpu(cpu);
2095     } else if (hvf_enabled()) {
2096         qemu_hvf_start_vcpu(cpu);
2097     } else if (tcg_enabled()) {
2098         qemu_tcg_init_vcpu(cpu);
2099     } else if (whpx_enabled()) {
2100         qemu_whpx_start_vcpu(cpu);
2101     } else {
2102         qemu_dummy_start_vcpu(cpu);
2103     }
2104
2105     while (!cpu->created) {
2106         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2107     }
2108 }
2109
2110 void cpu_stop_current(void)
2111 {
2112     if (current_cpu) {
2113         current_cpu->stop = true;
2114         cpu_exit(current_cpu);
2115     }
2116 }
2117
2118 int vm_stop(RunState state)
2119 {
2120     if (qemu_in_vcpu_thread()) {
2121         qemu_system_vmstop_request_prepare();
2122         qemu_system_vmstop_request(state);
2123         /*
2124          * FIXME: should not return to device code in case
2125          * vm_stop() has been requested.
2126          */
2127         cpu_stop_current();
2128         return 0;
2129     }
2130
2131     return do_vm_stop(state, true);
2132 }
2133
2134 /**
2135  * Prepare for (re)starting the VM.
2136  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2137  * running or in case of an error condition), 0 otherwise.
2138  */
2139 int vm_prepare_start(void)
2140 {
2141     RunState requested;
2142
2143     qemu_vmstop_requested(&requested);
2144     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2145         return -1;
2146     }
2147
2148     /* Ensure that a STOP/RESUME pair of events is emitted if a
2149      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2150      * example, according to documentation is always followed by
2151      * the STOP event.
2152      */
2153     if (runstate_is_running()) {
2154         qapi_event_send_stop();
2155         qapi_event_send_resume();
2156         return -1;
2157     }
2158
2159     /* We are sending this now, but the CPUs will be resumed shortly later */
2160     qapi_event_send_resume();
2161
2162     replay_enable_events();
2163     cpu_enable_ticks();
2164     runstate_set(RUN_STATE_RUNNING);
2165     vm_state_notify(1, RUN_STATE_RUNNING);
2166     return 0;
2167 }
2168
/*
 * Start (or restart) the VM: resume all vCPUs, but only when
 * vm_prepare_start() returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2175
2176 /* does a state transition even if the VM is already stopped,
2177    current state is forgotten forever */
2178 int vm_stop_force_state(RunState state)
2179 {
2180     if (runstate_is_running()) {
2181         return vm_stop(state);
2182     } else {
2183         runstate_set(state);
2184
2185         bdrv_drain_all();
2186         /* Make sure to return an error if the flush in a previous vm_stop()
2187          * failed. */
2188         return bdrv_flush_all();
2189     }
2190 }
2191
/*
 * Print the list of CPU models through the target's cpu_list macro, when
 * the target defines one.  @optarg is not used.
 */
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#ifdef cpu_list
    cpu_list();
#endif
}
2199
2200 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2201                  bool has_cpu, int64_t cpu_index, Error **errp)
2202 {
2203     FILE *f;
2204     uint32_t l;
2205     CPUState *cpu;
2206     uint8_t buf[1024];
2207     int64_t orig_addr = addr, orig_size = size;
2208
2209     if (!has_cpu) {
2210         cpu_index = 0;
2211     }
2212
2213     cpu = qemu_get_cpu(cpu_index);
2214     if (cpu == NULL) {
2215         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2216                    "a CPU number");
2217         return;
2218     }
2219
2220     f = fopen(filename, "wb");
2221     if (!f) {
2222         error_setg_file_open(errp, errno, filename);
2223         return;
2224     }
2225
2226     while (size != 0) {
2227         l = sizeof(buf);
2228         if (l > size)
2229             l = size;
2230         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2231             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2232                              " specified", orig_addr, orig_size);
2233             goto exit;
2234         }
2235         if (fwrite(buf, 1, l, f) != l) {
2236             error_setg(errp, QERR_IO_ERROR);
2237             goto exit;
2238         }
2239         addr += l;
2240         size -= l;
2241     }
2242
2243 exit:
2244     fclose(f);
2245 }
2246
2247 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2248                   Error **errp)
2249 {
2250     FILE *f;
2251     uint32_t l;
2252     uint8_t buf[1024];
2253
2254     f = fopen(filename, "wb");
2255     if (!f) {
2256         error_setg_file_open(errp, errno, filename);
2257         return;
2258     }
2259
2260     while (size != 0) {
2261         l = sizeof(buf);
2262         if (l > size)
2263             l = size;
2264         cpu_physical_memory_read(addr, buf, l);
2265         if (fwrite(buf, 1, l, f) != l) {
2266             error_setg(errp, QERR_IO_ERROR);
2267             goto exit;
2268         }
2269         addr += l;
2270         size -= l;
2271     }
2272
2273 exit:
2274     fclose(f);
2275 }
2276
2277 void qmp_inject_nmi(Error **errp)
2278 {
2279     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2280 }
2281
2282 void dump_drift_info(void)
2283 {
2284     if (!use_icount) {
2285         return;
2286     }
2287
2288     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2289                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2290     if (icount_align_option) {
2291         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2292                     -max_delay / SCALE_MS);
2293         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2294                     max_advance / SCALE_MS);
2295     } else {
2296         qemu_printf("Max guest delay     NA\n");
2297         qemu_printf("Max guest advance   NA\n");
2298     }
2299 }