cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu-common.h"
  27 #include "qemu/config-file.h"
  28 #include "migration/vmstate.h"
  29 #include "monitor/monitor.h"
  30 #include "qapi/error.h"
  31 #include "qapi/qapi-commands-misc.h"
  32 #include "qapi/qapi-events-run-state.h"
  33 #include "qapi/qmp/qerror.h"
  34 #include "qemu/error-report.h"
  35 #include "qemu/qemu-print.h"
  36 #include "sysemu/tcg.h"
  37 #include "sysemu/block-backend.h"
  38 #include "exec/gdbstub.h"
  39 #include "sysemu/dma.h"
  40 #include "sysemu/hw_accel.h"
  41 #include "sysemu/kvm.h"
  42 #include "sysemu/hax.h"
  43 #include "sysemu/hvf.h"
  44 #include "sysemu/whpx.h"
  45 #include "exec/exec-all.h"
  46
  47 #include "qemu/thread.h"
  48 #include "qemu/plugin.h"
  49 #include "sysemu/cpus.h"
  50 #include "sysemu/qtest.h"
  51 #include "qemu/main-loop.h"
  52 #include "qemu/option.h"
  53 #include "qemu/bitmap.h"
  54 #include "qemu/seqlock.h"
  55 #include "qemu/guest-random.h"
  56 #include "tcg.h"
  57 #include "hw/nmi.h"
  58 #include "sysemu/replay.h"
  59 #include "sysemu/runstate.h"
  60 #include "hw/boards.h"
  61 #include "hw/hw.h"
  62
  63 #ifdef CONFIG_LINUX
  64
  65 #include <sys/prctl.h>
  66
  67 #ifndef PR_MCE_KILL
  68 #define PR_MCE_KILL 33
  69 #endif
  70
  71 #ifndef PR_MCE_KILL_SET
  72 #define PR_MCE_KILL_SET 1
  73 #endif
  74
  75 #ifndef PR_MCE_KILL_EARLY
  76 #define PR_MCE_KILL_EARLY 1
  77 #endif
  78
  79 #endif /* CONFIG_LINUX */
  80
/*
 * Global mutex. Within this part of the file it is the mutex handed to
 * qemu_cond_timedwait() by cpu_throttle_thread().
 */
static QemuMutex qemu_global_mutex;

/* NOTE(review): not referenced in this part of the file; presumably icount
 * alignment statistics -- confirm against the users of these globals.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer that periodically runs cpu_throttle_timer_tick(). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage; 0 means throttling is off.  Accessed with
 * atomic_read()/atomic_set().
 */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base throttling timeslice in nanoseconds (10 ms). */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  93
  94 bool cpu_is_stopped(CPUState *cpu)
  95 {
  96     return cpu->stopped || !runstate_is_running();
  97 }
  98
  99 static bool cpu_thread_is_idle(CPUState *cpu)
 100 {
 101     if (cpu->stop || cpu->queued_work_first) {
 102         return false;
 103     }
 104     if (cpu_is_stopped(cpu)) {
 105         return true;
 106     }
 107     if (!cpu->halted || cpu_has_work(cpu) ||
 108         kvm_halt_in_kernel()) {
 109         return false;
 110     }
 111     return true;
 112 }
 113
 114 static bool all_cpu_threads_idle(void)
 115 {
 116     CPUState *cpu;
 117
 118     CPU_FOREACH(cpu) {
 119         if (!cpu_thread_is_idle(cpu)) {
 120             return false;
 121         }
 122     }
 123     return true;
 124 }
 125
/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

/*
 * Value of the icount "sleep" option (see configure_icount()); defaults to
 * true.  When false, qemu_start_warp_timer() advances the virtual clock
 * straight to the next deadline instead of arming the warp timer.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Upper bound that icount_adjust() enforces on icount_time_shift. */
#define MAX_ICOUNT_SHIFT 10
 134
/*
 * State behind the guest-visible clocks handled in this file: the host-tick
 * based cycle counter, the monotonic VM clock and, in icount mode, the
 * instruction counter together with its adjustment/warp timers.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value returned by cpu_get_ticks_locked(); keeps ticks monotonic. */
    int64_t cpu_ticks_prev;
    /* Offset added to cpu_get_host_ticks() to form the guest tick count. */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /* Start time of a pending clock warp; -1 means no warp is pending. */
    int64_t vm_clock_warp_start;
    /* Offset added to get_clock() to form the VM clock. */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;
 165
/* The single instance of the clock state used throughout this file. */
static TimersState timers_state;
/* True when multi-threaded TCG is selected; set by qemu_tcg_configure(). */
bool mttcg_enabled;
 168
 169 /*
 170  * We default to false if we know other options have been enabled
 171  * which are currently incompatible with MTTCG. Otherwise when each
 172  * guest (target) has been updated to support:
 173  *   - atomic instructions
 174  *   - memory ordering primitives (barriers)
 175  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 176  *
 177  * Once a guest architecture has been converted to the new primitives
 178  * there are two remaining limitations to check.
 179  *
 180  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 181  * - The host must have a stronger memory order than the guest
 182  *
 183  * It may be possible in future to support strong guests on weak hosts
 184  * but that will require tagging all load/stores in a guest with their
 185  * implicit memory order requirements which would likely slow things
 186  * down a lot.
 187  */
 188
 189 static bool check_tcg_memory_orders_compatible(void)
 190 {
 191 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 192     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 193 #else
 194     return false;
 195 #endif
 196 }
 197
 198 static bool default_mttcg_enabled(void)
 199 {
 200     if (use_icount || TCG_OVERSIZED_GUEST) {
 201         return false;
 202     } else {
 203 #ifdef TARGET_SUPPORTS_MTTCG
 204         return check_tcg_memory_orders_compatible();
 205 #else
 206         return false;
 207 #endif
 208     }
 209 }
 210
 211 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 212 {
 213     const char *t = qemu_opt_get(opts, "thread");
 214     if (t) {
 215         if (strcmp(t, "multi") == 0) {
 216             if (TCG_OVERSIZED_GUEST) {
 217                 error_setg(errp, "No MTTCG when guest word size > hosts");
 218             } else if (use_icount) {
 219                 error_setg(errp, "No MTTCG when icount is enabled");
 220             } else {
 221 #ifndef TARGET_SUPPORTS_MTTCG
 222                 warn_report("Guest not yet converted to MTTCG - "
 223                             "you may get unexpected results");
 224 #endif
 225                 if (!check_tcg_memory_orders_compatible()) {
 226                     warn_report("Guest expects a stronger memory ordering "
 227                                 "than the host provides");
 228                     error_printf("This may cause strange/hard to debug errors\n");
 229                 }
 230                 mttcg_enabled = true;
 231             }
 232         } else if (strcmp(t, "single") == 0) {
 233             mttcg_enabled = false;
 234         } else {
 235             error_setg(errp, "Invalid 'thread' setting %s", t);
 236         }
 237     } else {
 238         mttcg_enabled = default_mttcg_enabled();
 239     }
 240 }
 241
 242 /* The current number of executed instructions is based on what we
 243  * originally budgeted minus the current state of the decrementing
 244  * icount counters in extra/u16.low.
 245  */
 246 static int64_t cpu_get_icount_executed(CPUState *cpu)
 247 {
 248     return (cpu->icount_budget -
 249             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 250 }
 251
 252 /*
 253  * Update the global shared timer_state.qemu_icount to take into
 254  * account executed instructions. This is done by the TCG vCPU
 255  * thread so the main-loop can see time has moved forward.
 256  */
 257 static void cpu_update_icount_locked(CPUState *cpu)
 258 {
 259     int64_t executed = cpu_get_icount_executed(cpu);
 260     cpu->icount_budget -= executed;
 261
 262     atomic_set_i64(&timers_state.qemu_icount,
 263                    timers_state.qemu_icount + executed);
 264 }
 265
 266 /*
 267  * Update the global shared timer_state.qemu_icount to take into
 268  * account executed instructions. This is done by the TCG vCPU
 269  * thread so the main-loop can see time has moved forward.
 270  */
 271 void cpu_update_icount(CPUState *cpu)
 272 {
 273     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 274                        &timers_state.vm_clock_lock);
 275     cpu_update_icount_locked(cpu);
 276     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 277                          &timers_state.vm_clock_lock);
 278 }
 279
 280 static int64_t cpu_get_icount_raw_locked(void)
 281 {
 282     CPUState *cpu = current_cpu;
 283
 284     if (cpu && cpu->running) {
 285         if (!cpu->can_do_io) {
 286             error_report("Bad icount read");
 287             exit(1);
 288         }
 289         /* Take into account what has run */
 290         cpu_update_icount_locked(cpu);
 291     }
 292     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 293     return atomic_read_i64(&timers_state.qemu_icount);
 294 }
 295
 296 static int64_t cpu_get_icount_locked(void)
 297 {
 298     int64_t icount = cpu_get_icount_raw_locked();
 299     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 300         cpu_icount_to_ns(icount);
 301 }
 302
 303 int64_t cpu_get_icount_raw(void)
 304 {
 305     int64_t icount;
 306     unsigned start;
 307
 308     do {
 309         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 310         icount = cpu_get_icount_raw_locked();
 311     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 312
 313     return icount;
 314 }
 315
 316 /* Return the virtual CPU time, based on the instruction counter.  */
 317 int64_t cpu_get_icount(void)
 318 {
 319     int64_t icount;
 320     unsigned start;
 321
 322     do {
 323         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 324         icount = cpu_get_icount_locked();
 325     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 326
 327     return icount;
 328 }
 329
 330 int64_t cpu_icount_to_ns(int64_t icount)
 331 {
 332     return icount << atomic_read(&timers_state.icount_time_shift);
 333 }
 334
 335 static int64_t cpu_get_ticks_locked(void)
 336 {
 337     int64_t ticks = timers_state.cpu_ticks_offset;
 338     if (timers_state.cpu_ticks_enabled) {
 339         ticks += cpu_get_host_ticks();
 340     }
 341
 342     if (timers_state.cpu_ticks_prev > ticks) {
 343         /* Non increasing ticks may happen if the host uses software suspend.  */
 344         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 345         ticks = timers_state.cpu_ticks_prev;
 346     }
 347
 348     timers_state.cpu_ticks_prev = ticks;
 349     return ticks;
 350 }
 351
 352 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 353  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 354  * counter.
 355  */
 356 int64_t cpu_get_ticks(void)
 357 {
 358     int64_t ticks;
 359
 360     if (use_icount) {
 361         return cpu_get_icount();
 362     }
 363
 364     qemu_spin_lock(&timers_state.vm_clock_lock);
 365     ticks = cpu_get_ticks_locked();
 366     qemu_spin_unlock(&timers_state.vm_clock_lock);
 367     return ticks;
 368 }
 369
 370 static int64_t cpu_get_clock_locked(void)
 371 {
 372     int64_t time;
 373
 374     time = timers_state.cpu_clock_offset;
 375     if (timers_state.cpu_ticks_enabled) {
 376         time += get_clock();
 377     }
 378
 379     return time;
 380 }
 381
 382 /* Return the monotonic time elapsed in VM, i.e.,
 383  * the time between vm_start and vm_stop
 384  */
 385 int64_t cpu_get_clock(void)
 386 {
 387     int64_t ti;
 388     unsigned start;
 389
 390     do {
 391         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 392         ti = cpu_get_clock_locked();
 393     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 394
 395     return ti;
 396 }
 397
 398 /* enable cpu_get_ticks()
 399  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 400  */
 401 void cpu_enable_ticks(void)
 402 {
 403     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 404                        &timers_state.vm_clock_lock);
 405     if (!timers_state.cpu_ticks_enabled) {
 406         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 407         timers_state.cpu_clock_offset -= get_clock();
 408         timers_state.cpu_ticks_enabled = 1;
 409     }
 410     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 411                        &timers_state.vm_clock_lock);
 412 }
 413
 414 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 415  * cpu_get_ticks() after that.
 416  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 417  */
 418 void cpu_disable_ticks(void)
 419 {
 420     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 421                        &timers_state.vm_clock_lock);
 422     if (timers_state.cpu_ticks_enabled) {
 423         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 424         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 425         timers_state.cpu_ticks_enabled = 0;
 426     }
 427     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 428                          &timers_state.vm_clock_lock);
 429 }
 430
 431 /* Correlation between real and virtual time is always going to be
 432    fairly approximate, so ignore small variation.
 433    When the guest is idle real and virtual time will be aligned in
 434    the IO wait loop.  */
 435 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 436
 437 static void icount_adjust(void)
 438 {
 439     int64_t cur_time;
 440     int64_t cur_icount;
 441     int64_t delta;
 442
 443     /* Protected by TimersState mutex.  */
 444     static int64_t last_delta;
 445
 446     /* If the VM is not running, then do nothing.  */
 447     if (!runstate_is_running()) {
 448         return;
 449     }
 450
 451     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 452                        &timers_state.vm_clock_lock);
 453     cur_time = cpu_get_clock_locked();
 454     cur_icount = cpu_get_icount_locked();
 455
 456     delta = cur_icount - cur_time;
 457     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 458     if (delta > 0
 459         && last_delta + ICOUNT_WOBBLE < delta * 2
 460         && timers_state.icount_time_shift > 0) {
 461         /* The guest is getting too far ahead.  Slow time down.  */
 462         atomic_set(&timers_state.icount_time_shift,
 463                    timers_state.icount_time_shift - 1);
 464     }
 465     if (delta < 0
 466         && last_delta - ICOUNT_WOBBLE > delta * 2
 467         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 468         /* The guest is getting too far behind.  Speed time up.  */
 469         atomic_set(&timers_state.icount_time_shift,
 470                    timers_state.icount_time_shift + 1);
 471     }
 472     last_delta = delta;
 473     atomic_set_i64(&timers_state.qemu_icount_bias,
 474                    cur_icount - (timers_state.qemu_icount
 475                                  << timers_state.icount_time_shift));
 476     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 477                          &timers_state.vm_clock_lock);
 478 }
 479
 480 static void icount_adjust_rt(void *opaque)
 481 {
 482     timer_mod(timers_state.icount_rt_timer,
 483               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 484     icount_adjust();
 485 }
 486
 487 static void icount_adjust_vm(void *opaque)
 488 {
 489     timer_mod(timers_state.icount_vm_timer,
 490                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 491                    NANOSECONDS_PER_SECOND / 10);
 492     icount_adjust();
 493 }
 494
 495 static int64_t qemu_icount_round(int64_t count)
 496 {
 497     int shift = atomic_read(&timers_state.icount_time_shift);
 498     return (count + (1 << shift) - 1) >> shift;
 499 }
 500
 501 static void icount_warp_rt(void)
 502 {
 503     unsigned seq;
 504     int64_t warp_start;
 505
 506     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 507      * changes from -1 to another value, so the race here is okay.
 508      */
 509     do {
 510         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 511         warp_start = timers_state.vm_clock_warp_start;
 512     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 513
 514     if (warp_start == -1) {
 515         return;
 516     }
 517
 518     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 519                        &timers_state.vm_clock_lock);
 520     if (runstate_is_running()) {
 521         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 522                                             cpu_get_clock_locked());
 523         int64_t warp_delta;
 524
 525         warp_delta = clock - timers_state.vm_clock_warp_start;
 526         if (use_icount == 2) {
 527             /*
 528              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 529              * far ahead of real time.
 530              */
 531             int64_t cur_icount = cpu_get_icount_locked();
 532             int64_t delta = clock - cur_icount;
 533             warp_delta = MIN(warp_delta, delta);
 534         }
 535         atomic_set_i64(&timers_state.qemu_icount_bias,
 536                        timers_state.qemu_icount_bias + warp_delta);
 537     }
 538     timers_state.vm_clock_warp_start = -1;
 539     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 540                        &timers_state.vm_clock_lock);
 541
 542     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 543         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 544     }
 545 }
 546
 547 static void icount_timer_cb(void *opaque)
 548 {
 549     /* No need for a checkpoint because the timer already synchronizes
 550      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 551      */
 552     icount_warp_rt();
 553 }
 554
 555 void qtest_clock_warp(int64_t dest)
 556 {
 557     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 558     AioContext *aio_context;
 559     assert(qtest_enabled());
 560     aio_context = qemu_get_aio_context();
 561     while (clock < dest) {
 562         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 563                                                       QEMU_TIMER_ATTR_ALL);
 564         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 565
 566         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 567                            &timers_state.vm_clock_lock);
 568         atomic_set_i64(&timers_state.qemu_icount_bias,
 569                        timers_state.qemu_icount_bias + warp);
 570         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 571                              &timers_state.vm_clock_lock);
 572
 573         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 574         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 575         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 576     }
 577     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 578 }
 579
 580 void qemu_start_warp_timer(void)
 581 {
 582     int64_t clock;
 583     int64_t deadline;
 584
 585     if (!use_icount) {
 586         return;
 587     }
 588
 589     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 590      * do not fire, so computing the deadline does not make sense.
 591      */
 592     if (!runstate_is_running()) {
 593         return;
 594     }
 595
 596     if (replay_mode != REPLAY_MODE_PLAY) {
 597         if (!all_cpu_threads_idle()) {
 598             return;
 599         }
 600
 601         if (qtest_enabled()) {
 602             /* When testing, qtest commands advance icount.  */
 603             return;
 604         }
 605
 606         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 607     } else {
 608         /* warp clock deterministically in record/replay mode */
 609         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 610             /* vCPU is sleeping and warp can't be started.
 611                It is probably a race condition: notification sent
 612                to vCPU was processed in advance and vCPU went to sleep.
 613                Therefore we have to wake it up for doing someting. */
 614             if (replay_has_checkpoint()) {
 615                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 616             }
 617             return;
 618         }
 619     }
 620
 621     /* We want to use the earliest deadline from ALL vm_clocks */
 622     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 623     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 624                                           ~QEMU_TIMER_ATTR_EXTERNAL);
 625     if (deadline < 0) {
 626         static bool notified;
 627         if (!icount_sleep && !notified) {
 628             warn_report("icount sleep disabled and no active timers");
 629             notified = true;
 630         }
 631         return;
 632     }
 633
 634     if (deadline > 0) {
 635         /*
 636          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 637          * sleep.  Otherwise, the CPU might be waiting for a future timer
 638          * interrupt to wake it up, but the interrupt never comes because
 639          * the vCPU isn't running any insns and thus doesn't advance the
 640          * QEMU_CLOCK_VIRTUAL.
 641          */
 642         if (!icount_sleep) {
 643             /*
 644              * We never let VCPUs sleep in no sleep icount mode.
 645              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 646              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 647              * It is useful when we want a deterministic execution time,
 648              * isolated from host latencies.
 649              */
 650             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 651                                &timers_state.vm_clock_lock);
 652             atomic_set_i64(&timers_state.qemu_icount_bias,
 653                            timers_state.qemu_icount_bias + deadline);
 654             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 655                                  &timers_state.vm_clock_lock);
 656             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 657         } else {
 658             /*
 659              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 660              * "real" time, (related to the time left until the next event) has
 661              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 662              * This avoids that the warps are visible externally; for example,
 663              * you will not be sending network packets continuously instead of
 664              * every 100ms.
 665              */
 666             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 667                                &timers_state.vm_clock_lock);
 668             if (timers_state.vm_clock_warp_start == -1
 669                 || timers_state.vm_clock_warp_start > clock) {
 670                 timers_state.vm_clock_warp_start = clock;
 671             }
 672             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 673                                  &timers_state.vm_clock_lock);
 674             timer_mod_anticipate(timers_state.icount_warp_timer,
 675                                  clock + deadline);
 676         }
 677     } else if (deadline == 0) {
 678         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 679     }
 680 }
 681
 682 static void qemu_account_warp_timer(void)
 683 {
 684     if (!use_icount || !icount_sleep) {
 685         return;
 686     }
 687
 688     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 689      * do not fire, so computing the deadline does not make sense.
 690      */
 691     if (!runstate_is_running()) {
 692         return;
 693     }
 694
 695     /* warp clock deterministically in record/replay mode */
 696     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 697         return;
 698     }
 699
 700     timer_del(timers_state.icount_warp_timer);
 701     icount_warp_rt();
 702 }
 703
 704 static bool icount_state_needed(void *opaque)
 705 {
 706     return use_icount;
 707 }
 708
 709 static bool warp_timer_state_needed(void *opaque)
 710 {
 711     TimersState *s = opaque;
 712     return s->icount_warp_timer != NULL;
 713 }
 714
 715 static bool adjust_timers_state_needed(void *opaque)
 716 {
 717     TimersState *s = opaque;
 718     return s->icount_rt_timer != NULL;
 719 }
 720
/*
 * Subsection for warp timer migration.  It is optional because the warp
 * timer may not have been created (see warp_timer_state_needed()).
 * Carries the pending warp start time and the warp timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 735
/*
 * Subsection for the two icount adjustment timers.  Only present when
 * adjust_timers_state_needed() reports that icount_rt_timer exists.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 747
/*
 * This is a subsection for icount migration.  It is only present when
 * icount is in use (icount_state_needed()) and carries the icount bias and
 * instruction counter, plus the optional warp-timer and adjust-timers
 * subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
 767
/*
 * Top-level migration description for TimersState, registered by
 * cpu_ticks_init().  cpu_clock_offset is only present from version 2 on;
 * icount state travels in the "timer/icount" subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* 8 bytes kept in the stream but no longer backed by a field. */
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 783
 784 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 785 {
 786     double pct;
 787     double throttle_ratio;
 788     int64_t sleeptime_ns, endtime_ns;
 789
 790     if (!cpu_throttle_get_percentage()) {
 791         return;
 792     }
 793
 794     pct = (double)cpu_throttle_get_percentage()/100;
 795     throttle_ratio = pct / (1 - pct);
 796     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 797     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 798     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 799     while (sleeptime_ns > 0 && !cpu->stop) {
 800         if (sleeptime_ns > SCALE_MS) {
 801             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 802                                 sleeptime_ns / SCALE_MS);
 803         } else {
 804             qemu_mutex_unlock_iothread();
 805             g_usleep(sleeptime_ns / SCALE_US);
 806             qemu_mutex_lock_iothread();
 807         }
 808         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 809     }
 810     atomic_set(&cpu->throttle_thread_scheduled, 0);
 811 }
 812
 813 static void cpu_throttle_timer_tick(void *opaque)
 814 {
 815     CPUState *cpu;
 816     double pct;
 817
 818     /* Stop the timer if needed */
 819     if (!cpu_throttle_get_percentage()) {
 820         return;
 821     }
 822     CPU_FOREACH(cpu) {
 823         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 824             async_run_on_cpu(cpu, cpu_throttle_thread,
 825                              RUN_ON_CPU_NULL);
 826         }
 827     }
 828
 829     pct = (double)cpu_throttle_get_percentage()/100;
 830     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 831                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 832 }
 833
 834 void cpu_throttle_set(int new_throttle_pct)
 835 {
 836     /* Ensure throttle percentage is within valid range */
 837     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 838     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 839
 840     atomic_set(&throttle_percentage, new_throttle_pct);
 841
 842     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 843                                        CPU_THROTTLE_TIMESLICE_NS);
 844 }
 845
 846 void cpu_throttle_stop(void)
 847 {
 848     atomic_set(&throttle_percentage, 0);
 849 }
 850
 851 bool cpu_throttle_active(void)
 852 {
 853     return (cpu_throttle_get_percentage() != 0);
 854 }
 855
 856 int cpu_throttle_get_percentage(void)
 857 {
 858     return atomic_read(&throttle_percentage);
 859 }
 860
 861 void cpu_ticks_init(void)
 862 {
 863     seqlock_init(&timers_state.vm_clock_seqlock);
 864     qemu_spin_init(&timers_state.vm_clock_lock);
 865     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 866     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 867                                            cpu_throttle_timer_tick, NULL);
 868 }
 869
 870 void configure_icount(QemuOpts *opts, Error **errp)
 871 {
 872     const char *option;
 873     char *rem_str = NULL;
 874
 875     option = qemu_opt_get(opts, "shift");
 876     if (!option) {
 877         if (qemu_opt_get(opts, "align") != NULL) {
 878             error_setg(errp, "Please specify shift option when using align");
 879         }
 880         return;
 881     }
 882
 883     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 884     if (icount_sleep) {
 885         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 886                                          icount_timer_cb, NULL);
 887     }
 888
 889     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 890
 891     if (icount_align_option && !icount_sleep) {
 892         error_setg(errp, "align=on and sleep=off are incompatible");
 893     }
 894     if (strcmp(option, "auto") != 0) {
 895         errno = 0;
 896         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 897         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 898             error_setg(errp, "icount: Invalid shift value");
 899         }
 900         use_icount = 1;
 901         return;
 902     } else if (icount_align_option) {
 903         error_setg(errp, "shift=auto and align=on are incompatible");
 904     } else if (!icount_sleep) {
 905         error_setg(errp, "shift=auto and sleep=off are incompatible");
 906     }
 907
 908     use_icount = 2;
 909
 910     /* 125MIPS seems a reasonable initial guess at the guest speed.
 911        It will be corrected fairly quickly anyway.  */
 912     timers_state.icount_time_shift = 3;
 913
 914     /* Have both realtime and virtual time triggers for speed adjustment.
 915        The realtime trigger catches emulated time passing too slowly,
 916        the virtual time trigger catches emulated time passing too fast.
 917        Realtime triggers occur even when idle, so use them less frequently
 918        than VM triggers.  */
 919     timers_state.vm_clock_warp_start = -1;
 920     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 921                                    icount_adjust_rt, NULL);
 922     timer_mod(timers_state.icount_rt_timer,
 923                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 924     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 925                                         icount_adjust_vm, NULL);
 926     timer_mod(timers_state.icount_vm_timer,
 927                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 928                    NANOSECONDS_PER_SECOND / 10);
 929 }
 930
 931 /***********************************************************/
 932 /* TCG vCPU kick timer
 933  *
 934  * The kick timer is responsible for moving single threaded vCPU
 935  * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 937  * scheduled.
 938  *
 939  * The timer is removed if all vCPUs are idle and restarted again once
 940  * idleness is complete.
 941  */
 942
/* Kick timer; created lazily by start_tcg_kick_timer(), NULL until then. */
static QEMUTimer *tcg_kick_vcpu_timer;
/*
 * vCPU currently being executed by the round-robin thread, or NULL.
 * Accessed with the atomic_* helpers.
 */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks, in nanoseconds: one tenth of a second. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 947
 948 static inline int64_t qemu_tcg_next_kick(void)
 949 {
 950     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 951 }
 952
 953 /* Kick the currently round-robin scheduled vCPU to next */
 954 static void qemu_cpu_kick_rr_next_cpu(void)
 955 {
 956     CPUState *cpu;
 957     do {
 958         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 959         if (cpu) {
 960             cpu_exit(cpu);
 961         }
 962     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 963 }
 964
 965 /* Kick all RR vCPUs */
 966 static void qemu_cpu_kick_rr_cpus(void)
 967 {
 968     CPUState *cpu;
 969
 970     CPU_FOREACH(cpu) {
 971         cpu_exit(cpu);
 972     };
 973 }
 974
/*
 * Empty work item.  qemu_timer_notify_cb() queues it with
 * async_run_on_cpu() purely for the side effect of having work pending
 * on the target vCPU.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 978
 979 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 980 {
 981     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 982         qemu_notify_event();
 983         return;
 984     }
 985
 986     if (qemu_in_vcpu_thread()) {
 987         /* A CPU is currently running; kick it back out to the
 988          * tcg_cpu_exec() loop so it will recalculate its
 989          * icount deadline immediately.
 990          */
 991         qemu_cpu_kick(current_cpu);
 992     } else if (first_cpu) {
 993         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 994          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 995          * causes cpu_thread_is_idle to return false.  This way,
 996          * handle_icount_deadline can run.
 997          * If we have no CPUs at all for some reason, we don't
 998          * need to do anything.
 999          */
1000         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
1001     }
1002 }
1003
1004 static void kick_tcg_thread(void *opaque)
1005 {
1006     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1007     qemu_cpu_kick_rr_next_cpu();
1008 }
1009
1010 static void start_tcg_kick_timer(void)
1011 {
1012     assert(!mttcg_enabled);
1013     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
1014         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
1015                                            kick_tcg_thread, NULL);
1016     }
1017     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
1018         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1019     }
1020 }
1021
1022 static void stop_tcg_kick_timer(void)
1023 {
1024     assert(!mttcg_enabled);
1025     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1026         timer_del(tcg_kick_vcpu_timer);
1027     }
1028 }
1029
1030 /***********************************************************/
1031 void hw_error(const char *fmt, ...)
1032 {
1033     va_list ap;
1034     CPUState *cpu;
1035
1036     va_start(ap, fmt);
1037     fprintf(stderr, "qemu: hardware error: ");
1038     vfprintf(stderr, fmt, ap);
1039     fprintf(stderr, "\n");
1040     CPU_FOREACH(cpu) {
1041         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1042         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1043     }
1044     va_end(ap);
1045     abort();
1046 }
1047
1048 void cpu_synchronize_all_states(void)
1049 {
1050     CPUState *cpu;
1051
1052     CPU_FOREACH(cpu) {
1053         cpu_synchronize_state(cpu);
1054         /* TODO: move to cpu_synchronize_state() */
1055         if (hvf_enabled()) {
1056             hvf_cpu_synchronize_state(cpu);
1057         }
1058     }
1059 }
1060
1061 void cpu_synchronize_all_post_reset(void)
1062 {
1063     CPUState *cpu;
1064
1065     CPU_FOREACH(cpu) {
1066         cpu_synchronize_post_reset(cpu);
1067         /* TODO: move to cpu_synchronize_post_reset() */
1068         if (hvf_enabled()) {
1069             hvf_cpu_synchronize_post_reset(cpu);
1070         }
1071     }
1072 }
1073
1074 void cpu_synchronize_all_post_init(void)
1075 {
1076     CPUState *cpu;
1077
1078     CPU_FOREACH(cpu) {
1079         cpu_synchronize_post_init(cpu);
1080         /* TODO: move to cpu_synchronize_post_init() */
1081         if (hvf_enabled()) {
1082             hvf_cpu_synchronize_post_init(cpu);
1083         }
1084     }
1085 }
1086
1087 void cpu_synchronize_all_pre_loadvm(void)
1088 {
1089     CPUState *cpu;
1090
1091     CPU_FOREACH(cpu) {
1092         cpu_synchronize_pre_loadvm(cpu);
1093     }
1094 }
1095
1096 static int do_vm_stop(RunState state, bool send_stop)
1097 {
1098     int ret = 0;
1099
1100     if (runstate_is_running()) {
1101         cpu_disable_ticks();
1102         pause_all_vcpus();
1103         runstate_set(state);
1104         vm_state_notify(0, state);
1105         if (send_stop) {
1106             qapi_event_send_stop();
1107         }
1108     }
1109
1110     bdrv_drain_all();
1111     ret = bdrv_flush_all();
1112
1113     return ret;
1114 }
1115
1116 /* Special vm_stop() variant for terminating the process.  Historically clients
1117  * did not expect a QMP STOP event and so we need to retain compatibility.
1118  */
1119 int vm_shutdown(void)
1120 {
1121     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1122 }
1123
1124 static bool cpu_can_run(CPUState *cpu)
1125 {
1126     if (cpu->stop) {
1127         return false;
1128     }
1129     if (cpu_is_stopped(cpu)) {
1130         return false;
1131     }
1132     return true;
1133 }
1134
/*
 * Handle a debug exit (EXCP_DEBUG) from @cpu: record it as the vCPU the
 * gdbstub should report, raise a system debug request, and mark the
 * vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1141
1142 #ifdef CONFIG_LINUX
/*
 * Re-deliver SIGBUS with its default disposition.
 *
 * Restores SIG_DFL for SIGBUS, raises the signal and then unblocks it
 * for the calling thread.  Reaching the perror()/abort() tail means the
 * process was not terminated by the re-raised signal (or sigaction()
 * failed).  This function never returns.
 *
 * NOTE(review): presumably SIGBUS is blocked while this runs, so that
 * the raised signal is only delivered by the unblock below -- confirm.
 */
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}
1159
1160 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1161 {
1162     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1163         sigbus_reraise();
1164     }
1165
1166     if (current_cpu) {
1167         /* Called asynchronously in VCPU thread.  */
1168         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1169             sigbus_reraise();
1170         }
1171     } else {
1172         /* Called synchronously (via signalfd) in main thread.  */
1173         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1174             sigbus_reraise();
1175         }
1176     }
1177 }
1178
1179 static void qemu_init_sigbus(void)
1180 {
1181     struct sigaction action;
1182
1183     memset(&action, 0, sizeof(action));
1184     action.sa_flags = SA_SIGINFO;
1185     action.sa_sigaction = sigbus_handler;
1186     sigaction(SIGBUS, &action, NULL);
1187
1188     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1189 }
1190 #else /* !CONFIG_LINUX */
/* No SIGBUS handling is set up when CONFIG_LINUX is not defined. */
static void qemu_init_sigbus(void)
{
}
1194 #endif /* !CONFIG_LINUX */
1195
/* Handle of the thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads after they change cpu->created */
static QemuCond qemu_cpu_cond;
/* Broadcast by qemu_cpu_stop(); pause_all_vcpus() waits on it */
static QemuCond qemu_pause_cond;
1202
1203 void qemu_init_cpu_loop(void)
1204 {
1205     qemu_init_sigbus();
1206     qemu_cond_init(&qemu_cpu_cond);
1207     qemu_cond_init(&qemu_pause_cond);
1208     qemu_mutex_init(&qemu_global_mutex);
1209
1210     qemu_thread_get_self(&io_thread);
1211 }
1212
/*
 * Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing
 * the global mutex as the lock argument.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1217
1218 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1219 {
1220     if (kvm_destroy_vcpu(cpu) < 0) {
1221         error_report("kvm_destroy_vcpu failed");
1222         exit(EXIT_FAILURE);
1223     }
1224 }
1225
/* TCG counterpart of the per-accelerator vCPU teardown; nothing to do. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1229
/*
 * Move @cpu into the stopped state.  Must be called from the thread
 * that runs @cpu.
 *
 * Clears the pending stop request, marks the vCPU stopped, optionally
 * requests an exit from the execution loop, and wakes every waiter on
 * qemu_pause_cond (pause_all_vcpus() waits there).
 *
 * @cpu: vCPU to stop
 * @exit: when true, also call cpu_exit() on @cpu
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1240
/*
 * Bookkeeping shared by all the wait-for-event paths: clear the
 * "thread kicked" flag, honour a pending stop request, and run the
 * work items queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    /* Re-enable kicks; qemu_cpu_kick_thread() skips already-kicked CPUs. */
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1249
1250 static void qemu_tcg_rr_wait_io_event(void)
1251 {
1252     CPUState *cpu;
1253
1254     while (all_cpu_threads_idle()) {
1255         stop_tcg_kick_timer();
1256         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1257     }
1258
1259     start_tcg_kick_timer();
1260
1261     CPU_FOREACH(cpu) {
1262         qemu_wait_io_event_common(cpu);
1263     }
1264 }
1265
1266 static void qemu_wait_io_event(CPUState *cpu)
1267 {
1268     bool slept = false;
1269
1270     while (cpu_thread_is_idle(cpu)) {
1271         if (!slept) {
1272             slept = true;
1273             qemu_plugin_vcpu_idle_cb(cpu);
1274         }
1275         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1276     }
1277     if (slept) {
1278         qemu_plugin_vcpu_resume_cb(cpu);
1279     }
1280
1281 #ifdef _WIN32
1282     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1283     if (!tcg_enabled()) {
1284         SleepEx(0, TRUE);
1285     }
1286 #endif
1287     qemu_wait_io_event_common(cpu);
1288 }
1289
1290 static void *qemu_kvm_cpu_thread_fn(void *arg)
1291 {
1292     CPUState *cpu = arg;
1293     int r;
1294
1295     rcu_register_thread();
1296
1297     qemu_mutex_lock_iothread();
1298     qemu_thread_get_self(cpu->thread);
1299     cpu->thread_id = qemu_get_thread_id();
1300     cpu->can_do_io = 1;
1301     current_cpu = cpu;
1302
1303     r = kvm_init_vcpu(cpu);
1304     if (r < 0) {
1305         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1306         exit(1);
1307     }
1308
1309     kvm_init_cpu_signals(cpu);
1310
1311     /* signal CPU creation */
1312     cpu->created = true;
1313     qemu_cond_signal(&qemu_cpu_cond);
1314     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1315
1316     do {
1317         if (cpu_can_run(cpu)) {
1318             r = kvm_cpu_exec(cpu);
1319             if (r == EXCP_DEBUG) {
1320                 cpu_handle_guest_debug(cpu);
1321             }
1322         }
1323         qemu_wait_io_event(cpu);
1324     } while (!cpu->unplug || cpu_can_run(cpu));
1325
1326     qemu_kvm_destroy_vcpu(cpu);
1327     cpu->created = false;
1328     qemu_cond_signal(&qemu_cpu_cond);
1329     qemu_mutex_unlock_iothread();
1330     rcu_unregister_thread();
1331     return NULL;
1332 }
1333
1334 static void *qemu_dummy_cpu_thread_fn(void *arg)
1335 {
1336 #ifdef _WIN32
1337     error_report("qtest is not supported under Windows");
1338     exit(1);
1339 #else
1340     CPUState *cpu = arg;
1341     sigset_t waitset;
1342     int r;
1343
1344     rcu_register_thread();
1345
1346     qemu_mutex_lock_iothread();
1347     qemu_thread_get_self(cpu->thread);
1348     cpu->thread_id = qemu_get_thread_id();
1349     cpu->can_do_io = 1;
1350     current_cpu = cpu;
1351
1352     sigemptyset(&waitset);
1353     sigaddset(&waitset, SIG_IPI);
1354
1355     /* signal CPU creation */
1356     cpu->created = true;
1357     qemu_cond_signal(&qemu_cpu_cond);
1358     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1359
1360     do {
1361         qemu_mutex_unlock_iothread();
1362         do {
1363             int sig;
1364             r = sigwait(&waitset, &sig);
1365         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1366         if (r == -1) {
1367             perror("sigwait");
1368             exit(1);
1369         }
1370         qemu_mutex_lock_iothread();
1371         qemu_wait_io_event(cpu);
1372     } while (!cpu->unplug);
1373
1374     qemu_mutex_unlock_iothread();
1375     rcu_unregister_thread();
1376     return NULL;
1377 #endif
1378 }
1379
1380 static int64_t tcg_get_icount_limit(void)
1381 {
1382     int64_t deadline;
1383
1384     if (replay_mode != REPLAY_MODE_PLAY) {
1385         /*
1386          * Include all the timers, because they may need an attention.
1387          * Too long CPU execution may create unnecessary delay in UI.
1388          */
1389         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1390                                               QEMU_TIMER_ATTR_ALL);
1391
1392         /* Maintain prior (possibly buggy) behaviour where if no deadline
1393          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1394          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1395          * nanoseconds.
1396          */
1397         if ((deadline < 0) || (deadline > INT32_MAX)) {
1398             deadline = INT32_MAX;
1399         }
1400
1401         return qemu_icount_round(deadline);
1402     } else {
1403         return replay_get_instructions();
1404     }
1405 }
1406
1407 static void handle_icount_deadline(void)
1408 {
1409     assert(qemu_in_vcpu_thread());
1410     if (use_icount) {
1411         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1412                                                       QEMU_TIMER_ATTR_ALL);
1413
1414         if (deadline == 0) {
1415             /* Wake up other AioContexts.  */
1416             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1417             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1418         }
1419     }
1420 }
1421
1422 static void prepare_icount_for_run(CPUState *cpu)
1423 {
1424     if (use_icount) {
1425         int insns_left;
1426
1427         /* These should always be cleared by process_icount_data after
1428          * each vCPU execution. However u16.high can be raised
1429          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1430          */
1431         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1432         g_assert(cpu->icount_extra == 0);
1433
1434         cpu->icount_budget = tcg_get_icount_limit();
1435         insns_left = MIN(0xffff, cpu->icount_budget);
1436         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1437         cpu->icount_extra = cpu->icount_budget - insns_left;
1438
1439         replay_mutex_lock();
1440     }
1441 }
1442
1443 static void process_icount_data(CPUState *cpu)
1444 {
1445     if (use_icount) {
1446         /* Account for executed instructions */
1447         cpu_update_icount(cpu);
1448
1449         /* Reset the counters */
1450         cpu_neg(cpu)->icount_decr.u16.low = 0;
1451         cpu->icount_extra = 0;
1452         cpu->icount_budget = 0;
1453
1454         replay_account_executed_instructions();
1455
1456         replay_mutex_unlock();
1457     }
1458 }
1459
1460
1461 static int tcg_cpu_exec(CPUState *cpu)
1462 {
1463     int ret;
1464 #ifdef CONFIG_PROFILER
1465     int64_t ti;
1466 #endif
1467
1468     assert(tcg_enabled());
1469 #ifdef CONFIG_PROFILER
1470     ti = profile_getclock();
1471 #endif
1472     cpu_exec_start(cpu);
1473     ret = cpu_exec(cpu);
1474     cpu_exec_end(cpu);
1475 #ifdef CONFIG_PROFILER
1476     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1477                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1478 #endif
1479     return ret;
1480 }
1481
1482 /* Destroy any remaining vCPUs which have been unplugged and have
1483  * finished running
1484  */
1485 static void deal_with_unplugged_cpus(void)
1486 {
1487     CPUState *cpu;
1488
1489     CPU_FOREACH(cpu) {
1490         if (cpu->unplug && !cpu_can_run(cpu)) {
1491             qemu_tcg_destroy_vcpu(cpu);
1492             cpu->created = false;
1493             qemu_cond_signal(&qemu_cpu_cond);
1494             break;
1495         }
1496     }
1497 }
1498
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

/*
 * Thread body shared by all vCPUs in round-robin TCG mode.
 *
 * @arg is the CPUState the thread was created for; after start-up the
 * local "cpu" variable is reused as the cursor over the CPU list.
 * The main loop never terminates, so the statements after it are not
 * reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /*
         * Drop the global mutex before taking the replay mutex, then
         * take the global mutex again.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around once the end of the CPU list has been reached. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /*
         * Run vCPUs one after another until the list ends, or the
         * current vCPU has queued work or a pending exit request.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the global mutex held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past a vCPU that is being unplugged. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        /* Acknowledge the exit request that ended the inner loop. */
        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    /* Not reached: the loop above has no exit. */
    rcu_unregister_thread();
    return NULL;
}
1621
1622 static void *qemu_hax_cpu_thread_fn(void *arg)
1623 {
1624     CPUState *cpu = arg;
1625     int r;
1626
1627     rcu_register_thread();
1628     qemu_mutex_lock_iothread();
1629     qemu_thread_get_self(cpu->thread);
1630
1631     cpu->thread_id = qemu_get_thread_id();
1632     cpu->created = true;
1633     current_cpu = cpu;
1634
1635     hax_init_vcpu(cpu);
1636     qemu_cond_signal(&qemu_cpu_cond);
1637     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1638
1639     do {
1640         if (cpu_can_run(cpu)) {
1641             r = hax_smp_cpu_exec(cpu);
1642             if (r == EXCP_DEBUG) {
1643                 cpu_handle_guest_debug(cpu);
1644             }
1645         }
1646
1647         qemu_wait_io_event(cpu);
1648     } while (!cpu->unplug || cpu_can_run(cpu));
1649     rcu_unregister_thread();
1650     return NULL;
1651 }
1652
1653 /* The HVF-specific vCPU thread function. This one should only run when the host
1654  * CPU supports the VMX "unrestricted guest" feature. */
1655 static void *qemu_hvf_cpu_thread_fn(void *arg)
1656 {
1657     CPUState *cpu = arg;
1658
1659     int r;
1660
1661     assert(hvf_enabled());
1662
1663     rcu_register_thread();
1664
1665     qemu_mutex_lock_iothread();
1666     qemu_thread_get_self(cpu->thread);
1667
1668     cpu->thread_id = qemu_get_thread_id();
1669     cpu->can_do_io = 1;
1670     current_cpu = cpu;
1671
1672     hvf_init_vcpu(cpu);
1673
1674     /* signal CPU creation */
1675     cpu->created = true;
1676     qemu_cond_signal(&qemu_cpu_cond);
1677     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1678
1679     do {
1680         if (cpu_can_run(cpu)) {
1681             r = hvf_vcpu_exec(cpu);
1682             if (r == EXCP_DEBUG) {
1683                 cpu_handle_guest_debug(cpu);
1684             }
1685         }
1686         qemu_wait_io_event(cpu);
1687     } while (!cpu->unplug || cpu_can_run(cpu));
1688
1689     hvf_vcpu_destroy(cpu);
1690     cpu->created = false;
1691     qemu_cond_signal(&qemu_cpu_cond);
1692     qemu_mutex_unlock_iothread();
1693     rcu_unregister_thread();
1694     return NULL;
1695 }
1696
1697 static void *qemu_whpx_cpu_thread_fn(void *arg)
1698 {
1699     CPUState *cpu = arg;
1700     int r;
1701
1702     rcu_register_thread();
1703
1704     qemu_mutex_lock_iothread();
1705     qemu_thread_get_self(cpu->thread);
1706     cpu->thread_id = qemu_get_thread_id();
1707     current_cpu = cpu;
1708
1709     r = whpx_init_vcpu(cpu);
1710     if (r < 0) {
1711         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1712         exit(1);
1713     }
1714
1715     /* signal CPU creation */
1716     cpu->created = true;
1717     qemu_cond_signal(&qemu_cpu_cond);
1718     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1719
1720     do {
1721         if (cpu_can_run(cpu)) {
1722             r = whpx_vcpu_exec(cpu);
1723             if (r == EXCP_DEBUG) {
1724                 cpu_handle_guest_debug(cpu);
1725             }
1726         }
1727         while (cpu_thread_is_idle(cpu)) {
1728             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1729         }
1730         qemu_wait_io_event_common(cpu);
1731     } while (!cpu->unplug || cpu_can_run(cpu));
1732
1733     whpx_destroy_vcpu(cpu);
1734     cpu->created = false;
1735     qemu_cond_signal(&qemu_cpu_cond);
1736     qemu_mutex_unlock_iothread();
1737     rcu_unregister_thread();
1738     return NULL;
1739 }
1740
1741 #ifdef _WIN32
/*
 * Empty APC routine.  qemu_cpu_kick_thread() queues it to a vCPU thread
 * with QueueUserAPC(); the routine itself does nothing.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1745 #endif
1746
1747 /* Multi-threaded TCG
1748  *
1749  * In the multi-threaded case each vCPU has its own thread. The TLS
1750  * variable current_cpu can be used deep in the code to find the
1751  * current CPUState for a given thread.
1752  */
1753
1754 static void *qemu_tcg_cpu_thread_fn(void *arg)
1755 {
1756     CPUState *cpu = arg;
1757
1758     assert(tcg_enabled());
1759     g_assert(!use_icount);
1760
1761     rcu_register_thread();
1762     tcg_register_thread();
1763
1764     qemu_mutex_lock_iothread();
1765     qemu_thread_get_self(cpu->thread);
1766
1767     cpu->thread_id = qemu_get_thread_id();
1768     cpu->created = true;
1769     cpu->can_do_io = 1;
1770     current_cpu = cpu;
1771     qemu_cond_signal(&qemu_cpu_cond);
1772     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1773
1774     /* process any pending work */
1775     cpu->exit_request = 1;
1776
1777     do {
1778         if (cpu_can_run(cpu)) {
1779             int r;
1780             qemu_mutex_unlock_iothread();
1781             r = tcg_cpu_exec(cpu);
1782             qemu_mutex_lock_iothread();
1783             switch (r) {
1784             case EXCP_DEBUG:
1785                 cpu_handle_guest_debug(cpu);
1786                 break;
1787             case EXCP_HALTED:
1788                 /* during start-up the vCPU is reset and the thread is
1789                  * kicked several times. If we don't ensure we go back
1790                  * to sleep in the halted state we won't cleanly
1791                  * start-up when the vCPU is enabled.
1792                  *
1793                  * cpu->halted should ensure we sleep in wait_io_event
1794                  */
1795                 g_assert(cpu->halted);
1796                 break;
1797             case EXCP_ATOMIC:
1798                 qemu_mutex_unlock_iothread();
1799                 cpu_exec_step_atomic(cpu);
1800                 qemu_mutex_lock_iothread();
1801             default:
1802                 /* Ignore everything else? */
1803                 break;
1804             }
1805         }
1806
1807         atomic_mb_set(&cpu->exit_request, 0);
1808         qemu_wait_io_event(cpu);
1809     } while (!cpu->unplug || cpu_can_run(cpu));
1810
1811     qemu_tcg_destroy_vcpu(cpu);
1812     cpu->created = false;
1813     qemu_cond_signal(&qemu_cpu_cond);
1814     qemu_mutex_unlock_iothread();
1815     rcu_unregister_thread();
1816     return NULL;
1817 }
1818
1819 static void qemu_cpu_kick_thread(CPUState *cpu)
1820 {
1821 #ifndef _WIN32
1822     int err;
1823
1824     if (cpu->thread_kicked) {
1825         return;
1826     }
1827     cpu->thread_kicked = true;
1828     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1829     if (err && err != ESRCH) {
1830         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1831         exit(1);
1832     }
1833 #else /* _WIN32 */
1834     if (!qemu_cpu_is_self(cpu)) {
1835         if (whpx_enabled()) {
1836             whpx_vcpu_kick(cpu);
1837         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1838             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1839                     __func__, GetLastError());
1840             exit(1);
1841         }
1842     }
1843 #endif
1844 }
1845
1846 void qemu_cpu_kick(CPUState *cpu)
1847 {
1848     qemu_cond_broadcast(cpu->halt_cond);
1849     if (tcg_enabled()) {
1850         if (qemu_tcg_mttcg_enabled()) {
1851             cpu_exit(cpu);
1852         } else {
1853             qemu_cpu_kick_rr_cpus();
1854         }
1855     } else {
1856         if (hax_enabled()) {
1857             /*
1858              * FIXME: race condition with the exit_request check in
1859              * hax_vcpu_hax_exec
1860              */
1861             cpu->exit_request = 1;
1862         }
1863         qemu_cpu_kick_thread(cpu);
1864     }
1865 }
1866
/*
 * Kick the host thread of the vCPU bound to the calling thread.
 * current_cpu must be set.
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1872
/* Return true when the calling thread is the thread recorded for @cpu. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1877
1878 bool qemu_in_vcpu_thread(void)
1879 {
1880     return current_cpu && qemu_cpu_is_self(current_cpu);
1881 }
1882
/*
 * Per-thread flag, true while the thread holds the global mutex through
 * the lock/unlock helpers below.  Starts out false, as static storage
 * is zero-initialised.
 */
static __thread bool iothread_locked;

/* Return whether the calling thread currently holds the global mutex. */
bool qemu_mutex_iothread_locked(void)
{
    const bool held = iothread_locked;

    return held;
}
1889
1890 /*
1891  * The BQL is taken from so many places that it is worth profiling the
1892  * callers directly, instead of funneling them all through a single function.
1893  */
1894 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1895 {
1896     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1897
1898     g_assert(!qemu_mutex_iothread_locked());
1899     bql_lock(&qemu_global_mutex, file, line);
1900     iothread_locked = true;
1901 }
1902
/*
 * Release the global mutex.  The calling thread must currently hold it;
 * the per-thread ownership flag is cleared before the mutex is unlocked.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1909
1910 static bool all_vcpus_paused(void)
1911 {
1912     CPUState *cpu;
1913
1914     CPU_FOREACH(cpu) {
1915         if (!cpu->stopped) {
1916             return false;
1917         }
1918     }
1919
1920     return true;
1921 }
1922
1923 void pause_all_vcpus(void)
1924 {
1925     CPUState *cpu;
1926
1927     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1928     CPU_FOREACH(cpu) {
1929         if (qemu_cpu_is_self(cpu)) {
1930             qemu_cpu_stop(cpu, true);
1931         } else {
1932             cpu->stop = true;
1933             qemu_cpu_kick(cpu);
1934         }
1935     }
1936
1937     /* We need to drop the replay_lock so any vCPU threads woken up
1938      * can finish their replay tasks
1939      */
1940     replay_mutex_unlock();
1941
1942     while (!all_vcpus_paused()) {
1943         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1944         CPU_FOREACH(cpu) {
1945             qemu_cpu_kick(cpu);
1946         }
1947     }
1948
1949     qemu_mutex_unlock_iothread();
1950     replay_mutex_lock();
1951     qemu_mutex_lock_iothread();
1952 }
1953
/*
 * Let @cpu run again: clear both the stop request and the stopped
 * state, then kick the vCPU so it notices the change.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1960
1961 void resume_all_vcpus(void)
1962 {
1963     CPUState *cpu;
1964
1965     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1966     CPU_FOREACH(cpu) {
1967         cpu_resume(cpu);
1968     }
1969 }
1970
/*
 * Request that @cpu stop and unplug, then wait for its thread to finish.
 *
 * The iothread lock is released around qemu_thread_join() and re-taken
 * afterwards, so it must be held on entry and is held again on return.
 * NOTE(review): presumably the lock is dropped so the exiting vCPU thread
 * can acquire it while shutting down -- confirm.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    /* Kick the vCPU so it notices the flags set above. */
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1980
/*
 * Size, in bytes, of the temporary stack buffers used for forming a vCPU
 * thread name.  It is the limit handed to snprintf(), so it includes the
 * terminating NUL.
 */
#define VCPU_THREAD_NAME_SIZE 16
1983
1984 static void qemu_tcg_init_vcpu(CPUState *cpu)
1985 {
1986     char thread_name[VCPU_THREAD_NAME_SIZE];
1987     static QemuCond *single_tcg_halt_cond;
1988     static QemuThread *single_tcg_cpu_thread;
1989     static int tcg_region_inited;
1990
1991     assert(tcg_enabled());
1992     /*
1993      * Initialize TCG regions--once. Now is a good time, because:
1994      * (1) TCG's init context, prologue and target globals have been set up.
1995      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1996      *     -accel flag is processed, so the check doesn't work then).
1997      */
1998     if (!tcg_region_inited) {
1999         tcg_region_inited = 1;
2000         tcg_region_init();
2001     }
2002
2003     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
2004         cpu->thread = g_malloc0(sizeof(QemuThread));
2005         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2006         qemu_cond_init(cpu->halt_cond);
2007
2008         if (qemu_tcg_mttcg_enabled()) {
2009             /* create a thread per vCPU with TCG (MTTCG) */
2010             parallel_cpus = true;
2011             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
2012                  cpu->cpu_index);
2013
2014             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
2015                                cpu, QEMU_THREAD_JOINABLE);
2016
2017         } else {
2018             /* share a single thread for all cpus with TCG */
2019             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
2020             qemu_thread_create(cpu->thread, thread_name,
2021                                qemu_tcg_rr_cpu_thread_fn,
2022                                cpu, QEMU_THREAD_JOINABLE);
2023
2024             single_tcg_halt_cond = cpu->halt_cond;
2025             single_tcg_cpu_thread = cpu->thread;
2026         }
2027 #ifdef _WIN32
2028         cpu->hThread = qemu_thread_get_handle(cpu->thread);
2029 #endif
2030     } else {
2031         /* For non-MTTCG cases we share the thread */
2032         cpu->thread = single_tcg_cpu_thread;
2033         cpu->halt_cond = single_tcg_halt_cond;
2034         cpu->thread_id = first_cpu->thread_id;
2035         cpu->can_do_io = 1;
2036         cpu->created = true;
2037     }
2038 }
2039
2040 static void qemu_hax_start_vcpu(CPUState *cpu)
2041 {
2042     char thread_name[VCPU_THREAD_NAME_SIZE];
2043
2044     cpu->thread = g_malloc0(sizeof(QemuThread));
2045     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2046     qemu_cond_init(cpu->halt_cond);
2047
2048     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2049              cpu->cpu_index);
2050     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2051                        cpu, QEMU_THREAD_JOINABLE);
2052 #ifdef _WIN32
2053     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2054 #endif
2055 }
2056
2057 static void qemu_kvm_start_vcpu(CPUState *cpu)
2058 {
2059     char thread_name[VCPU_THREAD_NAME_SIZE];
2060
2061     cpu->thread = g_malloc0(sizeof(QemuThread));
2062     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2063     qemu_cond_init(cpu->halt_cond);
2064     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2065              cpu->cpu_index);
2066     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2067                        cpu, QEMU_THREAD_JOINABLE);
2068 }
2069
2070 static void qemu_hvf_start_vcpu(CPUState *cpu)
2071 {
2072     char thread_name[VCPU_THREAD_NAME_SIZE];
2073
2074     /* HVF currently does not support TCG, and only runs in
2075      * unrestricted-guest mode. */
2076     assert(hvf_enabled());
2077
2078     cpu->thread = g_malloc0(sizeof(QemuThread));
2079     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2080     qemu_cond_init(cpu->halt_cond);
2081
2082     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2083              cpu->cpu_index);
2084     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2085                        cpu, QEMU_THREAD_JOINABLE);
2086 }
2087
2088 static void qemu_whpx_start_vcpu(CPUState *cpu)
2089 {
2090     char thread_name[VCPU_THREAD_NAME_SIZE];
2091
2092     cpu->thread = g_malloc0(sizeof(QemuThread));
2093     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2094     qemu_cond_init(cpu->halt_cond);
2095     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2096              cpu->cpu_index);
2097     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2098                        cpu, QEMU_THREAD_JOINABLE);
2099 #ifdef _WIN32
2100     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2101 #endif
2102 }
2103
2104 static void qemu_dummy_start_vcpu(CPUState *cpu)
2105 {
2106     char thread_name[VCPU_THREAD_NAME_SIZE];
2107
2108     cpu->thread = g_malloc0(sizeof(QemuThread));
2109     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2110     qemu_cond_init(cpu->halt_cond);
2111     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2112              cpu->cpu_index);
2113     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2114                        QEMU_THREAD_JOINABLE);
2115 }
2116
2117 void qemu_init_vcpu(CPUState *cpu)
2118 {
2119     MachineState *ms = MACHINE(qdev_get_machine());
2120
2121     cpu->nr_cores = ms->smp.cores;
2122     cpu->nr_threads =  ms->smp.threads;
2123     cpu->stopped = true;
2124     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2125
2126     if (!cpu->as) {
2127         /* If the target cpu hasn't set up any address spaces itself,
2128          * give it the default one.
2129          */
2130         cpu->num_ases = 1;
2131         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2132     }
2133
2134     if (kvm_enabled()) {
2135         qemu_kvm_start_vcpu(cpu);
2136     } else if (hax_enabled()) {
2137         qemu_hax_start_vcpu(cpu);
2138     } else if (hvf_enabled()) {
2139         qemu_hvf_start_vcpu(cpu);
2140     } else if (tcg_enabled()) {
2141         qemu_tcg_init_vcpu(cpu);
2142     } else if (whpx_enabled()) {
2143         qemu_whpx_start_vcpu(cpu);
2144     } else {
2145         qemu_dummy_start_vcpu(cpu);
2146     }
2147
2148     while (!cpu->created) {
2149         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2150     }
2151 }
2152
2153 void cpu_stop_current(void)
2154 {
2155     if (current_cpu) {
2156         current_cpu->stop = true;
2157         cpu_exit(current_cpu);
2158     }
2159 }
2160
2161 int vm_stop(RunState state)
2162 {
2163     if (qemu_in_vcpu_thread()) {
2164         qemu_system_vmstop_request_prepare();
2165         qemu_system_vmstop_request(state);
2166         /*
2167          * FIXME: should not return to device code in case
2168          * vm_stop() has been requested.
2169          */
2170         cpu_stop_current();
2171         return 0;
2172     }
2173
2174     return do_vm_stop(state, true);
2175 }
2176
2177 /**
2178  * Prepare for (re)starting the VM.
2179  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2180  * running or in case of an error condition), 0 otherwise.
2181  */
2182 int vm_prepare_start(void)
2183 {
2184     RunState requested;
2185
2186     qemu_vmstop_requested(&requested);
2187     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2188         return -1;
2189     }
2190
2191     /* Ensure that a STOP/RESUME pair of events is emitted if a
2192      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2193      * example, according to documentation is always followed by
2194      * the STOP event.
2195      */
2196     if (runstate_is_running()) {
2197         qapi_event_send_stop();
2198         qapi_event_send_resume();
2199         return -1;
2200     }
2201
2202     /* We are sending this now, but the CPUs will be resumed shortly later */
2203     qapi_event_send_resume();
2204
2205     cpu_enable_ticks();
2206     runstate_set(RUN_STATE_RUNNING);
2207     vm_state_notify(1, RUN_STATE_RUNNING);
2208     return 0;
2209 }
2210
/*
 * Start (or restart) the VM: the vCPUs are resumed only when
 * vm_prepare_start() returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2217
2218 /* does a state transition even if the VM is already stopped,
2219    current state is forgotten forever */
2220 int vm_stop_force_state(RunState state)
2221 {
2222     if (runstate_is_running()) {
2223         return vm_stop(state);
2224     } else {
2225         runstate_set(state);
2226
2227         bdrv_drain_all();
2228         /* Make sure to return an error if the flush in a previous vm_stop()
2229          * failed. */
2230         return bdrv_flush_all();
2231     }
2232 }
2233
/*
 * Invoke the target's cpu_list() hook when the cpu_list macro is defined;
 * otherwise do nothing.  @optarg is not used.
 */
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#ifdef cpu_list
    cpu_list();
#endif
}
2241
2242 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2243                  bool has_cpu, int64_t cpu_index, Error **errp)
2244 {
2245     FILE *f;
2246     uint32_t l;
2247     CPUState *cpu;
2248     uint8_t buf[1024];
2249     int64_t orig_addr = addr, orig_size = size;
2250
2251     if (!has_cpu) {
2252         cpu_index = 0;
2253     }
2254
2255     cpu = qemu_get_cpu(cpu_index);
2256     if (cpu == NULL) {
2257         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2258                    "a CPU number");
2259         return;
2260     }
2261
2262     f = fopen(filename, "wb");
2263     if (!f) {
2264         error_setg_file_open(errp, errno, filename);
2265         return;
2266     }
2267
2268     while (size != 0) {
2269         l = sizeof(buf);
2270         if (l > size)
2271             l = size;
2272         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2273             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2274                              " specified", orig_addr, orig_size);
2275             goto exit;
2276         }
2277         if (fwrite(buf, 1, l, f) != l) {
2278             error_setg(errp, QERR_IO_ERROR);
2279             goto exit;
2280         }
2281         addr += l;
2282         size -= l;
2283     }
2284
2285 exit:
2286     fclose(f);
2287 }
2288
2289 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2290                   Error **errp)
2291 {
2292     FILE *f;
2293     uint32_t l;
2294     uint8_t buf[1024];
2295
2296     f = fopen(filename, "wb");
2297     if (!f) {
2298         error_setg_file_open(errp, errno, filename);
2299         return;
2300     }
2301
2302     while (size != 0) {
2303         l = sizeof(buf);
2304         if (l > size)
2305             l = size;
2306         cpu_physical_memory_read(addr, buf, l);
2307         if (fwrite(buf, 1, l, f) != l) {
2308             error_setg(errp, QERR_IO_ERROR);
2309             goto exit;
2310         }
2311         addr += l;
2312         size -= l;
2313     }
2314
2315 exit:
2316     fclose(f);
2317 }
2318
/*
 * QMP 'inject-nmi' handler: forwards to nmi_monitor_handle() with the CPU
 * index returned by monitor_get_cpu_index(); errors are reported through
 * @errp.
 */
void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}
2323
2324 void dump_drift_info(void)
2325 {
2326     if (!use_icount) {
2327         return;
2328     }
2329
2330     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2331                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2332     if (icount_align_option) {
2333         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2334                     -max_delay / SCALE_MS);
2335         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2336                     max_advance / SCALE_MS);
2337     } else {
2338         qemu_printf("Max guest delay     NA\n");
2339         qemu_printf("Max guest advance   NA\n");
2340     }
2341 }