memory: add MemoryRegionIOMMUOps.replay() callback
[qemu.git] / cpus.c
blob740b8dc3f808b320cce92c434af93f8bc315eb79
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
54 #ifdef CONFIG_LINUX
56 #include <sys/prctl.h>
58 #ifndef PR_MCE_KILL
59 #define PR_MCE_KILL 33
60 #endif
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
64 #endif
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
68 #endif
70 #endif /* CONFIG_LINUX */
72 int64_t max_delay;
73 int64_t max_advance;
75 /* vcpu throttling controls */
76 static QEMUTimer *throttle_timer;
77 static unsigned int throttle_percentage;
79 #define CPU_THROTTLE_PCT_MIN 1
80 #define CPU_THROTTLE_PCT_MAX 99
81 #define CPU_THROTTLE_TIMESLICE_NS 10000000
83 bool cpu_is_stopped(CPUState *cpu)
85 return cpu->stopped || !runstate_is_running();
88 static bool cpu_thread_is_idle(CPUState *cpu)
90 if (cpu->stop || cpu->queued_work_first) {
91 return false;
93 if (cpu_is_stopped(cpu)) {
94 return true;
96 if (!cpu->halted || cpu_has_work(cpu) ||
97 kvm_halt_in_kernel()) {
98 return false;
100 return true;
103 static bool all_cpu_threads_idle(void)
105 CPUState *cpu;
107 CPU_FOREACH(cpu) {
108 if (!cpu_thread_is_idle(cpu)) {
109 return false;
112 return true;
115 /***********************************************************/
116 /* guest cycle counter */
118 /* Protected by TimersState seqlock */
120 static bool icount_sleep = true;
121 static int64_t vm_clock_warp_start = -1;
122 /* Conversion factor from emulated instructions to virtual clock ticks. */
123 static int icount_time_shift;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
127 static QEMUTimer *icount_rt_timer;
128 static QEMUTimer *icount_vm_timer;
129 static QEMUTimer *icount_warp_timer;
/* Global timekeeping state.  Field order matters: cpu_ticks_offset,
 * dummy and cpu_clock_offset are referenced by vmstate_timers below,
 * so do not reorder or remove fields without bumping the vmstate.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;          /* kept for migration compatibility only */

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;
151 bool mttcg_enabled;
154 * We default to false if we know other options have been enabled
155 * which are currently incompatible with MTTCG. Otherwise when each
156 * guest (target) has been updated to support:
157 * - atomic instructions
158 * - memory ordering primitives (barriers)
159 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
161 * Once a guest architecture has been converted to the new primitives
162 * there are two remaining limitations to check.
164 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
165 * - The host must have a stronger memory order than the guest
167 * It may be possible in future to support strong guests on weak hosts
168 * but that will require tagging all load/stores in a guest with their
169 * implicit memory order requirements which would likely slow things
170 * down a lot.
173 static bool check_tcg_memory_orders_compatible(void)
175 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
176 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
177 #else
178 return false;
179 #endif
182 static bool default_mttcg_enabled(void)
184 if (use_icount || TCG_OVERSIZED_GUEST) {
185 return false;
186 } else {
187 #ifdef TARGET_SUPPORTS_MTTCG
188 return check_tcg_memory_orders_compatible();
189 #else
190 return false;
191 #endif
/* Parse the -accel tcg,thread=single|multi option and set mttcg_enabled.
 * Hard incompatibilities (oversized guest, icount) are reported via @errp;
 * softer concerns (unconverted guest, weaker host memory order) are only
 * warned about and MTTCG is still enabled as requested.
 */
void qemu_tcg_configure(QemuOpts *opts, Error **errp)
{
    const char *t = qemu_opt_get(opts, "thread");
    if (t) {
        if (strcmp(t, "multi") == 0) {
            if (TCG_OVERSIZED_GUEST) {
                error_setg(errp, "No MTTCG when guest word size > hosts");
            } else if (use_icount) {
                error_setg(errp, "No MTTCG when icount is enabled");
            } else {
#ifndef TARGET_SUPPORTS_MTTCG
                error_report("Guest not yet converted to MTTCG - "
                             "you may get unexpected results");
#endif
                if (!check_tcg_memory_orders_compatible()) {
                    error_report("Guest expects a stronger memory ordering "
                                 "than the host provides");
                    error_printf("This may cause strange/hard to debug errors\n");
                }
                mttcg_enabled = true;
            }
        } else if (strcmp(t, "single") == 0) {
            mttcg_enabled = false;
        } else {
            error_setg(errp, "Invalid 'thread' setting %s", t);
        }
    } else {
        /* No explicit request: pick a safe default for this target.  */
        mttcg_enabled = default_mttcg_enabled();
    }
}
/* The current number of executed instructions is based on what we
 * originally budgeted minus the current state of the decrementing
 * icount counters in extra/u16.low.
 */
static int64_t cpu_get_icount_executed(CPUState *cpu)
{
    return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
}

/*
 * Update the global shared timer_state.qemu_icount to take into
 * account executed instructions. This is done by the TCG vCPU
 * thread so the main-loop can see time has moved forward.
 */
void cpu_update_icount(CPUState *cpu)
{
    int64_t executed = cpu_get_icount_executed(cpu);
    cpu->icount_budget -= executed;

#ifdef CONFIG_ATOMIC64
    /* 64-bit atomics available: other threads may read concurrently.  */
    atomic_set__nocheck(&timers_state.qemu_icount,
                        atomic_read__nocheck(&timers_state.qemu_icount) +
                        executed);
#else /* FIXME: we need 64bit atomics to do this safely */
    timers_state.qemu_icount += executed;
#endif
}
/* Return the raw instruction counter.  Must only be called from contexts
 * where an icount read is valid (can_do_io set, or no vCPU running);
 * otherwise the count would be non-deterministic and we abort.
 */
int64_t cpu_get_icount_raw(void)
{
    CPUState *cpu = current_cpu;

    if (cpu && cpu->running) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        /* Take into account what has run */
        cpu_update_icount(cpu);
    }
#ifdef CONFIG_ATOMIC64
    return atomic_read__nocheck(&timers_state.qemu_icount);
#else /* FIXME: we need 64bit atomics to do this safely */
    return timers_state.qemu_icount;
#endif
}

/* Return the virtual CPU time, based on the instruction counter.
 * Caller must hold the seqlock (or otherwise serialize against writers).
 */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

/* Lock-free reader variant: retry until a consistent snapshot is seen.  */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

/* Convert an instruction count to nanoseconds using the current shift.  */
int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}
/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 *
 * Caller must hold the BQL
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non increasing ticks may happen if the host uses
           software suspend */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

/* Monotonic VM clock; caller must hold the seqlock or serialize writers.  */
static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}
/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the really thing protected by seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        /* Record the base so subsequent reads measure elapsed time only.  */
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the really thing protected by seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        /* Freeze both clocks at their current values.  */
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop. */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

/* Periodically re-tune icount_time_shift so virtual time tracks real
 * time: shrink the shift when the guest runs ahead, grow it when the
 * guest lags.  The bias is then recomputed so virtual time is continuous.
 */
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    /* Keep virtual time continuous across the shift change.  */
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* Realtime-driven adjustment: fires every second of VIRTUAL_RT time.  */
static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

/* Virtual-time-driven adjustment: fires ten times per virtual second.  */
static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

/* Round a nanosecond deadline up to a whole number of instructions.  */
static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}
/* Account an elapsed "warp" period into qemu_icount_bias so that
 * QEMU_CLOCK_VIRTUAL jumps forward over idle time.  In adaptive icount
 * mode (use_icount == 2) the warp is capped so virtual time never runs
 * ahead of real time.
 */
static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        /* REPLAY_CLOCK keeps the warp deterministic in record/replay mode.  */
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
/* Advance QEMU_CLOCK_VIRTUAL to @dest for qtest, running every timer
 * that fires along the way.  Only valid when qtest is enabled.
 */
void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        /* Warp to the next timer deadline, or straight to dest if sooner.  */
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_end(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}
/* Arrange for QEMU_CLOCK_VIRTUAL to be warped past idle time.  Called
 * when all vCPUs are about to go idle: either advance virtual time
 * immediately (sleep=off) or schedule icount_warp_timer to do it after
 * the corresponding amount of real time has passed.
 */
void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        /* No pending virtual timer at all: warn once if sleeping is off,
         * since the clock would then never advance.
         */
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time, (related to the time left until the next event) has
             * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This avoids that the warps are visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
/* Cancel a pending warp timer and account whatever warp has accumulated.
 * Called when a vCPU becomes runnable again.
 */
static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}
/* Migrate the icount subsection only when icount mode is active.  */
static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

/* Top-level timer migration state; 'dummy' preserves an obsolete field
 * so the wire format stays compatible with older QEMU versions.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
/* Work item run on a vCPU thread: sleep this thread for a slice derived
 * from the throttle percentage, dropping the BQL while asleep so the
 * rest of QEMU keeps running.
 */
static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
{
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    /* ratio of sleep time to run time, e.g. 50% -> sleep == run slice */
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    /* Clear the flag before sleeping so the tick can schedule us again.  */
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}
/* Periodic timer: queue a throttle sleep on every vCPU that does not
 * already have one pending, then re-arm for the next timeslice.
 */
static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        /* atomic_xchg avoids double-queueing the work item.  */
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread,
                             RUN_ON_CPU_NULL);
        }
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    /* Next tick comes after the *run* portion of the timeslice.  */
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
}
705 void cpu_throttle_set(int new_throttle_pct)
707 /* Ensure throttle percentage is within valid range */
708 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
709 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
711 atomic_set(&throttle_percentage, new_throttle_pct);
713 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
714 CPU_THROTTLE_TIMESLICE_NS);
717 void cpu_throttle_stop(void)
719 atomic_set(&throttle_percentage, 0);
722 bool cpu_throttle_active(void)
724 return (cpu_throttle_get_percentage() != 0);
727 int cpu_throttle_get_percentage(void)
729 return atomic_read(&throttle_percentage);
/* One-time timekeeping initialisation: seqlock, migration registration,
 * and the (initially unarmed) throttle timer.
 */
void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}
/* Parse the -icount option group: shift=<n>|auto, sleep=on|off,
 * align=on|off.  Sets use_icount to 1 (fixed shift) or 2 (adaptive),
 * and in adaptive mode starts the periodic speed-adjustment timers.
 */
void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        /* align without shift makes no sense; everything else is a no-op.  */
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        /* Fixed shift mode: parse the numeric shift value.  */
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                        icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
}
800 /***********************************************************/
801 /* TCG vCPU kick timer
803 * The kick timer is responsible for moving single threaded vCPU
804 * emulation on to the next vCPU. If more than one vCPU is running a
805 * timer event with force a cpu->exit so the next vCPU can get
806 * scheduled.
808 * The timer is removed if all vCPUs are idle and restarted again once
809 * idleness is complete.
812 static QEMUTimer *tcg_kick_vcpu_timer;
813 static CPUState *tcg_current_rr_cpu;
815 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
/* Next deadline for the round-robin kick timer.  */
static inline int64_t qemu_tcg_next_kick(void)
{
    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
}

/* Kick the currently round-robin scheduled vCPU */
static void qemu_cpu_kick_rr_cpu(void)
{
    CPUState *cpu;
    /* Loop until the scheduled vCPU is stable: tcg_current_rr_cpu may
     * change under us while we are issuing the cpu_exit.
     */
    do {
        cpu = atomic_mb_read(&tcg_current_rr_cpu);
        if (cpu) {
            cpu_exit(cpu);
        }
    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
}
/* Empty work item; its only purpose is to wake a sleeping vCPU thread.  */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}

/* Clock-notify hook.  Outside icount (or for non-virtual clocks) a plain
 * main-loop wakeup suffices; with icount the TCG thread must be nudged.
 */
void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
{
    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
        qemu_notify_event();
        return;
    }

    if (!qemu_in_vcpu_thread() && first_cpu) {
        /* qemu_cpu_kick is not enough to kick a halted CPU out of
         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
         * causes cpu_thread_is_idle to return false.  This way,
         * handle_icount_deadline can run.
         */
        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
    }
}
/* Timer callback: re-arm and kick the current round-robin vCPU so the
 * single-threaded TCG loop rotates to the next one.
 */
static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_rr_cpu();
}

/* Create and arm the kick timer; only needed in single-threaded TCG
 * with more than one vCPU.
 */
static void start_tcg_kick_timer(void)
{
    if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           kick_tcg_thread, NULL);
        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    }
}

/* Tear down the kick timer when all vCPUs go idle.  */
static void stop_tcg_kick_timer(void)
{
    if (tcg_kick_vcpu_timer) {
        timer_del(tcg_kick_vcpu_timer);
        tcg_kick_vcpu_timer = NULL;
    }
}
878 /***********************************************************/
/* Fatal hardware-emulation error: print the message and the state of
 * every vCPU to stderr, then abort.  Never returns.
 */
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}
896 void cpu_synchronize_all_states(void)
898 CPUState *cpu;
900 CPU_FOREACH(cpu) {
901 cpu_synchronize_state(cpu);
905 void cpu_synchronize_all_post_reset(void)
907 CPUState *cpu;
909 CPU_FOREACH(cpu) {
910 cpu_synchronize_post_reset(cpu);
914 void cpu_synchronize_all_post_init(void)
916 CPUState *cpu;
918 CPU_FOREACH(cpu) {
919 cpu_synchronize_post_init(cpu);
/* Stop the VM, moving it into @state: pause vCPUs, notify listeners,
 * then drain and flush all block devices.  Returns the flush result.
 */
static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    /* Drain/flush even if we were already stopped, so callers can rely
     * on the disks being quiesced on return.
     */
    bdrv_drain_all();
    replay_disable_events();
    ret = bdrv_flush_all();

    return ret;
}
942 static bool cpu_can_run(CPUState *cpu)
944 if (cpu->stop) {
945 return false;
947 if (cpu_is_stopped(cpu)) {
948 return false;
950 return true;
953 static void cpu_handle_guest_debug(CPUState *cpu)
955 gdb_set_stop_cpu(cpu);
956 qemu_system_debug_request();
957 cpu->stopped = true;
#ifdef CONFIG_LINUX
/* Restore the default SIGBUS disposition and re-raise it so the process
 * dies with the original signal (used when we cannot handle an MCE).
 */
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        /* SIGBUS may be blocked in this thread; unblock so it delivers.  */
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

/* SIGBUS handler: forward hardware memory errors (MCE) to KVM; anything
 * else, or a failure to handle them, is fatal via sigbus_reraise().
 */
static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
{
    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
        sigbus_reraise();
    }

    if (current_cpu) {
        /* Called asynchronously in VCPU thread.  */
        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    } else {
        /* Called synchronously (via signalfd) in main thread.  */
        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    }
}

/* Install the SIGBUS handler and ask the kernel for early MCE delivery.  */
static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
#else /* !CONFIG_LINUX */
/* No MCE handling outside Linux.  */
static void qemu_init_sigbus(void)
{
}
#endif /* !CONFIG_LINUX */
1014 static QemuMutex qemu_global_mutex;
1016 static QemuThread io_thread;
1018 /* cpu creation */
1019 static QemuCond qemu_cpu_cond;
1020 /* system init */
1021 static QemuCond qemu_pause_cond;
/* One-time initialisation of the vCPU loop machinery: signal handling,
 * condition variables, the BQL, and recording the I/O thread identity.
 */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

/* Run @func on @cpu's thread, synchronously, dropping the BQL while
 * waiting for completion.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}

/* Destroy a KVM vCPU; failure here is unrecoverable.  */
static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

/* TCG vCPUs need no accelerator-side teardown.  */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
/* Post-wakeup bookkeeping common to all vCPU loops: acknowledge stop
 * requests and drain any queued cross-CPU work.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        /* Wake anyone in pause_all_vcpus() waiting for us to stop.  */
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
}

/* MTTCG threads sleep on their own idleness; the single-threaded loop
 * only sleeps when every vCPU is idle.
 */
static bool qemu_tcg_should_sleep(CPUState *cpu)
{
    if (mttcg_enabled) {
        return cpu_thread_is_idle(cpu);
    } else {
        return all_cpu_threads_idle();
    }
}

/* Sleep a TCG vCPU thread until there is work, pausing the kick timer
 * while asleep so it does not fire pointlessly.
 */
static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (qemu_tcg_should_sleep(cpu)) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    qemu_wait_io_event_common(cpu);
}

/* Sleep a KVM vCPU thread until it has something to do.  */
static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_wait_io_event_common(cpu);
}
/* Thread function for a KVM vCPU: initialise the in-kernel vCPU, signal
 * creation, then loop running guest code until the vCPU is unplugged.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged: tear down the vCPU and announce its destruction.  */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}
/* Thread function for the qtest "dummy" vCPU: never executes guest code,
 * just waits for IPIs and services queued work.  Not available on Windows
 * because it relies on POSIX signal handling (sigwait).
 */
static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        /* Drop the BQL while blocked in sigwait so the rest of QEMU runs.  */
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}
/* Compute the instruction budget for the next TCG execution slice: the
 * number of instructions until the next QEMU_CLOCK_VIRTUAL deadline, or
 * the replay-log budget when replaying.
 */
static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}
/* Run QEMU_CLOCK_VIRTUAL timers whose deadline has arrived; icount only.
 * Must run on a vCPU thread so the icount read is deterministic.
 */
static void handle_icount_deadline(void)
{
    assert(qemu_in_vcpu_thread());
    if (use_icount) {
        int64_t deadline =
            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        if (deadline == 0) {
            /* Wake up other AioContexts.  */
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        }
    }
}

/* Load the per-vCPU icount budget for the next execution slice, split
 * between the 16-bit decrementer (u16.low) and icount_extra overflow.
 */
static void prepare_icount_for_run(CPUState *cpu)
{
    if (use_icount) {
        int insns_left;

        /* These should always be cleared by process_icount_data after
         * each vCPU execution.  However u16.high can be raised
         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
         */
        g_assert(cpu->icount_decr.u16.low == 0);
        g_assert(cpu->icount_extra == 0);

        cpu->icount_budget = tcg_get_icount_limit();
        insns_left = MIN(0xffff, cpu->icount_budget);
        cpu->icount_decr.u16.low = insns_left;
        cpu->icount_extra = cpu->icount_budget - insns_left;
    }
}

/* Fold the just-executed slice into the global icount and clear the
 * per-vCPU counters so the next prepare starts from a clean state.
 */
static void process_icount_data(CPUState *cpu)
{
    if (use_icount) {
        /* Account for executed instructions */
        cpu_update_icount(cpu);

        /* Reset the counters */
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        cpu->icount_budget = 0;

        replay_account_executed_instructions();
    }
}
/* Execute one TCG slice for @cpu and return the cpu_exec() exit code
 * (EXCP_DEBUG, EXCP_HALTED, EXCP_ATOMIC, ...).
 *
 * The BQL is dropped around cpu_exec() so that guest code runs without
 * holding the iothread lock, and re-taken before returning to the caller.
 * With CONFIG_PROFILER the elapsed wall-clock time is accumulated into
 * tcg_time.
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    return ret;
}
1269 /* Destroy any remaining vCPUs which have been unplugged and have
1270 * finished running
1272 static void deal_with_unplugged_cpus(void)
1274 CPUState *cpu;
1276 CPU_FOREACH(cpu) {
1277 if (cpu->unplug && !cpu_can_run(cpu)) {
1278 qemu_tcg_destroy_vcpu(cpu);
1279 cpu->created = false;
1280 qemu_cond_signal(&qemu_cpu_cond);
1281 break;
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    /* This single thread services every vCPU, so publish its thread id
     * and creation state for all of them at once.
     */
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        /* Restart the round-robin from the beginning if the previous
         * iteration ran off the end of the CPU list.
         */
        if (!cpu) {
            cpu = first_cpu;
        }

        /* Round-robin over the vCPUs until one asks to exit or has
         * queued work to process.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    /* Atomic steps are executed with the BQL dropped. */
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past an unplugged vCPU so it is never selected
                 * again; deal_with_unplugged_cpus() destroys it below.
                 */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}
/* Per-vCPU thread body for the HAX accelerator: initialise the vCPU,
 * signal creation, then loop executing guest code and sleeping while
 * idle.  On Windows, SleepEx(0, TRUE) makes the thread alertable so
 * the dummy APC queued by qemu_cpu_kick_thread() can interrupt it.
 */
static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->halted = 0;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
#ifdef _WIN32
        SleepEx(0, TRUE);
#endif
        qemu_wait_io_event_common(cpu);
    }
    return NULL;
}
#ifdef _WIN32
/* Empty APC routine: queued via QueueUserAPC() in qemu_cpu_kick_thread()
 * purely to interrupt an alertable wait (SleepEx) in a vCPU thread.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif
1433 /* Multi-threaded TCG
1435 * In the multi-threaded case each vCPU has its own thread. The TLS
1436 * variable current_cpu can be used deep in the code to find the
1437 * current CPUState for a given thread.
1440 static void *qemu_tcg_cpu_thread_fn(void *arg)
1442 CPUState *cpu = arg;
1444 g_assert(!use_icount);
1446 rcu_register_thread();
1448 qemu_mutex_lock_iothread();
1449 qemu_thread_get_self(cpu->thread);
1451 cpu->thread_id = qemu_get_thread_id();
1452 cpu->created = true;
1453 cpu->can_do_io = 1;
1454 current_cpu = cpu;
1455 qemu_cond_signal(&qemu_cpu_cond);
1457 /* process any pending work */
1458 cpu->exit_request = 1;
1460 while (1) {
1461 if (cpu_can_run(cpu)) {
1462 int r;
1463 r = tcg_cpu_exec(cpu);
1464 switch (r) {
1465 case EXCP_DEBUG:
1466 cpu_handle_guest_debug(cpu);
1467 break;
1468 case EXCP_HALTED:
1469 /* during start-up the vCPU is reset and the thread is
1470 * kicked several times. If we don't ensure we go back
1471 * to sleep in the halted state we won't cleanly
1472 * start-up when the vCPU is enabled.
1474 * cpu->halted should ensure we sleep in wait_io_event
1476 g_assert(cpu->halted);
1477 break;
1478 case EXCP_ATOMIC:
1479 qemu_mutex_unlock_iothread();
1480 cpu_exec_step_atomic(cpu);
1481 qemu_mutex_lock_iothread();
1482 default:
1483 /* Ignore everything else? */
1484 break;
1488 atomic_mb_set(&cpu->exit_request, 0);
1489 qemu_tcg_wait_io_event(cpu);
1492 return NULL;
/* Interrupt a vCPU thread so it re-examines its state.
 *
 * POSIX: send SIG_IPI via pthread_kill(); thread_kicked debounces
 * repeated kicks until the target acknowledges the signal.
 * Windows: queue a no-op APC (dummy_apc_func) to break the target out
 * of an alertable wait; nothing to do when kicking ourselves.
 */
static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
                    __func__, GetLastError());
            exit(1);
        }
    }
#endif
}
/* Wake @cpu so it notices pending work or a state change: broadcast its
 * halt condition, then use the accelerator-appropriate kick mechanism.
 */
void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        cpu_exit(cpu);
        /* NOP unless doing single-thread RR */
        qemu_cpu_kick_rr_cpu();
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}
/* Kick the vCPU thread we are currently running on.  Must be called
 * from a vCPU thread (current_cpu set).
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
/* Return true if the calling thread is the thread that runs @cpu. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
/* Return true if the calling thread is a vCPU thread (i.e. current_cpu
 * is set and belongs to this thread).
 */
bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}
/* Per-thread flag recording whether this thread currently holds the
 * BQL (qemu_global_mutex); read back via qemu_mutex_iothread_locked().
 */
static __thread bool iothread_locked = false;
/* Return true if the calling thread holds the BQL. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
/* Acquire the BQL.  Asserts the caller does not already hold it
 * (the lock is not recursive).  The per-thread flag is set after the
 * lock is taken; safe because the flag is thread-local.
 */
void qemu_mutex_lock_iothread(void)
{
    g_assert(!qemu_mutex_iothread_locked());
    qemu_mutex_lock(&qemu_global_mutex);
    iothread_locked = true;
}
/* Release the BQL.  Asserts the caller holds it.  The per-thread flag
 * is cleared before the mutex is dropped, mirroring the lock path.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1576 static bool all_vcpus_paused(void)
1578 CPUState *cpu;
1580 CPU_FOREACH(cpu) {
1581 if (!cpu->stopped) {
1582 return false;
1586 return true;
/* Request every vCPU to stop and wait until all have actually paused.
 * If called from a vCPU thread, that vCPU stops itself first so the
 * wait below cannot deadlock on it.  vCPUs are re-kicked on every
 * wakeup of qemu_pause_cond in case a kick was missed.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}
/* Clear a single vCPU's stop/stopped state and kick it back to life. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1618 void resume_all_vcpus(void)
1620 CPUState *cpu;
1622 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1623 CPU_FOREACH(cpu) {
1624 cpu_resume(cpu);
/* Asynchronously request removal of @cpu: flag it stopped + unplugged
 * and kick it.  The vCPU thread performs the actual teardown.
 */
void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}
/* Synchronous variant of cpu_remove(): block on qemu_cpu_cond until the
 * vCPU thread has torn the vCPU down (cpu->created goes false).
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
/* For temporary buffers for forming a name (e.g. "CPU 0/KVM");
 * snprintf truncates anything longer.
 */
#define VCPU_THREAD_NAME_SIZE 16
/* Set up the execution thread for a TCG vCPU.
 *
 * MTTCG: every vCPU gets its own thread (qemu_tcg_cpu_thread_fn).
 * Single-threaded RR: the first vCPU creates the shared thread
 * (qemu_tcg_rr_cpu_thread_fn); later vCPUs reuse it via the
 * function-static single_tcg_* variables.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                     cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember the shared thread for subsequent vCPUs. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        /* Wait until the thread has signalled creation of the vCPU. */
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
    }
}
1689 static void qemu_hax_start_vcpu(CPUState *cpu)
1691 char thread_name[VCPU_THREAD_NAME_SIZE];
1693 cpu->thread = g_malloc0(sizeof(QemuThread));
1694 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1695 qemu_cond_init(cpu->halt_cond);
1697 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1698 cpu->cpu_index);
1699 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1700 cpu, QEMU_THREAD_JOINABLE);
1701 #ifdef _WIN32
1702 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1703 #endif
1704 while (!cpu->created) {
1705 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1709 static void qemu_kvm_start_vcpu(CPUState *cpu)
1711 char thread_name[VCPU_THREAD_NAME_SIZE];
1713 cpu->thread = g_malloc0(sizeof(QemuThread));
1714 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1715 qemu_cond_init(cpu->halt_cond);
1716 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1717 cpu->cpu_index);
1718 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1719 cpu, QEMU_THREAD_JOINABLE);
1720 while (!cpu->created) {
1721 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1725 static void qemu_dummy_start_vcpu(CPUState *cpu)
1727 char thread_name[VCPU_THREAD_NAME_SIZE];
1729 cpu->thread = g_malloc0(sizeof(QemuThread));
1730 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1731 qemu_cond_init(cpu->halt_cond);
1732 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1733 cpu->cpu_index);
1734 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1735 QEMU_THREAD_JOINABLE);
1736 while (!cpu->created) {
1737 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
/* Common vCPU bring-up: record topology, give the vCPU a default
 * address space if its target did not set one up, and dispatch to the
 * accelerator-specific start routine.  The vCPU starts in the stopped
 * state and runs once resumed (see vm_start/resume_all_vcpus).
 */
void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}
/* Stop the vCPU this thread is running (no-op outside a vCPU thread):
 * mark it stopped, exit its execution loop, and wake anyone blocked in
 * pause_all_vcpus() on qemu_pause_cond.
 */
void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}
/* Stop the VM, recording @state as the reason.  When called from a vCPU
 * thread the stop is requested asynchronously (and this vCPU halts
 * itself); the main loop completes it via do_vm_stop() later.
 */
int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}
/**
 * Prepare for (re)starting the VM.
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;
    int res = 0;

    qemu_vmstop_requested(&requested);
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        /* Already running and no stop was pending: nothing to do. */
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop(&error_abort);
        res = -1;
    } else {
        replay_enable_events();
        cpu_enable_ticks();
        runstate_set(RUN_STATE_RUNNING);
        vm_state_notify(1, RUN_STATE_RUNNING);
    }

    /* We are sending this now, but the CPUs will be resumed shortly later */
    qapi_event_send_resume(&error_abort);
    return res;
}
/* Start (or restart) the VM: resume the vCPUs only if vm_prepare_start()
 * returned 0 (i.e. the run state transition actually happened).
 */
void vm_start(void)
{
    if (!vm_prepare_start()) {
        resume_all_vcpus();
    }
}
/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}
1852 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1854 /* XXX: implement xxx_cpu_list for targets that still miss it */
1855 #if defined(cpu_list)
1856 cpu_list(f, cpu_fprintf);
1857 #endif
/* QMP 'query-cpus' handler: build a CpuInfoList describing every vCPU.
 * Each entry carries index, halted state, QOM path, thread id, plus an
 * architecture-specific program counter selected by the TARGET_*
 * compile-time defines below.  Caller owns the returned list.
 */
CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        /* Pull the register state out of the accelerator so the PC
         * values below are current.
         */
        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
1925 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1926 bool has_cpu, int64_t cpu_index, Error **errp)
1928 FILE *f;
1929 uint32_t l;
1930 CPUState *cpu;
1931 uint8_t buf[1024];
1932 int64_t orig_addr = addr, orig_size = size;
1934 if (!has_cpu) {
1935 cpu_index = 0;
1938 cpu = qemu_get_cpu(cpu_index);
1939 if (cpu == NULL) {
1940 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1941 "a CPU number");
1942 return;
1945 f = fopen(filename, "wb");
1946 if (!f) {
1947 error_setg_file_open(errp, errno, filename);
1948 return;
1951 while (size != 0) {
1952 l = sizeof(buf);
1953 if (l > size)
1954 l = size;
1955 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1956 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1957 " specified", orig_addr, orig_size);
1958 goto exit;
1960 if (fwrite(buf, 1, l, f) != l) {
1961 error_setg(errp, QERR_IO_ERROR);
1962 goto exit;
1964 addr += l;
1965 size -= l;
1968 exit:
1969 fclose(f);
1972 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1973 Error **errp)
1975 FILE *f;
1976 uint32_t l;
1977 uint8_t buf[1024];
1979 f = fopen(filename, "wb");
1980 if (!f) {
1981 error_setg_file_open(errp, errno, filename);
1982 return;
1985 while (size != 0) {
1986 l = sizeof(buf);
1987 if (l > size)
1988 l = size;
1989 cpu_physical_memory_read(addr, buf, l);
1990 if (fwrite(buf, 1, l, f) != l) {
1991 error_setg(errp, QERR_IO_ERROR);
1992 goto exit;
1994 addr += l;
1995 size -= l;
1998 exit:
1999 fclose(f);
/* QMP 'inject-nmi' handler: deliver an NMI via the monitor's currently
 * selected CPU.
 */
void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}
/* Print host-vs-guest clock drift statistics (icount mode only).
 * max_delay/max_advance are only tracked with -icount align, hence the
 * "NA" branch otherwise.
 */
void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
    if (icount_align_option) {
        cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
        cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
    } else {
        cpu_fprintf(f, "Max guest delay NA\n");
        cpu_fprintf(f, "Max guest advance NA\n");