cpus: introduce cpu_update_icount helper
[qemu/kevin.git] / cpus.c
blob a5125d7167881ae84bae518ee68475de6e7ec344
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
54 #ifdef CONFIG_LINUX
56 #include <sys/prctl.h>
58 #ifndef PR_MCE_KILL
59 #define PR_MCE_KILL 33
60 #endif
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
64 #endif
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
68 #endif
70 #endif /* CONFIG_LINUX */
72 int64_t max_delay;
73 int64_t max_advance;
75 /* vcpu throttling controls */
76 static QEMUTimer *throttle_timer;
77 static unsigned int throttle_percentage;
79 #define CPU_THROTTLE_PCT_MIN 1
80 #define CPU_THROTTLE_PCT_MAX 99
81 #define CPU_THROTTLE_TIMESLICE_NS 10000000
83 bool cpu_is_stopped(CPUState *cpu)
85 return cpu->stopped || !runstate_is_running();
88 static bool cpu_thread_is_idle(CPUState *cpu)
90 if (cpu->stop || cpu->queued_work_first) {
91 return false;
93 if (cpu_is_stopped(cpu)) {
94 return true;
96 if (!cpu->halted || cpu_has_work(cpu) ||
97 kvm_halt_in_kernel()) {
98 return false;
100 return true;
103 static bool all_cpu_threads_idle(void)
105 CPUState *cpu;
107 CPU_FOREACH(cpu) {
108 if (!cpu_thread_is_idle(cpu)) {
109 return false;
112 return true;
115 /***********************************************************/
116 /* guest cycle counter */
118 /* Protected by TimersState seqlock */
120 static bool icount_sleep = true;
121 static int64_t vm_clock_warp_start = -1;
122 /* Conversion factor from emulated instructions to virtual clock ticks. */
123 static int icount_time_shift;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
127 static QEMUTimer *icount_rt_timer;
128 static QEMUTimer *icount_vm_timer;
129 static QEMUTimer *icount_warp_timer;
131 typedef struct TimersState {
132 /* Protected by BQL. */
133 int64_t cpu_ticks_prev;
134 int64_t cpu_ticks_offset;
136 /* cpu_clock_offset can be read out of BQL, so protect it with
137 * this lock.
139 QemuSeqLock vm_clock_seqlock;
140 int64_t cpu_clock_offset;
141 int32_t cpu_ticks_enabled;
142 int64_t dummy;
144 /* Compensate for varying guest execution speed. */
145 int64_t qemu_icount_bias;
146 /* Only written by TCG thread */
147 int64_t qemu_icount;
148 } TimersState;
150 static TimersState timers_state;
151 bool mttcg_enabled;
154  * We default to false if we know other options have been enabled
155  * which are currently incompatible with MTTCG. Otherwise, once a
156  * guest (target) has been updated to support:
157  *  - atomic instructions
158  *  - memory ordering primitives (barriers)
159  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
161 * Once a guest architecture has been converted to the new primitives
162 * there are two remaining limitations to check.
164 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
165 * - The host must have a stronger memory order than the guest
167 * It may be possible in future to support strong guests on weak hosts
168 * but that will require tagging all load/stores in a guest with their
169 * implicit memory order requirements which would likely slow things
170 * down a lot.
173 static bool check_tcg_memory_orders_compatible(void)
175 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
176 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
177 #else
178 return false;
179 #endif
182 static bool default_mttcg_enabled(void)
184 if (use_icount || TCG_OVERSIZED_GUEST) {
185 return false;
186 } else {
187 #ifdef TARGET_SUPPORTS_MTTCG
188 return check_tcg_memory_orders_compatible();
189 #else
190 return false;
191 #endif
195 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
197 const char *t = qemu_opt_get(opts, "thread");
198 if (t) {
199 if (strcmp(t, "multi") == 0) {
200 if (TCG_OVERSIZED_GUEST) {
201                 error_setg(errp, "No MTTCG when guest word size > host word size");
202 } else if (use_icount) {
203 error_setg(errp, "No MTTCG when icount is enabled");
204 } else {
205 #ifndef TARGET_SUPPORTS_MTTCG
206 error_report("Guest not yet converted to MTTCG - "
207 "you may get unexpected results");
208 #endif
209 if (!check_tcg_memory_orders_compatible()) {
210 error_report("Guest expects a stronger memory ordering "
211 "than the host provides");
212 error_printf("This may cause strange/hard to debug errors\n");
214 mttcg_enabled = true;
216 } else if (strcmp(t, "single") == 0) {
217 mttcg_enabled = false;
218 } else {
219 error_setg(errp, "Invalid 'thread' setting %s", t);
221 } else {
222 mttcg_enabled = default_mttcg_enabled();
226 /* The current number of executed instructions is based on what we
227 * originally budgeted minus the current state of the decrementing
228 * icount counters in extra/u16.low.
230 static int64_t cpu_get_icount_executed(CPUState *cpu)
232 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
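/* Illustrative numbers (not from the source): if a vCPU was budgeted
 * 100000 instructions, prepare_icount_for_run() below loads u16.low with
 * 0xffff (65535) and icount_extra with 34465.  After roughly 50000
 * instructions have run, u16.low has counted down to 15535, so
 * 100000 - (15535 + 34465) = 50000 executed instructions are reported. */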
236 * Update the global shared timer_state.qemu_icount to take into
237 * account executed instructions. This is done by the TCG vCPU
238 * thread so the main-loop can see time has moved forward.
240 void cpu_update_icount(CPUState *cpu)
242 int64_t executed = cpu_get_icount_executed(cpu);
243 cpu->icount_budget -= executed;
245 #ifdef CONFIG_ATOMIC64
246 atomic_set__nocheck(&timers_state.qemu_icount,
247 atomic_read__nocheck(&timers_state.qemu_icount) +
248 executed);
249 #else /* FIXME: we need 64bit atomics to do this safely */
250 timers_state.qemu_icount += executed;
251 #endif
254 int64_t cpu_get_icount_raw(void)
256 int64_t icount;
257 CPUState *cpu = current_cpu;
259 icount = atomic_read(&timers_state.qemu_icount);
260 if (cpu && cpu->running) {
261 if (!cpu->can_do_io) {
262 fprintf(stderr, "Bad icount read\n");
263 exit(1);
265 /* Take into account what has run */
266 icount += cpu_get_icount_executed(cpu);
268 return icount;
271 /* Return the virtual CPU time, based on the instruction counter. */
272 static int64_t cpu_get_icount_locked(void)
274 int64_t icount = cpu_get_icount_raw();
275 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
278 int64_t cpu_get_icount(void)
280 int64_t icount;
281 unsigned start;
283 do {
284 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
285 icount = cpu_get_icount_locked();
286 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
288 return icount;
291 int64_t cpu_icount_to_ns(int64_t icount)
293 return icount << icount_time_shift;
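/* For example, with the default auto setting used later in this file,
 * icount_time_shift = 3 maps one instruction to 1 << 3 = 8 ns of virtual
 * time, i.e. a nominal guest speed of about 125 MIPS. */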
296 /* return the time elapsed in VM between vm_start and vm_stop. Unless
297 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
298 * counter.
300 * Caller must hold the BQL
302 int64_t cpu_get_ticks(void)
304 int64_t ticks;
306 if (use_icount) {
307 return cpu_get_icount();
310 ticks = timers_state.cpu_ticks_offset;
311 if (timers_state.cpu_ticks_enabled) {
312 ticks += cpu_get_host_ticks();
315 if (timers_state.cpu_ticks_prev > ticks) {
316     /* Note: non-increasing ticks may happen if the host uses
317 software suspend */
318 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
319 ticks = timers_state.cpu_ticks_prev;
322 timers_state.cpu_ticks_prev = ticks;
323 return ticks;
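/* Worked example of the compensation above (illustrative values): if
 * cpu_ticks_prev was 1000 and the host counter jumps back so that ticks
 * computes to 900 (e.g. after a software suspend), cpu_ticks_offset is
 * increased by 100 and 1000 is returned again, keeping the value
 * monotonic. */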
326 static int64_t cpu_get_clock_locked(void)
328 int64_t time;
330 time = timers_state.cpu_clock_offset;
331 if (timers_state.cpu_ticks_enabled) {
332 time += get_clock();
335 return time;
338 /* Return the monotonic time elapsed in VM, i.e.,
339 * the time between vm_start and vm_stop
341 int64_t cpu_get_clock(void)
343 int64_t ti;
344 unsigned start;
346 do {
347 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
348 ti = cpu_get_clock_locked();
349 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
351 return ti;
354 /* enable cpu_get_ticks()
355 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
357 void cpu_enable_ticks(void)
359     /* Here, what is really protected by the seqlock is cpu_clock_offset. */
360 seqlock_write_begin(&timers_state.vm_clock_seqlock);
361 if (!timers_state.cpu_ticks_enabled) {
362 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
363 timers_state.cpu_clock_offset -= get_clock();
364 timers_state.cpu_ticks_enabled = 1;
366 seqlock_write_end(&timers_state.vm_clock_seqlock);
369 /* disable cpu_get_ticks() : the clock is stopped. You must not call
370 * cpu_get_ticks() after that.
371 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
373 void cpu_disable_ticks(void)
375     /* Here, what is really protected by the seqlock is cpu_clock_offset. */
376 seqlock_write_begin(&timers_state.vm_clock_seqlock);
377 if (timers_state.cpu_ticks_enabled) {
378 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
379 timers_state.cpu_clock_offset = cpu_get_clock_locked();
380 timers_state.cpu_ticks_enabled = 0;
382 seqlock_write_end(&timers_state.vm_clock_seqlock);
385 /* Correlation between real and virtual time is always going to be
386 fairly approximate, so ignore small variation.
387 When the guest is idle real and virtual time will be aligned in
388 the IO wait loop. */
389 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
391 static void icount_adjust(void)
393 int64_t cur_time;
394 int64_t cur_icount;
395 int64_t delta;
397 /* Protected by TimersState mutex. */
398 static int64_t last_delta;
400 /* If the VM is not running, then do nothing. */
401 if (!runstate_is_running()) {
402 return;
405 seqlock_write_begin(&timers_state.vm_clock_seqlock);
406 cur_time = cpu_get_clock_locked();
407 cur_icount = cpu_get_icount_locked();
409 delta = cur_icount - cur_time;
410 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
411 if (delta > 0
412 && last_delta + ICOUNT_WOBBLE < delta * 2
413 && icount_time_shift > 0) {
414 /* The guest is getting too far ahead. Slow time down. */
415 icount_time_shift--;
417 if (delta < 0
418 && last_delta - ICOUNT_WOBBLE > delta * 2
419 && icount_time_shift < MAX_ICOUNT_SHIFT) {
420 /* The guest is getting too far behind. Speed time up. */
421 icount_time_shift++;
423 last_delta = delta;
424 timers_state.qemu_icount_bias = cur_icount
425 - (timers_state.qemu_icount << icount_time_shift);
426 seqlock_write_end(&timers_state.vm_clock_seqlock);
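/* In other words: if virtual time (cur_icount) has drifted ahead of real
 * time by more than the allowed wobble, the shift is decremented so each
 * instruction accounts for fewer nanoseconds; if it lags behind, the shift
 * is incremented (up to MAX_ICOUNT_SHIFT).  The bias is then recomputed so
 * the adjustment does not make the virtual clock jump. */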
429 static void icount_adjust_rt(void *opaque)
431 timer_mod(icount_rt_timer,
432 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
433 icount_adjust();
436 static void icount_adjust_vm(void *opaque)
438 timer_mod(icount_vm_timer,
439 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
440 NANOSECONDS_PER_SECOND / 10);
441 icount_adjust();
444 static int64_t qemu_icount_round(int64_t count)
446 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
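/* e.g. with icount_time_shift = 3 a 20 ns deadline rounds up to
 * (20 + 7) >> 3 = 3 instructions, so the vCPU overshoots the next
 * QEMU_CLOCK_VIRTUAL timer by less than one instruction's worth of
 * virtual time. */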
449 static void icount_warp_rt(void)
451 unsigned seq;
452 int64_t warp_start;
454 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
455 * changes from -1 to another value, so the race here is okay.
457 do {
458 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
459 warp_start = vm_clock_warp_start;
460 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
462 if (warp_start == -1) {
463 return;
466 seqlock_write_begin(&timers_state.vm_clock_seqlock);
467 if (runstate_is_running()) {
468 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
469 cpu_get_clock_locked());
470 int64_t warp_delta;
472 warp_delta = clock - vm_clock_warp_start;
473 if (use_icount == 2) {
475 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
476 * far ahead of real time.
478 int64_t cur_icount = cpu_get_icount_locked();
479 int64_t delta = clock - cur_icount;
480 warp_delta = MIN(warp_delta, delta);
482 timers_state.qemu_icount_bias += warp_delta;
484 vm_clock_warp_start = -1;
485 seqlock_write_end(&timers_state.vm_clock_seqlock);
487 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
488 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
492 static void icount_timer_cb(void *opaque)
494 /* No need for a checkpoint because the timer already synchronizes
495 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
497 icount_warp_rt();
500 void qtest_clock_warp(int64_t dest)
502 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
503 AioContext *aio_context;
504 assert(qtest_enabled());
505 aio_context = qemu_get_aio_context();
506 while (clock < dest) {
507 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
508 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
510 seqlock_write_begin(&timers_state.vm_clock_seqlock);
511 timers_state.qemu_icount_bias += warp;
512 seqlock_write_end(&timers_state.vm_clock_seqlock);
514 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
515 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
516 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
518 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
521 void qemu_start_warp_timer(void)
523 int64_t clock;
524 int64_t deadline;
526 if (!use_icount) {
527 return;
530 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
531 * do not fire, so computing the deadline does not make sense.
533 if (!runstate_is_running()) {
534 return;
537 /* warp clock deterministically in record/replay mode */
538 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
539 return;
542 if (!all_cpu_threads_idle()) {
543 return;
546 if (qtest_enabled()) {
547 /* When testing, qtest commands advance icount. */
548 return;
551 /* We want to use the earliest deadline from ALL vm_clocks */
552 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
553 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
554 if (deadline < 0) {
555 static bool notified;
556 if (!icount_sleep && !notified) {
557 error_report("WARNING: icount sleep disabled and no active timers");
558 notified = true;
560 return;
563 if (deadline > 0) {
565 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
566 * sleep. Otherwise, the CPU might be waiting for a future timer
567 * interrupt to wake it up, but the interrupt never comes because
568 * the vCPU isn't running any insns and thus doesn't advance the
569 * QEMU_CLOCK_VIRTUAL.
571 if (!icount_sleep) {
573              * We never let VCPUs sleep in no-sleep icount mode.
574 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
575 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
576 * It is useful when we want a deterministic execution time,
577 * isolated from host latencies.
579 seqlock_write_begin(&timers_state.vm_clock_seqlock);
580 timers_state.qemu_icount_bias += deadline;
581 seqlock_write_end(&timers_state.vm_clock_seqlock);
582 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
583 } else {
585 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
586 * "real" time, (related to the time left until the next event) has
587 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
588              * This prevents the warps from being visible externally; for example,
589 * you will not be sending network packets continuously instead of
590 * every 100ms.
592 seqlock_write_begin(&timers_state.vm_clock_seqlock);
593 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
594 vm_clock_warp_start = clock;
596 seqlock_write_end(&timers_state.vm_clock_seqlock);
597 timer_mod_anticipate(icount_warp_timer, clock + deadline);
599 } else if (deadline == 0) {
600 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
604 static void qemu_account_warp_timer(void)
606 if (!use_icount || !icount_sleep) {
607 return;
610 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
611 * do not fire, so computing the deadline does not make sense.
613 if (!runstate_is_running()) {
614 return;
617 /* warp clock deterministically in record/replay mode */
618 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
619 return;
622 timer_del(icount_warp_timer);
623 icount_warp_rt();
626 static bool icount_state_needed(void *opaque)
628 return use_icount;
632 * This is a subsection for icount migration.
634 static const VMStateDescription icount_vmstate_timers = {
635 .name = "timer/icount",
636 .version_id = 1,
637 .minimum_version_id = 1,
638 .needed = icount_state_needed,
639 .fields = (VMStateField[]) {
640 VMSTATE_INT64(qemu_icount_bias, TimersState),
641 VMSTATE_INT64(qemu_icount, TimersState),
642 VMSTATE_END_OF_LIST()
646 static const VMStateDescription vmstate_timers = {
647 .name = "timer",
648 .version_id = 2,
649 .minimum_version_id = 1,
650 .fields = (VMStateField[]) {
651 VMSTATE_INT64(cpu_ticks_offset, TimersState),
652 VMSTATE_INT64(dummy, TimersState),
653 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
654 VMSTATE_END_OF_LIST()
656 .subsections = (const VMStateDescription*[]) {
657 &icount_vmstate_timers,
658 NULL
662 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
664 double pct;
665 double throttle_ratio;
666 long sleeptime_ns;
668 if (!cpu_throttle_get_percentage()) {
669 return;
672 pct = (double)cpu_throttle_get_percentage()/100;
673 throttle_ratio = pct / (1 - pct);
674 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
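    /* Illustrative arithmetic: at a 50% throttle pct = 0.5, so
     * throttle_ratio = 1 and the vCPU sleeps for one 10 ms timeslice for
     * every 10 ms of run time; at 99% it sleeps roughly 990 ms per
     * timeslice. */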
676 qemu_mutex_unlock_iothread();
677 atomic_set(&cpu->throttle_thread_scheduled, 0);
678 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
679 qemu_mutex_lock_iothread();
682 static void cpu_throttle_timer_tick(void *opaque)
684 CPUState *cpu;
685 double pct;
687 /* Stop the timer if needed */
688 if (!cpu_throttle_get_percentage()) {
689 return;
691 CPU_FOREACH(cpu) {
692 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
693 async_run_on_cpu(cpu, cpu_throttle_thread,
694 RUN_ON_CPU_NULL);
698 pct = (double)cpu_throttle_get_percentage()/100;
699 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
700 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
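    /* The timer period grows with the throttle so that run time plus the
     * sleep injected above stay balanced: e.g. at 50% the tick fires every
     * 10 ms / 0.5 = 20 ms, matching the 10 ms of sleep per timeslice. */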
703 void cpu_throttle_set(int new_throttle_pct)
705 /* Ensure throttle percentage is within valid range */
706 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
707 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
709 atomic_set(&throttle_percentage, new_throttle_pct);
711 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
712 CPU_THROTTLE_TIMESLICE_NS);
715 void cpu_throttle_stop(void)
717 atomic_set(&throttle_percentage, 0);
720 bool cpu_throttle_active(void)
722 return (cpu_throttle_get_percentage() != 0);
725 int cpu_throttle_get_percentage(void)
727 return atomic_read(&throttle_percentage);
730 void cpu_ticks_init(void)
732 seqlock_init(&timers_state.vm_clock_seqlock);
733 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
734 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
735 cpu_throttle_timer_tick, NULL);
738 void configure_icount(QemuOpts *opts, Error **errp)
740 const char *option;
741 char *rem_str = NULL;
743 option = qemu_opt_get(opts, "shift");
744 if (!option) {
745 if (qemu_opt_get(opts, "align") != NULL) {
746 error_setg(errp, "Please specify shift option when using align");
748 return;
751 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
752 if (icount_sleep) {
753 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
754 icount_timer_cb, NULL);
757 icount_align_option = qemu_opt_get_bool(opts, "align", false);
759 if (icount_align_option && !icount_sleep) {
760 error_setg(errp, "align=on and sleep=off are incompatible");
762 if (strcmp(option, "auto") != 0) {
763 errno = 0;
764 icount_time_shift = strtol(option, &rem_str, 0);
765 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
766 error_setg(errp, "icount: Invalid shift value");
768 use_icount = 1;
769 return;
770 } else if (icount_align_option) {
771 error_setg(errp, "shift=auto and align=on are incompatible");
772 } else if (!icount_sleep) {
773 error_setg(errp, "shift=auto and sleep=off are incompatible");
776 use_icount = 2;
778 /* 125MIPS seems a reasonable initial guess at the guest speed.
779 It will be corrected fairly quickly anyway. */
780 icount_time_shift = 3;
782 /* Have both realtime and virtual time triggers for speed adjustment.
783 The realtime trigger catches emulated time passing too slowly,
784 the virtual time trigger catches emulated time passing too fast.
785 Realtime triggers occur even when idle, so use them less frequently
786 than VM triggers. */
787 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
788 icount_adjust_rt, NULL);
789 timer_mod(icount_rt_timer,
790 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
791 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
792 icount_adjust_vm, NULL);
793 timer_mod(icount_vm_timer,
794 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
795 NANOSECONDS_PER_SECOND / 10);
798 /***********************************************************/
799 /* TCG vCPU kick timer
801 * The kick timer is responsible for moving single threaded vCPU
802 * emulation on to the next vCPU. If more than one vCPU is running a
803  * timer event will force a cpu->exit so the next vCPU can get
804 * scheduled.
806  * The timer is removed while all vCPUs are idle and restarted again
807  * once a vCPU becomes runnable.
810 static QEMUTimer *tcg_kick_vcpu_timer;
811 static CPUState *tcg_current_rr_cpu;
813 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
815 static inline int64_t qemu_tcg_next_kick(void)
817 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
820 /* Kick the currently round-robin scheduled vCPU */
821 static void qemu_cpu_kick_rr_cpu(void)
823 CPUState *cpu;
824 do {
825 cpu = atomic_mb_read(&tcg_current_rr_cpu);
826 if (cpu) {
827 cpu_exit(cpu);
829 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
832 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
836 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
838 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
839 qemu_notify_event();
840 return;
843 if (!qemu_in_vcpu_thread() && first_cpu) {
844 /* qemu_cpu_kick is not enough to kick a halted CPU out of
845 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
846 * causes cpu_thread_is_idle to return false. This way,
847 * handle_icount_deadline can run.
849 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
853 static void kick_tcg_thread(void *opaque)
855 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
856 qemu_cpu_kick_rr_cpu();
859 static void start_tcg_kick_timer(void)
861 if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
862 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
863 kick_tcg_thread, NULL);
864 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
868 static void stop_tcg_kick_timer(void)
870 if (tcg_kick_vcpu_timer) {
871 timer_del(tcg_kick_vcpu_timer);
872 tcg_kick_vcpu_timer = NULL;
876 /***********************************************************/
877 void hw_error(const char *fmt, ...)
879 va_list ap;
880 CPUState *cpu;
882 va_start(ap, fmt);
883 fprintf(stderr, "qemu: hardware error: ");
884 vfprintf(stderr, fmt, ap);
885 fprintf(stderr, "\n");
886 CPU_FOREACH(cpu) {
887 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
888 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
890 va_end(ap);
891 abort();
894 void cpu_synchronize_all_states(void)
896 CPUState *cpu;
898 CPU_FOREACH(cpu) {
899 cpu_synchronize_state(cpu);
903 void cpu_synchronize_all_post_reset(void)
905 CPUState *cpu;
907 CPU_FOREACH(cpu) {
908 cpu_synchronize_post_reset(cpu);
912 void cpu_synchronize_all_post_init(void)
914 CPUState *cpu;
916 CPU_FOREACH(cpu) {
917 cpu_synchronize_post_init(cpu);
921 static int do_vm_stop(RunState state)
923 int ret = 0;
925 if (runstate_is_running()) {
926 cpu_disable_ticks();
927 pause_all_vcpus();
928 runstate_set(state);
929 vm_state_notify(0, state);
930 qapi_event_send_stop(&error_abort);
933 bdrv_drain_all();
934 replay_disable_events();
935 ret = bdrv_flush_all();
937 return ret;
940 static bool cpu_can_run(CPUState *cpu)
942 if (cpu->stop) {
943 return false;
945 if (cpu_is_stopped(cpu)) {
946 return false;
948 return true;
951 static void cpu_handle_guest_debug(CPUState *cpu)
953 gdb_set_stop_cpu(cpu);
954 qemu_system_debug_request();
955 cpu->stopped = true;
958 #ifdef CONFIG_LINUX
959 static void sigbus_reraise(void)
961 sigset_t set;
962 struct sigaction action;
964 memset(&action, 0, sizeof(action));
965 action.sa_handler = SIG_DFL;
966 if (!sigaction(SIGBUS, &action, NULL)) {
967 raise(SIGBUS);
968 sigemptyset(&set);
969 sigaddset(&set, SIGBUS);
970 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
972 perror("Failed to re-raise SIGBUS!\n");
973 abort();
976 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
978 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
979 sigbus_reraise();
982 if (current_cpu) {
983 /* Called asynchronously in VCPU thread. */
984 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
985 sigbus_reraise();
987 } else {
988 /* Called synchronously (via signalfd) in main thread. */
989 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
990 sigbus_reraise();
995 static void qemu_init_sigbus(void)
997 struct sigaction action;
999 memset(&action, 0, sizeof(action));
1000 action.sa_flags = SA_SIGINFO;
1001 action.sa_sigaction = sigbus_handler;
1002 sigaction(SIGBUS, &action, NULL);
1004 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1006 #else /* !CONFIG_LINUX */
1007 static void qemu_init_sigbus(void)
1010 #endif /* !CONFIG_LINUX */
1012 static QemuMutex qemu_global_mutex;
1014 static QemuThread io_thread;
1016 /* cpu creation */
1017 static QemuCond qemu_cpu_cond;
1018 /* system init */
1019 static QemuCond qemu_pause_cond;
1021 void qemu_init_cpu_loop(void)
1023 qemu_init_sigbus();
1024 qemu_cond_init(&qemu_cpu_cond);
1025 qemu_cond_init(&qemu_pause_cond);
1026 qemu_mutex_init(&qemu_global_mutex);
1028 qemu_thread_get_self(&io_thread);
1031 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1033 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1036 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1038 if (kvm_destroy_vcpu(cpu) < 0) {
1039 error_report("kvm_destroy_vcpu failed");
1040 exit(EXIT_FAILURE);
1044 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1048 static void qemu_wait_io_event_common(CPUState *cpu)
1050 atomic_mb_set(&cpu->thread_kicked, false);
1051 if (cpu->stop) {
1052 cpu->stop = false;
1053 cpu->stopped = true;
1054 qemu_cond_broadcast(&qemu_pause_cond);
1056 process_queued_cpu_work(cpu);
1059 static bool qemu_tcg_should_sleep(CPUState *cpu)
1061 if (mttcg_enabled) {
1062 return cpu_thread_is_idle(cpu);
1063 } else {
1064 return all_cpu_threads_idle();
1068 static void qemu_tcg_wait_io_event(CPUState *cpu)
1070 while (qemu_tcg_should_sleep(cpu)) {
1071 stop_tcg_kick_timer();
1072 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1075 start_tcg_kick_timer();
1077 qemu_wait_io_event_common(cpu);
1080 static void qemu_kvm_wait_io_event(CPUState *cpu)
1082 while (cpu_thread_is_idle(cpu)) {
1083 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1086 qemu_wait_io_event_common(cpu);
1089 static void *qemu_kvm_cpu_thread_fn(void *arg)
1091 CPUState *cpu = arg;
1092 int r;
1094 rcu_register_thread();
1096 qemu_mutex_lock_iothread();
1097 qemu_thread_get_self(cpu->thread);
1098 cpu->thread_id = qemu_get_thread_id();
1099 cpu->can_do_io = 1;
1100 current_cpu = cpu;
1102 r = kvm_init_vcpu(cpu);
1103 if (r < 0) {
1104 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1105 exit(1);
1108 kvm_init_cpu_signals(cpu);
1110 /* signal CPU creation */
1111 cpu->created = true;
1112 qemu_cond_signal(&qemu_cpu_cond);
1114 do {
1115 if (cpu_can_run(cpu)) {
1116 r = kvm_cpu_exec(cpu);
1117 if (r == EXCP_DEBUG) {
1118 cpu_handle_guest_debug(cpu);
1121 qemu_kvm_wait_io_event(cpu);
1122 } while (!cpu->unplug || cpu_can_run(cpu));
1124 qemu_kvm_destroy_vcpu(cpu);
1125 cpu->created = false;
1126 qemu_cond_signal(&qemu_cpu_cond);
1127 qemu_mutex_unlock_iothread();
1128 return NULL;
1131 static void *qemu_dummy_cpu_thread_fn(void *arg)
1133 #ifdef _WIN32
1134 fprintf(stderr, "qtest is not supported under Windows\n");
1135 exit(1);
1136 #else
1137 CPUState *cpu = arg;
1138 sigset_t waitset;
1139 int r;
1141 rcu_register_thread();
1143 qemu_mutex_lock_iothread();
1144 qemu_thread_get_self(cpu->thread);
1145 cpu->thread_id = qemu_get_thread_id();
1146 cpu->can_do_io = 1;
1147 current_cpu = cpu;
1149 sigemptyset(&waitset);
1150 sigaddset(&waitset, SIG_IPI);
1152 /* signal CPU creation */
1153 cpu->created = true;
1154 qemu_cond_signal(&qemu_cpu_cond);
1156 while (1) {
1157 qemu_mutex_unlock_iothread();
1158 do {
1159 int sig;
1160 r = sigwait(&waitset, &sig);
1161 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1162 if (r == -1) {
1163 perror("sigwait");
1164 exit(1);
1166 qemu_mutex_lock_iothread();
1167 qemu_wait_io_event_common(cpu);
1170 return NULL;
1171 #endif
1174 static int64_t tcg_get_icount_limit(void)
1176 int64_t deadline;
1178 if (replay_mode != REPLAY_MODE_PLAY) {
1179 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1181 /* Maintain prior (possibly buggy) behaviour where if no deadline
1182 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1183 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1184 * nanoseconds.
1186 if ((deadline < 0) || (deadline > INT32_MAX)) {
1187 deadline = INT32_MAX;
1190 return qemu_icount_round(deadline);
1191 } else {
1192 return replay_get_instructions();
1196 static void handle_icount_deadline(void)
1198 assert(qemu_in_vcpu_thread());
1199 if (use_icount) {
1200 int64_t deadline =
1201 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1203 if (deadline == 0) {
1204 /* Wake up other AioContexts. */
1205 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1206 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1211 static void prepare_icount_for_run(CPUState *cpu)
1213 if (use_icount) {
1214 int64_t count;
1215 int decr;
1217 /* These should always be cleared by process_icount_data after
1218          * each vCPU execution. However, u16.high can be raised
1219          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt.
1221 g_assert(cpu->icount_decr.u16.low == 0);
1222 g_assert(cpu->icount_extra == 0);
1225 count = tcg_get_icount_limit();
1227 /* To calculate what we have executed so far we need to know
1228 * what we originally budgeted to run this cycle */
1229 cpu->icount_budget = count;
1231 decr = (count > 0xffff) ? 0xffff : count;
1232 count -= decr;
1233 cpu->icount_decr.u16.low = decr;
1234 cpu->icount_extra = count;
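        /* Example split (illustrative values): with a budget of 100000
         * instructions the 16-bit decrementer gets 0xffff (65535) and
         * icount_extra holds the remaining 34465; a budget of 500 fits
         * entirely in u16.low with icount_extra = 0. */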
1238 static void process_icount_data(CPUState *cpu)
1240 if (use_icount) {
1241 /* Account for executed instructions */
1242 cpu_update_icount(cpu);
1244 /* Reset the counters */
1245 cpu->icount_decr.u16.low = 0;
1246 cpu->icount_extra = 0;
1247 cpu->icount_budget = 0;
1249 replay_account_executed_instructions();
1254 static int tcg_cpu_exec(CPUState *cpu)
1256 int ret;
1257 #ifdef CONFIG_PROFILER
1258 int64_t ti;
1259 #endif
1261 #ifdef CONFIG_PROFILER
1262 ti = profile_getclock();
1263 #endif
1264 qemu_mutex_unlock_iothread();
1265 cpu_exec_start(cpu);
1266 ret = cpu_exec(cpu);
1267 cpu_exec_end(cpu);
1268 qemu_mutex_lock_iothread();
1269 #ifdef CONFIG_PROFILER
1270 tcg_time += profile_getclock() - ti;
1271 #endif
1272 return ret;
1275 /* Destroy any remaining vCPUs which have been unplugged and have
1276 * finished running
1278 static void deal_with_unplugged_cpus(void)
1280 CPUState *cpu;
1282 CPU_FOREACH(cpu) {
1283 if (cpu->unplug && !cpu_can_run(cpu)) {
1284 qemu_tcg_destroy_vcpu(cpu);
1285 cpu->created = false;
1286 qemu_cond_signal(&qemu_cpu_cond);
1287 break;
1292 /* Single-threaded TCG
1294 * In the single-threaded case each vCPU is simulated in turn. If
1295 * there is more than a single vCPU we create a simple timer to kick
1296 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1297 * This is done explicitly rather than relying on side-effects
1298 * elsewhere.
1301 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1303 CPUState *cpu = arg;
1305 rcu_register_thread();
1307 qemu_mutex_lock_iothread();
1308 qemu_thread_get_self(cpu->thread);
1310 CPU_FOREACH(cpu) {
1311 cpu->thread_id = qemu_get_thread_id();
1312 cpu->created = true;
1313 cpu->can_do_io = 1;
1315 qemu_cond_signal(&qemu_cpu_cond);
1317 /* wait for initial kick-off after machine start */
1318 while (first_cpu->stopped) {
1319 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1321 /* process any pending work */
1322 CPU_FOREACH(cpu) {
1323 current_cpu = cpu;
1324 qemu_wait_io_event_common(cpu);
1328 start_tcg_kick_timer();
1330 cpu = first_cpu;
1332 /* process any pending work */
1333 cpu->exit_request = 1;
1335 while (1) {
1336 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1337 qemu_account_warp_timer();
1339 /* Run the timers here. This is much more efficient than
1340 * waking up the I/O thread and waiting for completion.
1342 handle_icount_deadline();
1344 if (!cpu) {
1345 cpu = first_cpu;
1348 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1350 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1351 current_cpu = cpu;
1353 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1354 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1356 if (cpu_can_run(cpu)) {
1357 int r;
1359 prepare_icount_for_run(cpu);
1361 r = tcg_cpu_exec(cpu);
1363 process_icount_data(cpu);
1365 if (r == EXCP_DEBUG) {
1366 cpu_handle_guest_debug(cpu);
1367 break;
1368 } else if (r == EXCP_ATOMIC) {
1369 qemu_mutex_unlock_iothread();
1370 cpu_exec_step_atomic(cpu);
1371 qemu_mutex_lock_iothread();
1372 break;
1374 } else if (cpu->stop) {
1375 if (cpu->unplug) {
1376 cpu = CPU_NEXT(cpu);
1378 break;
1381 cpu = CPU_NEXT(cpu);
1382 } /* while (cpu && !cpu->exit_request).. */
1384 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1385 atomic_set(&tcg_current_rr_cpu, NULL);
1387 if (cpu && cpu->exit_request) {
1388 atomic_mb_set(&cpu->exit_request, 0);
1391 qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1392 deal_with_unplugged_cpus();
1395 return NULL;
1398 static void *qemu_hax_cpu_thread_fn(void *arg)
1400 CPUState *cpu = arg;
1401 int r;
1403 qemu_mutex_lock_iothread();
1404 qemu_thread_get_self(cpu->thread);
1406 cpu->thread_id = qemu_get_thread_id();
1407 cpu->created = true;
1408 cpu->halted = 0;
1409 current_cpu = cpu;
1411 hax_init_vcpu(cpu);
1412 qemu_cond_signal(&qemu_cpu_cond);
1414 while (1) {
1415 if (cpu_can_run(cpu)) {
1416 r = hax_smp_cpu_exec(cpu);
1417 if (r == EXCP_DEBUG) {
1418 cpu_handle_guest_debug(cpu);
1422 while (cpu_thread_is_idle(cpu)) {
1423 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1425 #ifdef _WIN32
1426 SleepEx(0, TRUE);
1427 #endif
1428 qemu_wait_io_event_common(cpu);
1430 return NULL;
1433 #ifdef _WIN32
1434 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1437 #endif
1439 /* Multi-threaded TCG
1441 * In the multi-threaded case each vCPU has its own thread. The TLS
1442 * variable current_cpu can be used deep in the code to find the
1443 * current CPUState for a given thread.
1446 static void *qemu_tcg_cpu_thread_fn(void *arg)
1448 CPUState *cpu = arg;
1450 g_assert(!use_icount);
1452 rcu_register_thread();
1454 qemu_mutex_lock_iothread();
1455 qemu_thread_get_self(cpu->thread);
1457 cpu->thread_id = qemu_get_thread_id();
1458 cpu->created = true;
1459 cpu->can_do_io = 1;
1460 current_cpu = cpu;
1461 qemu_cond_signal(&qemu_cpu_cond);
1463 /* process any pending work */
1464 cpu->exit_request = 1;
1466 while (1) {
1467 if (cpu_can_run(cpu)) {
1468 int r;
1469 r = tcg_cpu_exec(cpu);
1470 switch (r) {
1471 case EXCP_DEBUG:
1472 cpu_handle_guest_debug(cpu);
1473 break;
1474 case EXCP_HALTED:
1475 /* during start-up the vCPU is reset and the thread is
1476 * kicked several times. If we don't ensure we go back
1477 * to sleep in the halted state we won't cleanly
1478 * start-up when the vCPU is enabled.
1480 * cpu->halted should ensure we sleep in wait_io_event
1482 g_assert(cpu->halted);
1483 break;
1484 case EXCP_ATOMIC:
1485 qemu_mutex_unlock_iothread();
1486 cpu_exec_step_atomic(cpu);
1487 qemu_mutex_lock_iothread();
1488 default:
1489 /* Ignore everything else? */
1490 break;
1494 atomic_mb_set(&cpu->exit_request, 0);
1495 qemu_tcg_wait_io_event(cpu);
1498 return NULL;
1501 static void qemu_cpu_kick_thread(CPUState *cpu)
1503 #ifndef _WIN32
1504 int err;
1506 if (cpu->thread_kicked) {
1507 return;
1509 cpu->thread_kicked = true;
1510 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1511 if (err) {
1512 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1513 exit(1);
1515 #else /* _WIN32 */
1516 if (!qemu_cpu_is_self(cpu)) {
1517 if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1518 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1519 __func__, GetLastError());
1520 exit(1);
1523 #endif
1526 void qemu_cpu_kick(CPUState *cpu)
1528 qemu_cond_broadcast(cpu->halt_cond);
1529 if (tcg_enabled()) {
1530 cpu_exit(cpu);
1531 /* NOP unless doing single-thread RR */
1532 qemu_cpu_kick_rr_cpu();
1533 } else {
1534 if (hax_enabled()) {
1536 * FIXME: race condition with the exit_request check in
1537 * hax_vcpu_hax_exec
1539 cpu->exit_request = 1;
1541 qemu_cpu_kick_thread(cpu);
1545 void qemu_cpu_kick_self(void)
1547 assert(current_cpu);
1548 qemu_cpu_kick_thread(current_cpu);
1551 bool qemu_cpu_is_self(CPUState *cpu)
1553 return qemu_thread_is_self(cpu->thread);
1556 bool qemu_in_vcpu_thread(void)
1558 return current_cpu && qemu_cpu_is_self(current_cpu);
1561 static __thread bool iothread_locked = false;
1563 bool qemu_mutex_iothread_locked(void)
1565 return iothread_locked;
1568 void qemu_mutex_lock_iothread(void)
1570 g_assert(!qemu_mutex_iothread_locked());
1571 qemu_mutex_lock(&qemu_global_mutex);
1572 iothread_locked = true;
1575 void qemu_mutex_unlock_iothread(void)
1577 g_assert(qemu_mutex_iothread_locked());
1578 iothread_locked = false;
1579 qemu_mutex_unlock(&qemu_global_mutex);
1582 static bool all_vcpus_paused(void)
1584 CPUState *cpu;
1586 CPU_FOREACH(cpu) {
1587 if (!cpu->stopped) {
1588 return false;
1592 return true;
1595 void pause_all_vcpus(void)
1597 CPUState *cpu;
1599 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1600 CPU_FOREACH(cpu) {
1601 cpu->stop = true;
1602 qemu_cpu_kick(cpu);
1605 if (qemu_in_vcpu_thread()) {
1606 cpu_stop_current();
1609 while (!all_vcpus_paused()) {
1610 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1611 CPU_FOREACH(cpu) {
1612 qemu_cpu_kick(cpu);
1617 void cpu_resume(CPUState *cpu)
1619 cpu->stop = false;
1620 cpu->stopped = false;
1621 qemu_cpu_kick(cpu);
1624 void resume_all_vcpus(void)
1626 CPUState *cpu;
1628 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1629 CPU_FOREACH(cpu) {
1630 cpu_resume(cpu);
1634 void cpu_remove(CPUState *cpu)
1636 cpu->stop = true;
1637 cpu->unplug = true;
1638 qemu_cpu_kick(cpu);
1641 void cpu_remove_sync(CPUState *cpu)
1643 cpu_remove(cpu);
1644 while (cpu->created) {
1645 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1649 /* Size of temporary buffers used for forming a thread name */
1650 #define VCPU_THREAD_NAME_SIZE 16
1652 static void qemu_tcg_init_vcpu(CPUState *cpu)
1654 char thread_name[VCPU_THREAD_NAME_SIZE];
1655 static QemuCond *single_tcg_halt_cond;
1656 static QemuThread *single_tcg_cpu_thread;
1658 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1659 cpu->thread = g_malloc0(sizeof(QemuThread));
1660 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1661 qemu_cond_init(cpu->halt_cond);
1663 if (qemu_tcg_mttcg_enabled()) {
1664 /* create a thread per vCPU with TCG (MTTCG) */
1665 parallel_cpus = true;
1666 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1667 cpu->cpu_index);
1669 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1670 cpu, QEMU_THREAD_JOINABLE);
1672 } else {
1673 /* share a single thread for all cpus with TCG */
1674 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1675 qemu_thread_create(cpu->thread, thread_name,
1676 qemu_tcg_rr_cpu_thread_fn,
1677 cpu, QEMU_THREAD_JOINABLE);
1679 single_tcg_halt_cond = cpu->halt_cond;
1680 single_tcg_cpu_thread = cpu->thread;
1682 #ifdef _WIN32
1683 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1684 #endif
1685 while (!cpu->created) {
1686 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1688 } else {
1689 /* For non-MTTCG cases we share the thread */
1690 cpu->thread = single_tcg_cpu_thread;
1691 cpu->halt_cond = single_tcg_halt_cond;
1695 static void qemu_hax_start_vcpu(CPUState *cpu)
1697 char thread_name[VCPU_THREAD_NAME_SIZE];
1699 cpu->thread = g_malloc0(sizeof(QemuThread));
1700 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1701 qemu_cond_init(cpu->halt_cond);
1703 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1704 cpu->cpu_index);
1705 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1706 cpu, QEMU_THREAD_JOINABLE);
1707 #ifdef _WIN32
1708 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1709 #endif
1710 while (!cpu->created) {
1711 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1715 static void qemu_kvm_start_vcpu(CPUState *cpu)
1717 char thread_name[VCPU_THREAD_NAME_SIZE];
1719 cpu->thread = g_malloc0(sizeof(QemuThread));
1720 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1721 qemu_cond_init(cpu->halt_cond);
1722 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1723 cpu->cpu_index);
1724 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1725 cpu, QEMU_THREAD_JOINABLE);
1726 while (!cpu->created) {
1727 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1731 static void qemu_dummy_start_vcpu(CPUState *cpu)
1733 char thread_name[VCPU_THREAD_NAME_SIZE];
1735 cpu->thread = g_malloc0(sizeof(QemuThread));
1736 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1737 qemu_cond_init(cpu->halt_cond);
1738 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1739 cpu->cpu_index);
1740 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1741 QEMU_THREAD_JOINABLE);
1742 while (!cpu->created) {
1743 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1747 void qemu_init_vcpu(CPUState *cpu)
1749 cpu->nr_cores = smp_cores;
1750 cpu->nr_threads = smp_threads;
1751 cpu->stopped = true;
1753 if (!cpu->as) {
1754 /* If the target cpu hasn't set up any address spaces itself,
1755 * give it the default one.
1757 AddressSpace *as = address_space_init_shareable(cpu->memory,
1758 "cpu-memory");
1759 cpu->num_ases = 1;
1760 cpu_address_space_init(cpu, as, 0);
1763 if (kvm_enabled()) {
1764 qemu_kvm_start_vcpu(cpu);
1765 } else if (hax_enabled()) {
1766 qemu_hax_start_vcpu(cpu);
1767 } else if (tcg_enabled()) {
1768 qemu_tcg_init_vcpu(cpu);
1769 } else {
1770 qemu_dummy_start_vcpu(cpu);
1774 void cpu_stop_current(void)
1776 if (current_cpu) {
1777 current_cpu->stop = false;
1778 current_cpu->stopped = true;
1779 cpu_exit(current_cpu);
1780 qemu_cond_broadcast(&qemu_pause_cond);
1784 int vm_stop(RunState state)
1786 if (qemu_in_vcpu_thread()) {
1787 qemu_system_vmstop_request_prepare();
1788 qemu_system_vmstop_request(state);
1790 * FIXME: should not return to device code in case
1791 * vm_stop() has been requested.
1793 cpu_stop_current();
1794 return 0;
1797 return do_vm_stop(state);
1801 * Prepare for (re)starting the VM.
1802 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1803 * running or in case of an error condition), 0 otherwise.
1805 int vm_prepare_start(void)
1807 RunState requested;
1808 int res = 0;
1810 qemu_vmstop_requested(&requested);
1811 if (runstate_is_running() && requested == RUN_STATE__MAX) {
1812 return -1;
1815 /* Ensure that a STOP/RESUME pair of events is emitted if a
1816 * vmstop request was pending. The BLOCK_IO_ERROR event, for
1817      * example, is documented as always being followed by
1818      * the STOP event.
1820 if (runstate_is_running()) {
1821 qapi_event_send_stop(&error_abort);
1822 res = -1;
1823 } else {
1824 replay_enable_events();
1825 cpu_enable_ticks();
1826 runstate_set(RUN_STATE_RUNNING);
1827 vm_state_notify(1, RUN_STATE_RUNNING);
1830     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
1831 qapi_event_send_resume(&error_abort);
1832 return res;
1835 void vm_start(void)
1837 if (!vm_prepare_start()) {
1838 resume_all_vcpus();
1842 /* Does a state transition even if the VM is already stopped;
1843    the current state is forgotten forever. */
1844 int vm_stop_force_state(RunState state)
1846 if (runstate_is_running()) {
1847 return vm_stop(state);
1848 } else {
1849 runstate_set(state);
1851 bdrv_drain_all();
1852 /* Make sure to return an error if the flush in a previous vm_stop()
1853 * failed. */
1854 return bdrv_flush_all();
1858 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1860 /* XXX: implement xxx_cpu_list for targets that still miss it */
1861 #if defined(cpu_list)
1862 cpu_list(f, cpu_fprintf);
1863 #endif
1866 CpuInfoList *qmp_query_cpus(Error **errp)
1868 CpuInfoList *head = NULL, *cur_item = NULL;
1869 CPUState *cpu;
1871 CPU_FOREACH(cpu) {
1872 CpuInfoList *info;
1873 #if defined(TARGET_I386)
1874 X86CPU *x86_cpu = X86_CPU(cpu);
1875 CPUX86State *env = &x86_cpu->env;
1876 #elif defined(TARGET_PPC)
1877 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1878 CPUPPCState *env = &ppc_cpu->env;
1879 #elif defined(TARGET_SPARC)
1880 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1881 CPUSPARCState *env = &sparc_cpu->env;
1882 #elif defined(TARGET_MIPS)
1883 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1884 CPUMIPSState *env = &mips_cpu->env;
1885 #elif defined(TARGET_TRICORE)
1886 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1887 CPUTriCoreState *env = &tricore_cpu->env;
1888 #endif
1890 cpu_synchronize_state(cpu);
1892 info = g_malloc0(sizeof(*info));
1893 info->value = g_malloc0(sizeof(*info->value));
1894 info->value->CPU = cpu->cpu_index;
1895 info->value->current = (cpu == first_cpu);
1896 info->value->halted = cpu->halted;
1897 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1898 info->value->thread_id = cpu->thread_id;
1899 #if defined(TARGET_I386)
1900 info->value->arch = CPU_INFO_ARCH_X86;
1901 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1902 #elif defined(TARGET_PPC)
1903 info->value->arch = CPU_INFO_ARCH_PPC;
1904 info->value->u.ppc.nip = env->nip;
1905 #elif defined(TARGET_SPARC)
1906 info->value->arch = CPU_INFO_ARCH_SPARC;
1907 info->value->u.q_sparc.pc = env->pc;
1908 info->value->u.q_sparc.npc = env->npc;
1909 #elif defined(TARGET_MIPS)
1910 info->value->arch = CPU_INFO_ARCH_MIPS;
1911 info->value->u.q_mips.PC = env->active_tc.PC;
1912 #elif defined(TARGET_TRICORE)
1913 info->value->arch = CPU_INFO_ARCH_TRICORE;
1914 info->value->u.tricore.PC = env->PC;
1915 #else
1916 info->value->arch = CPU_INFO_ARCH_OTHER;
1917 #endif
1919 /* XXX: waiting for the qapi to support GSList */
1920 if (!cur_item) {
1921 head = cur_item = info;
1922 } else {
1923 cur_item->next = info;
1924 cur_item = info;
1928 return head;
1931 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1932 bool has_cpu, int64_t cpu_index, Error **errp)
1934 FILE *f;
1935 uint32_t l;
1936 CPUState *cpu;
1937 uint8_t buf[1024];
1938 int64_t orig_addr = addr, orig_size = size;
1940 if (!has_cpu) {
1941 cpu_index = 0;
1944 cpu = qemu_get_cpu(cpu_index);
1945 if (cpu == NULL) {
1946 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1947 "a CPU number");
1948 return;
1951 f = fopen(filename, "wb");
1952 if (!f) {
1953 error_setg_file_open(errp, errno, filename);
1954 return;
1957 while (size != 0) {
1958 l = sizeof(buf);
1959 if (l > size)
1960 l = size;
1961 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1962 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1963 " specified", orig_addr, orig_size);
1964 goto exit;
1966 if (fwrite(buf, 1, l, f) != l) {
1967 error_setg(errp, QERR_IO_ERROR);
1968 goto exit;
1970 addr += l;
1971 size -= l;
1974 exit:
1975 fclose(f);
1978 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1979 Error **errp)
1981 FILE *f;
1982 uint32_t l;
1983 uint8_t buf[1024];
1985 f = fopen(filename, "wb");
1986 if (!f) {
1987 error_setg_file_open(errp, errno, filename);
1988 return;
1991 while (size != 0) {
1992 l = sizeof(buf);
1993 if (l > size)
1994 l = size;
1995 cpu_physical_memory_read(addr, buf, l);
1996 if (fwrite(buf, 1, l, f) != l) {
1997 error_setg(errp, QERR_IO_ERROR);
1998 goto exit;
2000 addr += l;
2001 size -= l;
2004 exit:
2005 fclose(f);
2008 void qmp_inject_nmi(Error **errp)
2010 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2013 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2015 if (!use_icount) {
2016 return;
2019 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2020 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2021 if (icount_align_option) {
2022 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2023 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2024 } else {
2025 cpu_fprintf(f, "Max guest delay NA\n");
2026 cpu_fprintf(f, "Max guest advance NA\n");