usr/src/uts/i86xpv/os/mp_xen.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /*
  28  * Virtual CPU management.
  29  *
  30  * VCPUs can be controlled in one of two ways; through the domain itself
  31  * (psradm, p_online(), etc.), and via changes in xenstore (vcpu_config()).
  32  * Unfortunately, the terminology is used in different ways; they work out as
  33  * follows:
  34  *
  35  * P_ONLINE: the VCPU is up and running, taking interrupts and running threads
  36  *
  37  * P_OFFLINE: the VCPU is up and running, but quiesced (i.e. blocked in the
  38  * hypervisor on the idle thread).  It must be up since a downed VCPU cannot
  39  * receive interrupts, and we require this for offline CPUs in Solaris.
  40  *
  41  * P_POWEROFF: the VCPU is down (we never called xen_vcpu_up(), or called
  42  * xen_vcpu_down() for it).  It can't take interrupts or run anything, though
  43  * if it has run previously, its software state (cpu_t, machcpu structures, IPI
  44  * event channels, etc.) will still exist.
  45  *
  46  * The hypervisor has two notions of CPU states as represented in the store:
  47  *
  48  * "offline": the VCPU is down.  Corresponds to P_POWEROFF.
  49  *
  50  * "online": the VCPU is running.  Corresponds to a CPU state other than
  51  * P_POWEROFF.
  52  *
  53  * Currently, only a notification via xenstore can bring a CPU into a
  54  * P_POWEROFF state, and only the domain can change between P_ONLINE, P_NOINTR,
  55  * P_OFFLINE, etc.  We need to be careful to treat xenstore notifications
  56  * idempotently, as we'll get 'duplicate' entries when we resume a domain.
  57  *
  58  * Note that the xenstore configuration is strictly advisory, in that a domain
  59  * can choose to ignore it and still power up a VCPU in the offline state. To
  60  * play nice, we don't allow it. Thus, any attempt to power on/off a CPU is
  61  * ENOTSUP from within Solaris.
  62  *
  63  * Powering off a VCPU and suspending the domain use similar code. The
  64  * difficulty here is that we must ensure that each VCPU is in a stable
  65  * state: it must have a saved PCB, and not be responding to interrupts
  66  * (since we are just about to remove its ability to run on a real CPU,
  67  * possibly forever).  However, an offline CPU in Solaris can take
  68  * cross-call interrupts, as mentioned, so we must go through a
  69  * two-stage process.  First, we use the standard Solaris pause_cpus().
  70  * This ensures that all CPUs are either in mach_cpu_pause() or
  71  * mach_cpu_idle(), and nothing will cross-call them.
  72  *
 * Powered-off CPUs are already safe: we own the cpu_lock needed to bring
 * them back up, and they are in state CPU_PHASE_POWERED_OFF.
  75  *
  76  * Running CPUs are spinning in mach_cpu_pause() waiting for either
  77  * PAUSE_IDLE or CPU_PHASE_WAIT_SAFE.
  78  *
  79  * Offline CPUs are either running the idle thread and periodically
  80  * checking for CPU_PHASE_WAIT_SAFE, or blocked in the hypervisor.
  81  *
  82  * Thus, we set CPU_PHASE_WAIT_SAFE for every powered-on CPU, as well as
  83  * poking them to make sure they're not blocked[1]. When every CPU has
  84  * responded by reaching a safe state and setting CPU_PHASE_SAFE, we
  85  * know we can suspend, or power-off a CPU, without problems.
  86  *
  87  * [1] note that we have to repeatedly poke offline CPUs: it's the only
  88  * way to ensure that the CPU doesn't miss the state change before
  89  * dropping into HYPERVISOR_block().
  90  */
  91
  92 #include <sys/types.h>
  93 #include <sys/systm.h>
  94 #include <sys/param.h>
  95 #include <sys/taskq.h>
  96 #include <sys/cmn_err.h>
  97 #include <sys/archsystm.h>
  98 #include <sys/machsystm.h>
  99 #include <sys/segments.h>
 100 #include <sys/cpuvar.h>
 101 #include <sys/x86_archext.h>
 102 #include <sys/controlregs.h>
 103 #include <sys/hypervisor.h>
 104 #include <sys/xpv_panic.h>
 105 #include <sys/mman.h>
 106 #include <sys/psw.h>
 107 #include <sys/cpu.h>
 108 #include <sys/sunddi.h>
 109 #include <util/sscanf.h>
 110 #include <vm/hat_i86.h>
 111 #include <vm/hat.h>
 112 #include <vm/as.h>
 113
 114 #include <xen/public/io/xs_wire.h>
 115 #include <xen/sys/xenbus_impl.h>
 116 #include <xen/public/vcpu.h>
 117
extern cpuset_t cpu_ready_set;

/*
 * Per-VCPU barrier phases, stored in cpu_phase[] below.
 */
#define CPU_PHASE_NONE 0        /* not involved in a barrier */
#define CPU_PHASE_WAIT_SAFE 1   /* asked by mp_enter_barrier() to go safe */
#define CPU_PHASE_SAFE 2        /* spinning in enter_safe_phase() */
#define CPU_PHASE_POWERED_OFF 3 /* taken down by poweroff_vcpu() */

/*
 * We can only poke CPUs during barrier enter 256 times a second at
 * most.
 */
#define POKE_TIMEOUT (NANOSEC / 256)

/* Task queue on which vcpu_config() requests are dispatched. */
static taskq_t *cpu_config_tq;
/* Current CPU_PHASE_* value of each VCPU, indexed by cpu id. */
static int cpu_phase[NCPU];

static void vcpu_config_event(struct xenbus_watch *, const char **, uint_t);
static int xen_vcpu_initialize(processorid_t, vcpu_guest_context_t *);
 136
 137 /*
 138  * Return whether or not the vcpu is actually running on a pcpu
 139  */
 140 int
 141 vcpu_on_pcpu(processorid_t cpu)
 142 {
 143         struct vcpu_runstate_info runstate;
 144         int     ret = VCPU_STATE_UNKNOWN;
 145
 146         ASSERT(cpu < NCPU);
 147         /*
 148          * Don't bother with hypercall if we are asking about ourself
 149          */
 150         if (cpu == CPU->cpu_id)
 151                 return (VCPU_ON_PCPU);
 152         if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, cpu, &runstate) != 0)
 153                 goto out;
 154
 155         switch (runstate.state) {
 156         case RUNSTATE_running:
 157                 ret = VCPU_ON_PCPU;
 158                 break;
 159
 160         case RUNSTATE_runnable:
 161         case RUNSTATE_offline:
 162         case RUNSTATE_blocked:
 163                 ret = VCPU_NOT_ON_PCPU;
 164                 break;
 165
 166         default:
 167                 break;
 168         }
 169
 170 out:
 171         return (ret);
 172 }
 173
 174 /*
 175  * These routines allocate any global state that might be needed
 176  * while starting cpus.  For virtual cpus, there is no such state.
 177  */
 178 int
 179 mach_cpucontext_init(void)
 180 {
 181         return (0);
 182 }
 183
 184 void
 185 do_cpu_config_watch(int state)
 186 {
 187         static struct xenbus_watch cpu_config_watch;
 188
 189         if (state != XENSTORE_UP)
 190                 return;
 191         cpu_config_watch.node = "cpu";
 192         cpu_config_watch.callback = vcpu_config_event;
 193         if (register_xenbus_watch(&cpu_config_watch)) {
 194                 taskq_destroy(cpu_config_tq);
 195                 cmn_err(CE_WARN, "do_cpu_config_watch: "
 196                     "failed to set vcpu config watch");
 197         }
 198
 199 }
 200
 201 /*
 202  * This routine is called after all the "normal" MP startup has
 203  * been done; a good place to start watching xen store for virtual
 204  * cpu hot plug events.
 205  */
 206 void
 207 mach_cpucontext_fini(void)
 208 {
 209
 210         cpu_config_tq = taskq_create("vcpu config taskq", 1,
 211             maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
 212
 213         (void) xs_register_xenbus_callback(do_cpu_config_watch);
 214 }
 215
 216 /*
 217  * Fill in the remaining CPU context and initialize it.
 218  */
 219 static int
 220 mp_set_cpu_context(vcpu_guest_context_t *vgc, cpu_t *cp)
 221 {
 222         uint_t vec, iopl;
 223
 224         vgc->flags = VGCF_IN_KERNEL;
 225
 226         /*
 227          * fpu_ctx we leave as zero; on first fault we'll store
 228          * sse_initial into it anyway.
 229          */
 230
 231 #if defined(__amd64)
 232         vgc->user_regs.cs = KCS_SEL | SEL_KPL;  /* force to ring 3 */
 233 #else
 234         vgc->user_regs.cs = KCS_SEL;
 235 #endif
 236         vgc->user_regs.ds = KDS_SEL;
 237         vgc->user_regs.es = KDS_SEL;
 238         vgc->user_regs.ss = KDS_SEL;
 239         vgc->kernel_ss = KDS_SEL;
 240
 241         /*
 242          * Allow I/O privilege level for Dom0 kernel.
 243          */
 244         if (DOMAIN_IS_INITDOMAIN(xen_info))
 245                 iopl = (PS_IOPL & 0x1000); /* ring 1 */
 246         else
 247                 iopl = 0;
 248
 249 #if defined(__amd64)
 250         vgc->user_regs.fs = 0;
 251         vgc->user_regs.gs = 0;
 252         vgc->user_regs.rflags = F_OFF | iopl;
 253 #elif defined(__i386)
 254         vgc->user_regs.fs = KFS_SEL;
 255         vgc->user_regs.gs = KGS_SEL;
 256         vgc->user_regs.eflags = F_OFF | iopl;
 257         vgc->event_callback_cs = vgc->user_regs.cs;
 258         vgc->failsafe_callback_cs = vgc->user_regs.cs;
 259 #endif
 260
 261         /*
 262          * Initialize the trap_info_t from the IDT
 263          */
 264 #if !defined(__lint)
 265         ASSERT(NIDT == sizeof (vgc->trap_ctxt) / sizeof (vgc->trap_ctxt[0]));
 266 #endif
 267         for (vec = 0; vec < NIDT; vec++) {
 268                 trap_info_t *ti = &vgc->trap_ctxt[vec];
 269
 270                 if (xen_idt_to_trap_info(vec,
 271                     &cp->cpu_m.mcpu_idt[vec], ti) == 0) {
 272                         ti->cs = KCS_SEL;
 273                         ti->vector = vec;
 274                 }
 275         }
 276
 277         /*
 278          * No LDT
 279          */
 280
 281         /*
 282          * (We assert in various places that the GDT is (a) aligned on a
 283          * page boundary and (b) one page long, so this really should fit..)
 284          */
 285 #ifdef CRASH_XEN
 286         vgc->gdt_frames[0] = pa_to_ma(mmu_btop(cp->cpu_m.mcpu_gdtpa));
 287 #else
 288         vgc->gdt_frames[0] = pfn_to_mfn(mmu_btop(cp->cpu_m.mcpu_gdtpa));
 289 #endif
 290         vgc->gdt_ents = NGDT;
 291
 292         vgc->ctrlreg[0] = CR0_ENABLE_FPU_FLAGS(getcr0());
 293
 294 #if defined(__i386)
 295         if (mmu.pae_hat)
 296                 vgc->ctrlreg[3] =
 297                     xen_pfn_to_cr3(pfn_to_mfn(kas.a_hat->hat_htable->ht_pfn));
 298         else
 299 #endif
 300                 vgc->ctrlreg[3] =
 301                     pa_to_ma(mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
 302
 303         vgc->ctrlreg[4] = getcr4();
 304
 305         vgc->event_callback_eip = (uintptr_t)xen_callback;
 306         vgc->failsafe_callback_eip = (uintptr_t)xen_failsafe_callback;
 307         vgc->flags |= VGCF_failsafe_disables_events;
 308
 309 #if defined(__amd64)
 310         /*
 311          * XXPV should this be moved to init_cpu_syscall?
 312          */
 313         vgc->syscall_callback_eip = (uintptr_t)sys_syscall;
 314         vgc->flags |= VGCF_syscall_disables_events;
 315
 316         ASSERT(vgc->user_regs.gs == 0);
 317         vgc->gs_base_kernel = (uintptr_t)cp;
 318 #endif
 319
 320         return (xen_vcpu_initialize(cp->cpu_id, vgc));
 321 }
 322
 323 /*
 324  * Create a guest virtual cpu context so that the virtual cpu
 325  * springs into life in the domain just about to call mp_startup()
 326  *
 327  * Virtual CPUs must be initialized once in the lifetime of the domain;
 328  * after that subsequent attempts to start them will fail with X_EEXIST.
 329  *
 330  * Thus 'alloc' -really- creates and initializes the virtual
 331  * CPU context just once. Once the initialisation succeeds, we never
 332  * free it, nor the regular cpu_t to which it refers.
 333  */
 334 void *
 335 mach_cpucontext_alloc(struct cpu *cp)
 336 {
 337         kthread_t *tp = cp->cpu_thread;
 338         vcpu_guest_context_t vgc;
 339
 340         int err = 1;
 341
 342         /*
 343          * First, augment the incoming cpu structure
 344          * - vcpu pointer reference
 345          * - pending event storage area
 346          * - physical address of GDT
 347          */
 348         cp->cpu_m.mcpu_vcpu_info =
 349             &HYPERVISOR_shared_info->vcpu_info[cp->cpu_id];
 350         cp->cpu_m.mcpu_evt_pend = kmem_zalloc(
 351             sizeof (struct xen_evt_data), KM_SLEEP);
 352         cp->cpu_m.mcpu_gdtpa =
 353             mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)cp->cpu_gdt));
 354
 355         if ((err = xen_gdt_setprot(cp, PROT_READ)) != 0)
 356                 goto done;
 357
 358         /*
 359          * Now set up the vcpu context so that we can start this vcpu
 360          * in the kernel at tp->t_pc (mp_startup).  Note that the
 361          * thread will thread_exit() shortly after performing the
 362          * initialization; in particular, we will *never* take a
 363          * privilege transition on this thread.
 364          */
 365
 366         bzero(&vgc, sizeof (vgc));
 367
 368 #ifdef __amd64
 369         vgc.user_regs.rip = tp->t_pc;
 370         vgc.user_regs.rsp = tp->t_sp;
 371         vgc.user_regs.rbp = tp->t_sp - 2 * sizeof (greg_t);
 372 #else
 373         vgc.user_regs.eip = tp->t_pc;
 374         vgc.user_regs.esp = tp->t_sp;
 375         vgc.user_regs.ebp = tp->t_sp - 2 * sizeof (greg_t);
 376 #endif
 377         /*
 378          * XXPV Fix resume, if Russ didn't already fix it.
 379          *
 380          * Note that resume unconditionally puts t->t_stk + sizeof (regs)
 381          * into kernel_sp via HYPERVISOR_stack_switch. This anticipates
 382          * that only lwps take traps that switch to the kernel stack;
 383          * part of creating an lwp adjusts the stack by subtracting
 384          * sizeof (struct regs) off t_stk.
 385          *
 386          * The more interesting question is, why do we do all the work
 387          * of a fully fledged lwp for a plain thread?  In particular
 388          * we don't have to call HYPERVISOR_stack_switch for lwp-less threads
 389          * or futz with the LDT.  This should probably all be done with
 390          * an lwp context operator to keep pure thread context switch fast.
 391          */
 392         vgc.kernel_sp = (ulong_t)tp->t_stk;
 393
 394         err = mp_set_cpu_context(&vgc, cp);
 395
 396 done:
 397         if (err) {
 398                 mach_cpucontext_free(cp, NULL, err);
 399                 return (NULL);
 400         }
 401         return (cp);
 402 }
 403
 404 /*
 405  * By the time we are called either we have successfully started
 406  * the cpu, or our attempt to start it has failed.
 407  */
 408
 409 /*ARGSUSED*/
 410 void
 411 mach_cpucontext_free(struct cpu *cp, void *arg, int err)
 412 {
 413         switch (err) {
 414         case 0:
 415                 break;
 416         case ETIMEDOUT:
 417                 /*
 418                  * The vcpu context is loaded into the hypervisor, and
 419                  * we've tried to start it, but the vcpu has not been set
 420                  * running yet, for whatever reason.  We arrange to -not-
 421                  * free any data structures it may be referencing.  In
 422                  * particular, we've already told the hypervisor about
 423                  * the GDT, and so we can't map it read-write again.
 424                  */
 425                 break;
 426         default:
 427                 (void) xen_gdt_setprot(cp, PROT_READ | PROT_WRITE);
 428                 kmem_free(cp->cpu_m.mcpu_evt_pend,
 429                     sizeof (struct xen_evt_data));
 430                 break;
 431         }
 432 }
 433
 434 /*
 435  * Reset this CPU's context.  Clear out any pending evtchn data, since event
 436  * channel numbers will all change when we resume.
 437  */
 438 void
 439 mach_cpucontext_reset(cpu_t *cp)
 440 {
 441         bzero(cp->cpu_m.mcpu_evt_pend, sizeof (struct xen_evt_data));
 442         /* mcpu_intr_pending ? */
 443 }
 444
 445 static void
 446 pcb_to_user_regs(label_t *pcb, vcpu_guest_context_t *vgc)
 447 {
 448 #ifdef __amd64
 449         vgc->user_regs.rip = pcb->val[REG_LABEL_PC];
 450         vgc->user_regs.rsp = pcb->val[REG_LABEL_SP];
 451         vgc->user_regs.rbp = pcb->val[REG_LABEL_BP];
 452         vgc->user_regs.rbx = pcb->val[REG_LABEL_RBX];
 453         vgc->user_regs.r12 = pcb->val[REG_LABEL_R12];
 454         vgc->user_regs.r13 = pcb->val[REG_LABEL_R13];
 455         vgc->user_regs.r14 = pcb->val[REG_LABEL_R14];
 456         vgc->user_regs.r15 = pcb->val[REG_LABEL_R15];
 457 #else /* __amd64 */
 458         vgc->user_regs.eip = pcb->val[REG_LABEL_PC];
 459         vgc->user_regs.esp = pcb->val[REG_LABEL_SP];
 460         vgc->user_regs.ebp = pcb->val[REG_LABEL_BP];
 461         vgc->user_regs.ebx = pcb->val[REG_LABEL_EBX];
 462         vgc->user_regs.esi = pcb->val[REG_LABEL_ESI];
 463         vgc->user_regs.edi = pcb->val[REG_LABEL_EDI];
 464 #endif /* __amd64 */
 465 }
 466
 467 /*
 468  * Restore the context of a CPU during resume.  This context is always
 469  * inside enter_safe_phase(), below.
 470  */
 471 void
 472 mach_cpucontext_restore(cpu_t *cp)
 473 {
 474         vcpu_guest_context_t vgc;
 475         int err;
 476
 477         ASSERT(cp->cpu_thread == cp->cpu_pause_thread ||
 478             cp->cpu_thread == cp->cpu_idle_thread);
 479
 480         bzero(&vgc, sizeof (vgc));
 481
 482         pcb_to_user_regs(&cp->cpu_thread->t_pcb, &vgc);
 483
 484         /*
 485          * We're emulating a longjmp() here: in particular, we need to bump the
 486          * stack pointer to account for the pop of xIP that returning from
 487          * longjmp() normally would do, and set the return value in xAX to 1.
 488          */
 489 #ifdef __amd64
 490         vgc.user_regs.rax = 1;
 491         vgc.user_regs.rsp += sizeof (ulong_t);
 492 #else
 493         vgc.user_regs.eax = 1;
 494         vgc.user_regs.esp += sizeof (ulong_t);
 495 #endif
 496
 497         vgc.kernel_sp = cp->cpu_thread->t_sp;
 498
 499         err = mp_set_cpu_context(&vgc, cp);
 500
 501         ASSERT(err == 0);
 502 }
 503
/*
 * Reach a point at which the CPU can be safely powered-off or
 * suspended.  Nothing can wake this CPU out of the loop.
 *
 * The CPU saves its context in the current thread's pcb with setjmp(),
 * advertises CPU_PHASE_SAFE and spins until another CPU moves it out of
 * that phase.  mach_cpucontext_restore() rebuilds a vcpu context from the
 * same pcb so that a resumed CPU appears to return from setjmp() with a
 * non-zero value and skips the spin loop.
 */
static void
enter_safe_phase(void)
{
        /* Interrupts stay off for the whole safe phase (asserted below). */
        ulong_t flags = intr_clear();

        if (setjmp(&curthread->t_pcb) == 0) {
                /* Direct return: publish that we are safe, then wait. */
                cpu_phase[CPU->cpu_id] = CPU_PHASE_SAFE;
                while (cpu_phase[CPU->cpu_id] == CPU_PHASE_SAFE)
                        SMT_PAUSE();
        }

        ASSERT(!interrupts_enabled());

        intr_restore(flags);
}
 523
 524 /*
 525  * Offline CPUs run this code even under a pause_cpus(), so we must
 526  * check if we need to enter the safe phase.
 527  */
 528 void
 529 mach_cpu_idle(void)
 530 {
 531         if (IN_XPV_PANIC()) {
 532                 xpv_panic_halt();
 533         } else  {
 534                 (void) HYPERVISOR_block();
 535                 if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
 536                         enter_safe_phase();
 537         }
 538 }
 539
 540 /*
 541  * Spin until either start_cpus() wakes us up, or we get a request to
 542  * enter the safe phase (followed by a later start_cpus()).
 543  */
 544 void
 545 mach_cpu_pause(volatile char *safe)
 546 {
 547         *safe = PAUSE_WAIT;
 548         membar_enter();
 549
 550         while (*safe != PAUSE_IDLE) {
 551                 if (cpu_phase[CPU->cpu_id] == CPU_PHASE_WAIT_SAFE)
 552                         enter_safe_phase();
 553                 SMT_PAUSE();
 554         }
 555 }
 556
 557 void
 558 mach_cpu_halt(char *msg)
 559 {
 560         if (msg)
 561                 prom_printf("%s\n", msg);
 562         (void) xen_vcpu_down(CPU->cpu_id);
 563 }
 564
 565 /*ARGSUSED*/
 566 int
 567 mp_cpu_poweron(struct cpu *cp)
 568 {
 569         return (ENOTSUP);
 570 }
 571
 572 /*ARGSUSED*/
 573 int
 574 mp_cpu_poweroff(struct cpu *cp)
 575 {
 576         return (ENOTSUP);
 577 }
 578
 579 void
 580 mp_enter_barrier(void)
 581 {
 582         hrtime_t last_poke_time = 0;
 583         int poke_allowed = 0;
 584         int done = 0;
 585         int i;
 586
 587         ASSERT(MUTEX_HELD(&cpu_lock));
 588
 589         pause_cpus(NULL, NULL);
 590
 591         while (!done) {
 592                 done = 1;
 593                 poke_allowed = 0;
 594
 595                 if (xpv_gethrtime() - last_poke_time > POKE_TIMEOUT) {
 596                         last_poke_time = xpv_gethrtime();
 597                         poke_allowed = 1;
 598                 }
 599
 600                 for (i = 0; i < NCPU; i++) {
 601                         cpu_t *cp = cpu_get(i);
 602
 603                         if (cp == NULL || cp == CPU)
 604                                 continue;
 605
 606                         switch (cpu_phase[i]) {
 607                         case CPU_PHASE_NONE:
 608                                 cpu_phase[i] = CPU_PHASE_WAIT_SAFE;
 609                                 poke_cpu(i);
 610                                 done = 0;
 611                                 break;
 612
 613                         case CPU_PHASE_WAIT_SAFE:
 614                                 if (poke_allowed)
 615                                         poke_cpu(i);
 616                                 done = 0;
 617                                 break;
 618
 619                         case CPU_PHASE_SAFE:
 620                         case CPU_PHASE_POWERED_OFF:
 621                                 break;
 622                         }
 623                 }
 624
 625                 SMT_PAUSE();
 626         }
 627 }
 628
 629 void
 630 mp_leave_barrier(void)
 631 {
 632         int i;
 633
 634         ASSERT(MUTEX_HELD(&cpu_lock));
 635
 636         for (i = 0; i < NCPU; i++) {
 637                 cpu_t *cp = cpu_get(i);
 638
 639                 if (cp == NULL || cp == CPU)
 640                         continue;
 641
 642                 switch (cpu_phase[i]) {
 643                 /*
 644                  * If we see a CPU in one of these phases, something has
 645                  * gone badly wrong with the guarantees
 646                  * mp_enter_barrier() is supposed to provide.  Rather
 647                  * than attempt to stumble along (and since we can't
 648                  * panic properly in this context), we tell the
 649                  * hypervisor we've crashed.
 650                  */
 651                 case CPU_PHASE_NONE:
 652                 case CPU_PHASE_WAIT_SAFE:
 653                         (void) HYPERVISOR_shutdown(SHUTDOWN_crash);
 654                         break;
 655
 656                 case CPU_PHASE_POWERED_OFF:
 657                         break;
 658
 659                 case CPU_PHASE_SAFE:
 660                         cpu_phase[i] = CPU_PHASE_NONE;
 661                 }
 662         }
 663
 664         start_cpus();
 665 }
 666
 667 static int
 668 poweroff_vcpu(struct cpu *cp)
 669 {
 670         int error;
 671
 672         ASSERT(MUTEX_HELD(&cpu_lock));
 673
 674         ASSERT(CPU->cpu_id != cp->cpu_id);
 675         ASSERT(cp->cpu_flags & CPU_QUIESCED);
 676
 677         mp_enter_barrier();
 678
 679         if ((error = xen_vcpu_down(cp->cpu_id)) == 0) {
 680                 ASSERT(cpu_phase[cp->cpu_id] == CPU_PHASE_SAFE);
 681
 682                 CPUSET_DEL(cpu_ready_set, cp->cpu_id);
 683
 684                 cp->cpu_flags |= CPU_POWEROFF | CPU_OFFLINE;
 685                 cp->cpu_flags &=
 686                     ~(CPU_RUNNING | CPU_READY | CPU_EXISTS | CPU_ENABLE);
 687
 688                 cpu_phase[cp->cpu_id] = CPU_PHASE_POWERED_OFF;
 689
 690                 cpu_set_state(cp);
 691         }
 692
 693         mp_leave_barrier();
 694
 695         return (error);
 696 }
 697
 698 static int
 699 vcpu_config_poweroff(processorid_t id)
 700 {
 701         int oldstate;
 702         int error;
 703         cpu_t *cp;
 704
 705         mutex_enter(&cpu_lock);
 706
 707         if ((cp = cpu_get(id)) == NULL) {
 708                 mutex_exit(&cpu_lock);
 709                 return (ESRCH);
 710         }
 711
 712         if (cpu_get_state(cp) == P_POWEROFF) {
 713                 mutex_exit(&cpu_lock);
 714                 return (0);
 715         }
 716
 717         mutex_exit(&cpu_lock);
 718
 719         do {
 720                 error = p_online_internal(id, P_OFFLINE,
 721                     &oldstate);
 722
 723                 if (error != 0)
 724                         break;
 725
 726                 /*
 727                  * So we just changed it to P_OFFLINE.  But then we dropped
 728                  * cpu_lock, so now it is possible for another thread to change
 729                  * the cpu back to a different, non-quiesced state e.g.
 730                  * P_ONLINE.
 731                  */
 732                 mutex_enter(&cpu_lock);
 733                 if ((cp = cpu_get(id)) == NULL)
 734                         error = ESRCH;
 735                 else {
 736                         if (cp->cpu_flags & CPU_QUIESCED)
 737                                 error = poweroff_vcpu(cp);
 738                         else
 739                                 error = EBUSY;
 740                 }
 741                 mutex_exit(&cpu_lock);
 742         } while (error == EBUSY);
 743
 744         return (error);
 745 }
 746
 747 /*
 748  * Add a new virtual cpu to the domain.
 749  */
 750 static int
 751 vcpu_config_new(processorid_t id)
 752 {
 753         extern int start_cpu(processorid_t);
 754         int error;
 755
 756         if (ncpus == 1) {
 757                 printf("cannot (yet) add cpus to a single-cpu domain\n");
 758                 return (ENOTSUP);
 759         }
 760
 761         affinity_set(CPU_CURRENT);
 762         error = start_cpu(id);
 763         affinity_clear();
 764         return (error);
 765 }
 766
 767 static int
 768 poweron_vcpu(struct cpu *cp)
 769 {
 770         int error;
 771
 772         ASSERT(MUTEX_HELD(&cpu_lock));
 773
 774         if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cp->cpu_id, NULL) != 0) {
 775                 printf("poweron_vcpu: vcpu%d is not available!\n",
 776                     cp->cpu_id);
 777                 return (ENXIO);
 778         }
 779
 780         if ((error = xen_vcpu_up(cp->cpu_id)) == 0) {
 781                 CPUSET_ADD(cpu_ready_set, cp->cpu_id);
 782                 cp->cpu_flags |= CPU_EXISTS | CPU_READY | CPU_RUNNING;
 783                 cp->cpu_flags &= ~CPU_POWEROFF;
 784                 /*
 785                  * There are some nasty races possible here.
 786                  * Tell the vcpu it's up one more time.
 787                  * XXPV Is this enough?  Is this safe?
 788                  */
 789                 (void) xen_vcpu_up(cp->cpu_id);
 790
 791                 cpu_phase[cp->cpu_id] = CPU_PHASE_NONE;
 792
 793                 cpu_set_state(cp);
 794         }
 795         return (error);
 796 }
 797
 798 static int
 799 vcpu_config_poweron(processorid_t id)
 800 {
 801         cpu_t *cp;
 802         int oldstate;
 803         int error;
 804
 805         if (id >= ncpus)
 806                 return (vcpu_config_new(id));
 807
 808         mutex_enter(&cpu_lock);
 809
 810         if ((cp = cpu_get(id)) == NULL) {
 811                 mutex_exit(&cpu_lock);
 812                 return (ESRCH);
 813         }
 814
 815         if (cpu_get_state(cp) != P_POWEROFF) {
 816                 mutex_exit(&cpu_lock);
 817                 return (0);
 818         }
 819
 820         if ((error = poweron_vcpu(cp)) != 0) {
 821                 mutex_exit(&cpu_lock);
 822                 return (error);
 823         }
 824
 825         mutex_exit(&cpu_lock);
 826
 827         return (p_online_internal(id, P_ONLINE, &oldstate));
 828 }
 829
 830 #define REPORT_LEN      128
 831
 832 static void
 833 vcpu_config_report(processorid_t id, uint_t newstate, int error)
 834 {
 835         char *report = kmem_alloc(REPORT_LEN, KM_SLEEP);
 836         size_t len;
 837         char *ps;
 838
 839         switch (newstate) {
 840         case P_ONLINE:
 841                 ps = PS_ONLINE;
 842                 break;
 843         case P_POWEROFF:
 844                 ps = PS_POWEROFF;
 845                 break;
 846         default:
 847                 cmn_err(CE_PANIC, "unknown state %u\n", newstate);
 848                 break;
 849         }
 850
 851         len = snprintf(report, REPORT_LEN,
 852             "cpu%d: externally initiated %s", id, ps);
 853
 854         if (!error) {
 855                 cmn_err(CE_CONT, "!%s\n", report);
 856                 kmem_free(report, REPORT_LEN);
 857                 return;
 858         }
 859
 860         len += snprintf(report + len, REPORT_LEN - len,
 861             " failed, error %d: ", error);
 862         switch (error) {
 863         case EEXIST:
 864                 len += snprintf(report + len, REPORT_LEN - len,
 865                     "cpu already %s", ps ? ps : "?");
 866                 break;
 867         case ESRCH:
 868                 len += snprintf(report + len, REPORT_LEN - len,
 869                     "cpu not found");
 870                 break;
 871         case EINVAL:
 872         case EALREADY:
 873                 break;
 874         case EPERM:
 875                 len += snprintf(report + len, REPORT_LEN - len,
 876                     "insufficient privilege (0x%x)", id);
 877                 break;
 878         case EBUSY:
 879                 switch (newstate) {
 880                 case P_ONLINE:
 881                         /*
 882                          * This return comes from mp_cpu_start -
 883                          * we cannot 'start' the boot CPU.
 884                          */
 885                         len += snprintf(report + len, REPORT_LEN - len,
 886                             "already running");
 887                         break;
 888                 case P_POWEROFF:
 889                         len += snprintf(report + len, REPORT_LEN - len,
 890                             "bound lwps?");
 891                         break;
 892                 default:
 893                         break;
 894                 }
 895         default:
 896                 break;
 897         }
 898
 899         cmn_err(CE_CONT, "%s\n", report);
 900         kmem_free(report, REPORT_LEN);
 901 }
 902
 903 static void
 904 vcpu_config(void *arg)
 905 {
 906         int id = (int)(uintptr_t)arg;
 907         int error;
 908         char dir[16];
 909         char *state;
 910
 911         if ((uint_t)id >= max_ncpus) {
 912                 cmn_err(CE_WARN,
 913                     "vcpu_config: cpu%d does not fit in this domain", id);
 914                 return;
 915         }
 916
 917         (void) snprintf(dir, sizeof (dir), "cpu/%d", id);
 918         state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 919         if (xenbus_scanf(XBT_NULL, dir, "availability", "%s", state) == 0) {
 920                 if (strcmp(state, "online") == 0) {
 921                         error = vcpu_config_poweron(id);
 922                         vcpu_config_report(id, P_ONLINE, error);
 923                 } else if (strcmp(state, "offline") == 0) {
 924                         error = vcpu_config_poweroff(id);
 925                         vcpu_config_report(id, P_POWEROFF, error);
 926                 } else {
 927                         cmn_err(CE_WARN,
 928                             "cpu%d: unknown target state '%s'", id, state);
 929                 }
 930         } else
 931                 cmn_err(CE_WARN,
 932                     "cpu%d: unable to read target state from xenstore", id);
 933
 934         kmem_free(state, MAXPATHLEN);
 935 }
 936
 937 /*ARGSUSED*/
 938 static void
 939 vcpu_config_event(struct xenbus_watch *watch, const char **vec, uint_t len)
 940 {
 941         const char *path = vec[XS_WATCH_PATH];
 942         processorid_t id;
 943         char *s;
 944
 945         if ((s = strstr(path, "cpu/")) != NULL &&
 946             sscanf(s, "cpu/%d", &id) == 1) {
 947                 /*
 948                  * Run the virtual CPU configuration on a separate thread to
 949                  * avoid blocking on this event for too long (and for now,
 950                  * to ensure configuration requests are serialized.)
 951                  */
 952                 (void) taskq_dispatch(cpu_config_tq,
 953                     vcpu_config, (void *)(uintptr_t)id, 0);
 954         }
 955 }
 956
 957 static int
 958 xen_vcpu_initialize(processorid_t id, vcpu_guest_context_t *vgc)
 959 {
 960         int err;
 961
 962         if ((err = HYPERVISOR_vcpu_op(VCPUOP_initialise, id, vgc)) != 0) {
 963                 char *str;
 964                 int level = CE_WARN;
 965
 966                 switch (err) {
 967                 case -X_EINVAL:
 968                         /*
 969                          * This interface squashes multiple error sources
 970                          * to one error code.  In particular, an X_EINVAL
 971                          * code can mean:
 972                          *
 973                          * -    the vcpu id is out of range
 974                          * -    cs or ss are in ring 0
 975                          * -    cr3 is wrong
 976                          * -    an entry in the new gdt is above the
 977                          *      reserved entry
 978                          * -    a frame underneath the new gdt is bad
 979                          */
 980                         str = "something is wrong :(";
 981                         break;
 982                 case -X_ENOENT:
 983                         str = "no such cpu";
 984                         break;
 985                 case -X_ENOMEM:
 986                         str = "no mem to copy ctxt";
 987                         break;
 988                 case -X_EFAULT:
 989                         str = "bad address";
 990                         break;
 991                 case -X_EEXIST:
 992                         /*
 993                          * Hmm.  This error is returned if the vcpu has already
 994                          * been initialized once before in the lifetime of this
 995                          * domain.  This is a logic error in the kernel.
 996                          */
 997                         level = CE_PANIC;
 998                         str = "already initialized";
 999                         break;
1000                 default:
1001                         level = CE_PANIC;
1002                         str = "<unexpected>";
1003                         break;
1004                 }
1005
1006                 cmn_err(level, "vcpu%d: failed to init: error %d: %s",
1007                     id, -err, str);
1008         }
1009         return (err);
1010 }
1011
1012 long
1013 xen_vcpu_up(processorid_t id)
1014 {
1015         long err;
1016
1017         if ((err = HYPERVISOR_vcpu_op(VCPUOP_up, id, NULL)) != 0) {
1018                 char *str;
1019
1020                 switch (err) {
1021                 case -X_ENOENT:
1022                         str = "no such cpu";
1023                         break;
1024                 case -X_EINVAL:
1025                         /*
1026                          * Perhaps this is diagnostic overkill.
1027                          */
1028                         if (HYPERVISOR_vcpu_op(VCPUOP_is_up, id, NULL) < 0)
1029                                 str = "bad cpuid";
1030                         else
1031                                 str = "not initialized";
1032                         break;
1033                 default:
1034                         str = "<unexpected>";
1035                         break;
1036                 }
1037
1038                 printf("vcpu%d: failed to start: error %d: %s\n",
1039                     id, -(int)err, str);
1040                 return (EBFONT);        /* deliberately silly */
1041         }
1042         return (err);
1043 }
1044
1045 long
1046 xen_vcpu_down(processorid_t id)
1047 {
1048         long err;
1049
1050         if ((err = HYPERVISOR_vcpu_op(VCPUOP_down, id, NULL)) != 0) {
1051                 /*
1052                  * X_ENOENT:    no such cpu
1053                  * X_EINVAL:    bad cpuid
1054                  */
1055                 panic("vcpu%d: failed to stop: error %d", id, -(int)err);
1056         }
1057
1058         return (err);
1059 }