/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 * Copyright (c) 2019 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */
/*
 * Copyright 2014 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 * Copyright 2023 Oxide Computer Company
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/cpuset.h>

#include <x86/specialreg.h>
#include <x86/apicreg.h>

#include <machine/clock.h>

#include <machine/vmm.h>
#include <sys/vmm_kernel.h>

#include "vmm_lapic.h"
#include "vmm_stat.h"

#include "vlapic.h"
#include "vlapic_priv.h"
#include "vioapic.h"
/*
 * The 4 high bits of a given interrupt vector represent its priority.  The
 * same is true for the contents of the TPR when it is used to calculate the
 * ultimate PPR of an APIC - the 4 high bits hold the priority.
 */
#define	PRIO(x)			((x) & 0xf0)
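
/*
 * For illustration: vectors 0x30 through 0x3f all belong to priority class 3,
 * so PRIO(0x35) == 0x30.  With the TPR (and thus PPR) at 0x30, vector 0x35
 * would be withheld while vector 0x41 (class 4) would still be deliverable.
 */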
#define	VLAPIC_VERSION		(0x14)

/*
 * The 'vlapic->timer_lock' is used to provide mutual exclusion between the
 * vlapic_callout_handler() and vcpu accesses to:
 * - timer_freq_bt, timer_period_bt, timer_fire_bt
 * - timer LVT register
 */
#define	VLAPIC_TIMER_LOCK(vlapic)	mutex_enter(&((vlapic)->timer_lock))
#define	VLAPIC_TIMER_UNLOCK(vlapic)	mutex_exit(&((vlapic)->timer_lock))
#define	VLAPIC_TIMER_LOCKED(vlapic)	MUTEX_HELD(&((vlapic)->timer_lock))

/*
 * APIC timer frequency:
 * - arbitrary but chosen to be in the ballpark of contemporary hardware.
 * - power-of-two to avoid loss of precision when calculating times
 */
#define	VLAPIC_BUS_FREQ		(128 * 1024 * 1024)
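
/*
 * For illustration: because the DCR divisors are also powers of two, the
 * effective timer frequency always divides evenly; e.g. divide-by-16 yields
 * exactly 8 * 1024 * 1024 timer ticks per second.
 */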
#define	APICBASE_ADDR_MASK	0xfffffffffffff000UL

#define	APIC_VALID_MASK_ESR	(APIC_ESR_SEND_CS_ERROR | \
		APIC_ESR_RECEIVE_CS_ERROR | APIC_ESR_SEND_ACCEPT | \
		APIC_ESR_RECEIVE_ACCEPT | APIC_ESR_SEND_ILLEGAL_VECTOR | \
		APIC_ESR_RECEIVE_ILLEGAL_VECTOR | APIC_ESR_ILLEGAL_REGISTER)

static void vlapic_set_error(struct vlapic *, uint32_t, bool);
static void vlapic_callout_handler(void *arg);

static __inline bool
vlapic_x2mode(const struct vlapic *vlapic)
{
	return ((vlapic->msr_apicbase & APICBASE_X2APIC) != 0);
}

static __inline bool
vlapic_hw_disabled(const struct vlapic *vlapic)
{
	return ((vlapic->msr_apicbase & APICBASE_ENABLED) == 0);
}

static __inline bool
vlapic_sw_disabled(const struct vlapic *vlapic)
{
	const struct LAPIC *lapic = vlapic->apic_page;

	return ((lapic->svr & APIC_SVR_ENABLE) == 0);
}

static __inline bool
vlapic_enabled(const struct vlapic *vlapic)
{
	return (!vlapic_hw_disabled(vlapic) && !vlapic_sw_disabled(vlapic));
}

static __inline uint32_t
vlapic_get_id(const struct vlapic *vlapic)
{
	if (vlapic_x2mode(vlapic))
		return (vlapic->vcpuid);
	else
		return (vlapic->vcpuid << 24);
}
static uint32_t
x2apic_ldr(const struct vlapic *vlapic)
{
	int apicid;
	uint32_t ldr;

	apicid = vlapic_get_id(vlapic);
	ldr = 1 << (apicid & 0xf);
	ldr |= (apicid & 0xffff0) << 12;
	return (ldr);
}
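
/*
 * For illustration: an x2APIC ID of 0x26 yields logical ID (1 << 6) = 0x40
 * within cluster 0x2, i.e. an LDR value of 0x00020040.
 */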
void
vlapic_dfr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	if (vlapic_x2mode(vlapic)) {
		/* Ignore write to DFR in x2APIC mode */
		lapic->dfr = 0;
		return;
	}

	lapic->dfr &= APIC_DFR_MODEL_MASK;
	lapic->dfr |= APIC_DFR_RESERVED;
}

void
vlapic_ldr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;

	/* LDR is read-only in x2apic mode */
	if (vlapic_x2mode(vlapic)) {
		/* Ignore write to LDR in x2APIC mode */
		lapic->ldr = x2apic_ldr(vlapic);
	} else {
		lapic->ldr &= ~APIC_LDR_RESERVED;
	}
}

void
vlapic_id_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	/*
	 * We don't allow the ID register to be modified so reset it back to
	 * its default value.
	 */
	lapic = vlapic->apic_page;
	lapic->id = vlapic_get_id(vlapic);
}
static int
vlapic_timer_divisor(uint32_t dcr)
{
	switch (dcr & 0xB) {
	case APIC_TDCR_1:
		return (1);
	case APIC_TDCR_2:
		return (2);
	case APIC_TDCR_4:
		return (4);
	case APIC_TDCR_8:
		return (8);
	case APIC_TDCR_16:
		return (16);
	case APIC_TDCR_32:
		return (32);
	case APIC_TDCR_64:
		return (64);
	case APIC_TDCR_128:
		return (128);
	default:
		panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
	}
}
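
/*
 * The divide configuration is encoded in DCR bits 0, 1 and 3 (hence the 0xB
 * mask above): e.g. an encoding of 0x0 selects divide-by-2 while 0xB (all
 * three bits set) selects divide-by-1.
 */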
#if 0
static inline void
vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
{
	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
	    *lvt & APIC_LVTT_M);
}
#endif

static uint32_t
vlapic_get_ccr(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	uint32_t ccr;

	ccr = 0;
	lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_active(&vlapic->callout)) {
		/*
		 * If the timer is scheduled to expire in the future then
		 * compute the value of 'ccr' based on the remaining time.
		 */
		const hrtime_t now = gethrtime();
		if (vlapic->timer_fire_when > now) {
			ccr += hrt_freq_count(vlapic->timer_fire_when - now,
			    vlapic->timer_cur_freq);
		}
	}

	/*
	 * Clamp CCR value to that programmed in ICR - its theoretical maximum.
	 * Normal operation should never result in this being necessary.  Only
	 * strange circumstances due to state importation as part of instance
	 * save/restore or live-migration require such wariness.
	 */
	if (ccr > lapic->icr_timer) {
		ccr = lapic->icr_timer;
		vlapic->stats.vs_clamp_ccr++;
	}
	VLAPIC_TIMER_UNLOCK(vlapic);
	return (ccr);
}
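
/*
 * For illustration: with the divider at 16 (a timer_cur_freq of 8388608Hz)
 * and 1ms remaining until expiry, the computed CCR would be roughly 8389
 * ticks.
 */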
static void
vlapic_update_divider(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	ASSERT(VLAPIC_TIMER_LOCKED(vlapic));

	vlapic->timer_cur_freq =
	    VLAPIC_BUS_FREQ / vlapic_timer_divisor(lapic->dcr_timer);
	vlapic->timer_period =
	    hrt_freq_interval(vlapic->timer_cur_freq, lapic->icr_timer);
}

void
vlapic_dcr_write_handler(struct vlapic *vlapic)
{
	/*
	 * Update the timer frequency and the timer period.
	 *
	 * XXX changes to the frequency divider will not take effect until
	 * the timer is reloaded.
	 */
	VLAPIC_TIMER_LOCK(vlapic);
	vlapic_update_divider(vlapic);
	VLAPIC_TIMER_UNLOCK(vlapic);
}

void
vlapic_esr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	lapic->esr = vlapic->esr_pending;
	vlapic->esr_pending = 0;
}
vcpu_notify_t
vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
{
	struct LAPIC *lapic;
	uint32_t *irrptr, *tmrptr, mask, tmr;
	int idx;

	KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));

	lapic = vlapic->apic_page;
	if (!(lapic->svr & APIC_SVR_ENABLE)) {
		/* ignore interrupt on software-disabled APIC */
		return (VCPU_NOTIFY_NONE);
	}

	if (vector < 16) {
		vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR,
		    false);

		/*
		 * If the error LVT is configured to interrupt the vCPU, it
		 * will have delivered a notification through that mechanism.
		 */
		return (VCPU_NOTIFY_NONE);
	}

	if (vlapic->ops.set_intr_ready) {
		return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));
	}

	idx = (vector / 32) * 4;
	mask = 1 << (vector % 32);
	tmrptr = &lapic->tmr0;
	irrptr = &lapic->irr0;

	/*
	 * Update TMR for requested vector, if necessary.
	 * This must be done prior to asserting the bit in IRR so that the
	 * proper TMR state is always visible before the to-be-queued interrupt
	 * can be injected.
	 */
	tmr = atomic_load_acq_32(&tmrptr[idx]);
	if ((tmr & mask) != (level ? mask : 0)) {
		if (level) {
			atomic_set_int(&tmrptr[idx], mask);
		} else {
			atomic_clear_int(&tmrptr[idx], mask);
		}
	}

	/* Now set the bit in IRR */
	atomic_set_int(&irrptr[idx], mask);

	return (VCPU_NOTIFY_EXIT);
}
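
/*
 * A note on the indexing above: the 256 IRR/ISR/TMR bits are exposed as eight
 * 32-bit registers, each aligned on a 16-byte boundary in the APIC page,
 * hence the stride of 4 uint32_t slots per register.  For example, vector
 * 0x41 lands in the third register (idx = (0x41 / 32) * 4 = 8) at bit
 * position 0x41 % 32 = 1.
 */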
static __inline uint32_t *
vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
{
	struct LAPIC *lapic = vlapic->apic_page;
	int i;

	switch (offset) {
	case APIC_OFFSET_CMCI_LVT:
		return (&lapic->lvt_cmci);
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
		return ((&lapic->lvt_timer) + i);
	default:
		panic("vlapic_get_lvt: invalid LVT\n");
	}
}

static __inline int
lvt_off_to_idx(uint32_t offset)
{
	int index;

	switch (offset) {
	case APIC_OFFSET_CMCI_LVT:
		index = APIC_LVT_CMCI;
		break;
	case APIC_OFFSET_TIMER_LVT:
		index = APIC_LVT_TIMER;
		break;
	case APIC_OFFSET_THERM_LVT:
		index = APIC_LVT_THERMAL;
		break;
	case APIC_OFFSET_PERF_LVT:
		index = APIC_LVT_PMC;
		break;
	case APIC_OFFSET_LINT0_LVT:
		index = APIC_LVT_LINT0;
		break;
	case APIC_OFFSET_LINT1_LVT:
		index = APIC_LVT_LINT1;
		break;
	case APIC_OFFSET_ERROR_LVT:
		index = APIC_LVT_ERROR;
		break;
	default:
		index = -1;
		break;
	}
	KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
	    "invalid lvt index %d for offset %x", index, offset));

	return (index);
}

static __inline uint32_t
vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
{
	int idx;
	uint32_t val;

	idx = lvt_off_to_idx(offset);
	val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
	return (val);
}

void
vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
{
	uint32_t *lvtptr, mask, val;
	struct LAPIC *lapic;
	int idx;

	lapic = vlapic->apic_page;
	lvtptr = vlapic_get_lvtptr(vlapic, offset);
	val = *lvtptr;
	idx = lvt_off_to_idx(offset);

	if (!(lapic->svr & APIC_SVR_ENABLE))
		val |= APIC_LVT_M;
	mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
	switch (offset) {
	case APIC_OFFSET_TIMER_LVT:
		mask |= APIC_LVTT_TM;
		break;
	case APIC_OFFSET_ERROR_LVT:
		break;
	case APIC_OFFSET_LINT0_LVT:
	case APIC_OFFSET_LINT1_LVT:
		mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
		/* FALLTHROUGH */
	default:
		mask |= APIC_LVT_DM;
		break;
	}
	val &= mask;
	*lvtptr = val;
	atomic_store_rel_32(&vlapic->lvt_last[idx], val);
}
static void
vlapic_refresh_lvts(struct vlapic *vlapic)
{
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);
	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
}

static void
vlapic_mask_lvts(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	lapic->lvt_cmci |= APIC_LVT_M;
	lapic->lvt_timer |= APIC_LVT_M;
	lapic->lvt_thermal |= APIC_LVT_M;
	lapic->lvt_pcint |= APIC_LVT_M;
	lapic->lvt_lint0 |= APIC_LVT_M;
	lapic->lvt_lint1 |= APIC_LVT_M;
	lapic->lvt_error |= APIC_LVT_M;
	vlapic_refresh_lvts(vlapic);
}

static int
vlapic_fire_lvt(struct vlapic *vlapic, uint_t lvt)
{
	uint32_t mode, reg, vec;
	vcpu_notify_t notify;

	reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]);

	if (reg & APIC_LVT_M)
		return (0);
	vec = reg & APIC_LVT_VECTOR;
	mode = reg & APIC_LVT_DM;

	switch (mode) {
	case APIC_LVT_DM_FIXED:
		if (vec < 16) {
			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR,
			    lvt == APIC_LVT_ERROR);
			return (0);
		}
		notify = vlapic_set_intr_ready(vlapic, vec, false);
		vcpu_notify_event_type(vlapic->vm, vlapic->vcpuid, notify);
		break;
	case APIC_LVT_DM_NMI:
		(void) vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
		break;
	case APIC_LVT_DM_EXTINT:
		(void) vm_inject_extint(vlapic->vm, vlapic->vcpuid);
		break;
	default:
		/* Other modes ignored */
		return (0);
	}
	return (1);
}
static uint_t
vlapic_active_isr(struct vlapic *vlapic)
{
	int i;
	uint32_t *isrp;

	isrp = &vlapic->apic_page->isr7;

	for (i = 7; i >= 0; i--, isrp -= 4) {
		uint32_t reg = *isrp;

		if (reg != 0) {
			uint_t vec = (i * 32) + bsrl(reg);

			if (vec < 16) {
				/*
				 * Truncate the illegal low vectors to a value
				 * of 0, indicating that no active ISR was
				 * found.
				 */
				return (0);
			}
			return (vec);
		}
	}

	return (0);
}
/*
 * After events which might arbitrarily change the value of PPR, such as a TPR
 * write or an EOI, calculate that new PPR value and store it in the APIC page.
 */
static void
vlapic_update_ppr(struct vlapic *vlapic)
{
	int isrvec, tpr, ppr;

	isrvec = vlapic_active_isr(vlapic);
	tpr = vlapic->apic_page->tpr;

	/*
	 * Algorithm adopted from section "Interrupt, Task and Processor
	 * Priority" in Intel Architecture Manual Vol 3a.
	 */
	if (PRIO(tpr) >= PRIO(isrvec)) {
		ppr = tpr;
	} else {
		ppr = PRIO(isrvec);
	}

	vlapic->apic_page->ppr = ppr;
}
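
/*
 * For illustration: with the TPR at 0x20 and vector 0x35 in-service, the PPR
 * becomes PRIO(0x35) = 0x30.  If the TPR were then raised to 0x42, the PPR
 * would follow it to 0x42.
 */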
/*
 * When a vector is asserted in ISR as in-service, the PPR must be raised to
 * the priority of that vector, as the vCPU would have been at a lower
 * priority in order for the vector to be accepted.
 */
static void
vlapic_raise_ppr(struct vlapic *vlapic, int vec)
{
	struct LAPIC *lapic = vlapic->apic_page;
	int ppr;

	ppr = PRIO(vec);

	lapic->ppr = ppr;
}

void
vlapic_sync_tpr(struct vlapic *vlapic)
{
	vlapic_update_ppr(vlapic);
}

static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt");

static void
vlapic_process_eoi(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;
	uint32_t *isrptr, *tmrptr;
	int i;
	uint_t idx, bitpos, vector;

	isrptr = &lapic->isr0;
	tmrptr = &lapic->tmr0;

	for (i = 7; i >= 0; i--) {
		idx = i * 4;
		if (isrptr[idx] != 0) {
			bitpos = bsrl(isrptr[idx]);
			vector = i * 32 + bitpos;

			isrptr[idx] &= ~(1 << bitpos);
			vlapic_update_ppr(vlapic);
			if ((tmrptr[idx] & (1 << bitpos)) != 0) {
				vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
				    vector);
			}
			return;
		}
	}
	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1);
}

static __inline int
vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
{
	return (lvt & mask);
}

static __inline int
vlapic_periodic_timer(struct vlapic *vlapic)
{
	uint32_t lvt;

	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);

	return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
}
static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");

static void
vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error)
{
	vlapic->esr_pending |= mask;

	/*
	 * Avoid infinite recursion if the error LVT itself is configured with
	 * an illegal vector.
	 */
	if (lvt_error)
		return;

	if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) {
		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1);
	}
}

static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");

static void
vlapic_fire_timer(struct vlapic *vlapic)
{
	ASSERT(VLAPIC_TIMER_LOCKED(vlapic));

	if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) {
		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
	}
}

static VMM_STAT(VLAPIC_INTR_CMC,
    "corrected machine check interrupts generated by vlapic");

void
vlapic_fire_cmci(struct vlapic *vlapic)
{
	if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) {
		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1);
	}
}
static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1,
    "lvts triggered");

int
vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
{
	if (!vlapic_enabled(vlapic)) {
		/*
		 * When the local APIC is global/hardware disabled,
		 * LINT[1:0] pins are configured as INTR and NMI pins,
		 * respectively.
		 */
		switch (vector) {
		case APIC_LVT_LINT0:
			(void) vm_inject_extint(vlapic->vm,
			    vlapic->vcpuid);
			break;
		case APIC_LVT_LINT1:
			(void) vm_inject_nmi(vlapic->vm,
			    vlapic->vcpuid);
			break;
		default:
			break;
		}
		return (0);
	}

	switch (vector) {
	case APIC_LVT_LINT0:
	case APIC_LVT_LINT1:
	case APIC_LVT_TIMER:
	case APIC_LVT_ERROR:
	case APIC_LVT_PMC:
	case APIC_LVT_THERMAL:
	case APIC_LVT_CMCI:
		if (vlapic_fire_lvt(vlapic, vector)) {
			vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
			    LVTS_TRIGGERRED, vector, 1);
		}
		break;
	default:
		return (EINVAL);
	}
	return (0);
}

static void
vlapic_callout_reset(struct vlapic *vlapic)
{
	callout_reset_hrtime(&vlapic->callout, vlapic->timer_fire_when,
	    vlapic_callout_handler, vlapic, C_ABSOLUTE);
}
static void
vlapic_callout_handler(void *arg)
{
	struct vlapic *vlapic = arg;

	VLAPIC_TIMER_LOCK(vlapic);
	if (callout_pending(&vlapic->callout))	/* callout was reset */
		goto done;

	if (!callout_active(&vlapic->callout))	/* callout was stopped */
		goto done;

	callout_deactivate(&vlapic->callout);

	vlapic_fire_timer(vlapic);

	if (vlapic_periodic_timer(vlapic)) {
		/*
		 * Compute the delta between when the timer was supposed to
		 * fire and the present time.  We can depend on the fact that
		 * cyclics (which underlie these callouts) will never be
		 * called early.
		 */
		const hrtime_t now = gethrtime();
		const hrtime_t delta = now - vlapic->timer_fire_when;
		if (delta >= vlapic->timer_period) {
			/*
			 * If we are so behind that we have missed an entire
			 * timer period, reset the time base rather than
			 * attempting to catch up.
			 */
			vlapic->timer_fire_when = now + vlapic->timer_period;
		} else {
			vlapic->timer_fire_when += vlapic->timer_period;
		}
		vlapic_callout_reset(vlapic);
	} else {
		/*
		 * Clear the target time so that logic can distinguish a timer
		 * which has fired (where the value is zero) from one which is
		 * held pending due to the instance being paused (where the
		 * value is non-zero, but the callout is not pending).
		 */
		vlapic->timer_fire_when = 0;
	}
done:
	VLAPIC_TIMER_UNLOCK(vlapic);
}
void
vlapic_icrtmr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);
	vlapic->timer_period = hrt_freq_interval(vlapic->timer_cur_freq,
	    lapic->icr_timer);
	if (vlapic->timer_period != 0) {
		vlapic->timer_fire_when = gethrtime() + vlapic->timer_period;
		vlapic_callout_reset(vlapic);
	} else {
		vlapic->timer_fire_when = 0;
		callout_stop(&vlapic->callout);
	}
	VLAPIC_TIMER_UNLOCK(vlapic);
}
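
/*
 * For illustration: writing 0x100000 to the initial count register with the
 * divider at 16 (8388608Hz) arms a one-shot expiry 1048576 / 8388608 = 0.125
 * seconds out, while writing 0 simply stops the timer.
 */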
/*
 * This function populates 'dmask' with the set of vcpus that match the
 * addressing specified by the (dest, phys, lowprio) tuple.
 *
 * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
 * or xAPIC (8-bit) destination field.
 */
void
vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
    bool lowprio, bool x2apic_dest)
{
	struct vlapic *vlapic;
	uint32_t dfr, ldr, ldest, cluster;
	uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
	cpuset_t amask;
	int vcpuid;

	if ((x2apic_dest && dest == 0xffffffff) ||
	    (!x2apic_dest && dest == 0xff)) {
		/*
		 * Broadcast in both logical and physical modes.
		 */
		*dmask = vm_active_cpus(vm);
		return;
	}

	if (phys) {
		/*
		 * Physical mode: destination is APIC ID.
		 */
		CPU_ZERO(dmask);
		vcpuid = vm_apicid2vcpuid(vm, dest);
		amask = vm_active_cpus(vm);
		if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask))
			CPU_SET(vcpuid, dmask);
	} else {
		/*
		 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
		 * bitmask.  This model is only available in the xAPIC mode.
		 */
		mda_flat_ldest = dest & 0xff;

		/*
		 * In the "Cluster Model" the MDA is used to identify a
		 * specific cluster and a set of APICs in that cluster.
		 */
		if (x2apic_dest) {
			mda_cluster_id = dest >> 16;
			mda_cluster_ldest = dest & 0xffff;
		} else {
			mda_cluster_id = (dest >> 4) & 0xf;
			mda_cluster_ldest = dest & 0xf;
		}

		/*
		 * Logical mode: match each APIC that has a bit set
		 * in its LDR that matches a bit in the ldest.
		 */
		CPU_ZERO(dmask);
		amask = vm_active_cpus(vm);
		while ((vcpuid = CPU_FFS(&amask)) != 0) {
			vcpuid--;
			CPU_CLR(vcpuid, &amask);

			vlapic = vm_lapic(vm, vcpuid);
			dfr = vlapic->apic_page->dfr;
			ldr = vlapic->apic_page->ldr;

			if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_FLAT) {
				ldest = ldr >> 24;
				mda_ldest = mda_flat_ldest;
			} else if ((dfr & APIC_DFR_MODEL_MASK) ==
			    APIC_DFR_MODEL_CLUSTER) {
				if (vlapic_x2mode(vlapic)) {
					cluster = ldr >> 16;
					ldest = ldr & 0xffff;
				} else {
					cluster = ldr >> 28;
					ldest = (ldr >> 24) & 0xf;
				}
				if (cluster != mda_cluster_id)
					continue;
				mda_ldest = mda_cluster_ldest;
			} else {
				/*
				 * Guest has configured a bad logical
				 * model for this vcpu - skip it.
				 */
				continue;
			}

			if ((mda_ldest & ldest) != 0) {
				CPU_SET(vcpuid, dmask);
				if (lowprio)
					break;
			}
		}
	}
}
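
/*
 * For illustration: in xAPIC flat logical mode, an MDA of 0x05 matches any
 * vCPU with bit 0 or bit 2 set in the high byte of its LDR.  In xAPIC cluster
 * mode, an MDA of 0x23 only matches vCPUs in cluster 0x2 whose logical ID
 * overlaps 0x3.
 */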
static VMM_STAT(VLAPIC_IPI_SEND, "ipis sent from vcpu");
static VMM_STAT(VLAPIC_IPI_RECV, "ipis received by vcpu");

static void
vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
{
	struct LAPIC *lapic = vlapic->apic_page;

	if (lapic->tpr != val) {
		lapic->tpr = val;
		vlapic_update_ppr(vlapic);
	}
}

void
vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
{
	uint8_t tpr;

	if (val & ~0xf) {
		vm_inject_gp(vlapic->vm, vlapic->vcpuid);
		return;
	}

	tpr = val << 4;
	vlapic_set_tpr(vlapic, tpr);
}

uint64_t
vlapic_get_cr8(const struct vlapic *vlapic)
{
	const struct LAPIC *lapic = vlapic->apic_page;

	return (lapic->tpr >> 4);
}
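
/*
 * For illustration: CR8 carries only the high nibble of the TPR, so a guest
 * "mov $0x3, %cr8" is equivalent to an MMIO write of 0x30 to the TPR, and a
 * subsequent CR8 read returns 0x3.
 */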
static bool
vlapic_is_icr_valid(uint64_t icrval)
{
	uint32_t mode = icrval & APIC_DELMODE_MASK;
	uint32_t level = icrval & APIC_LEVEL_MASK;
	uint32_t trigger = icrval & APIC_TRIGMOD_MASK;
	uint32_t shorthand = icrval & APIC_DEST_MASK;

	switch (mode) {
	case APIC_DELMODE_FIXED:
		if (trigger == APIC_TRIGMOD_EDGE)
			return (true);
		/*
		 * AMD allows a level assert IPI and Intel converts a level
		 * assert IPI into an edge IPI.
		 */
		if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT)
			return (true);
		break;
	case APIC_DELMODE_LOWPRIO:
	case APIC_DELMODE_SMI:
	case APIC_DELMODE_NMI:
	case APIC_DELMODE_INIT:
		if (trigger == APIC_TRIGMOD_EDGE &&
		    (shorthand == APIC_DEST_DESTFLD ||
		    shorthand == APIC_DEST_ALLESELF)) {
			return (true);
		}
		/*
		 * AMD allows a level assert IPI and Intel converts a level
		 * assert IPI into an edge IPI.
		 */
		if (trigger == APIC_TRIGMOD_LEVEL &&
		    level == APIC_LEVEL_ASSERT &&
		    (shorthand == APIC_DEST_DESTFLD ||
		    shorthand == APIC_DEST_ALLESELF)) {
			return (true);
		}
		/*
		 * A level-triggered deassert INIT is defined in the Intel
		 * Multiprocessor Specification and the Intel Software
		 * Developer Manual.  Per the MPS, a CPU is started by sending
		 * it a level assert INIT followed by a level deassert INIT.
		 * Some operating systems, e.g. FreeBSD and Linux, use that
		 * algorithm.  According to the SDM, a level deassert INIT is
		 * only supported by Pentium and P6 processors.  It is always
		 * sent to all CPUs, regardless of the destination or
		 * shorthand field, and it resets the arbitration ID register.
		 * That register is not software accessible and is only
		 * required for APIC bus arbitration, so the level deassert
		 * INIT needs no emulation and should be ignored.  The SDM
		 * also states that newer processors do not support the level
		 * deassert INIT; since it is defined for older systems, it
		 * cannot be treated as invalid per se without breaking
		 * backward compatibility.  Returning false here causes it to
		 * be ignored, which is the desired behaviour.
		 */
		if (mode == APIC_DELMODE_INIT &&
		    trigger == APIC_TRIGMOD_LEVEL &&
		    level == APIC_LEVEL_DEASSERT) {
			return (false);
		}
		break;
	case APIC_DELMODE_STARTUP:
		if (shorthand == APIC_DEST_DESTFLD ||
		    shorthand == APIC_DEST_ALLESELF) {
			return (true);
		}
		break;
	case APIC_DELMODE_RR:
		/* Only available on AMD! */
		if (trigger == APIC_TRIGMOD_EDGE &&
		    shorthand == APIC_DEST_DESTFLD) {
			return (true);
		}
		break;
	case APIC_DELMODE_RESV:
		return (false);
	default:
		panic("vlapic_is_icr_valid: invalid mode 0x%08x", mode);
	}

	return (false);
}
void
vlapic_icrlo_write_handler(struct vlapic *vlapic)
{
	int i;
	cpuset_t dmask;
	uint64_t icrval;
	uint32_t dest, vec, mode, dsh;
	struct LAPIC *lapic;

	lapic = vlapic->apic_page;
	lapic->icr_lo &= ~APIC_DELSTAT_PEND;
	icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;

	/*
	 * Ignore invalid combinations of the icr.
	 */
	if (!vlapic_is_icr_valid(icrval))
		return;

	if (vlapic_x2mode(vlapic))
		dest = icrval >> 32;
	else
		dest = icrval >> (32 + 24);
	vec = icrval & APIC_VECTOR_MASK;
	mode = icrval & APIC_DELMODE_MASK;
	dsh = icrval & APIC_DEST_MASK;

	if (mode == APIC_DELMODE_FIXED && vec < 16) {
		vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false);
		return;
	}

	if (mode == APIC_DELMODE_INIT &&
	    (icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
		/* No work required to deassert INIT */
		return;
	}

	switch (dsh) {
	case APIC_DEST_DESTFLD:
		vlapic_calcdest(vlapic->vm, &dmask, dest,
		    (icrval & APIC_DESTMODE_LOG) == 0, false,
		    vlapic_x2mode(vlapic));
		break;
	case APIC_DEST_SELF:
		CPU_SETOF(vlapic->vcpuid, &dmask);
		break;
	case APIC_DEST_ALLISELF:
		dmask = vm_active_cpus(vlapic->vm);
		break;
	case APIC_DEST_ALLESELF:
		dmask = vm_active_cpus(vlapic->vm);
		CPU_CLR(vlapic->vcpuid, &dmask);
		break;
	default:
		/*
		 * All possible delivery notations are covered above.
		 * We should never end up here.
		 */
		panic("unknown delivery shorthand: %x", dsh);
	}

	while ((i = CPU_FFS(&dmask)) != 0) {
		i--;
		CPU_CLR(i, &dmask);
		switch (mode) {
		case APIC_DELMODE_FIXED:
			(void) lapic_intr_edge(vlapic->vm, i, vec);
			vmm_stat_incr(vlapic->vm, vlapic->vcpuid,
			    VLAPIC_IPI_SEND, 1);
			vmm_stat_incr(vlapic->vm, i,
			    VLAPIC_IPI_RECV, 1);
			break;
		case APIC_DELMODE_NMI:
			(void) vm_inject_nmi(vlapic->vm, i);
			break;
		case APIC_DELMODE_INIT:
			(void) vm_inject_init(vlapic->vm, i);
			break;
		case APIC_DELMODE_STARTUP:
			(void) vm_inject_sipi(vlapic->vm, i, vec);
			break;
		case APIC_DELMODE_LOWPRIO:
		case APIC_DELMODE_SMI:
		default:
			/* Unhandled IPI modes (for now) */
			break;
		}
	}
}
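
/*
 * For illustration (typical guest values): the classic MP startup sequence
 * arrives here as an ICR write of 0x000c4500 (INIT, level assert,
 * all-excluding-self) followed by 0x000c46XX (SIPI with start vector XX),
 * which map to vm_inject_init() and vm_inject_sipi() above.
 */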
void
vlapic_self_ipi_handler(struct vlapic *vlapic, uint32_t val)
{
	const int vec = val & 0xff;

	/* self-IPI is only exposed via x2APIC */
	ASSERT(vlapic_x2mode(vlapic));

	(void) lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec);
	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_SEND, 1);
	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_RECV, 1);
}

int
vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
{
	struct LAPIC *lapic = vlapic->apic_page;
	int idx, i, bitpos, vector;
	uint32_t *irrptr, val;

	if (vlapic->ops.sync_state) {
		(*vlapic->ops.sync_state)(vlapic);
	}

	irrptr = &lapic->irr0;

	for (i = 7; i >= 0; i--) {
		idx = i * 4;
		val = atomic_load_acq_int(&irrptr[idx]);
		bitpos = fls(val);
		if (bitpos != 0) {
			vector = i * 32 + (bitpos - 1);
			if (PRIO(vector) > PRIO(lapic->ppr)) {
				if (vecptr != NULL)
					*vecptr = vector;
				return (1);
			} else
				break;
		}
	}
	return (0);
}
void
vlapic_intr_accepted(struct vlapic *vlapic, int vector)
{
	struct LAPIC *lapic = vlapic->apic_page;
	uint32_t *irrptr, *isrptr;
	int idx;

	KASSERT(vector >= 16 && vector < 256, ("invalid vector %d", vector));

	if (vlapic->ops.intr_accepted)
		return ((*vlapic->ops.intr_accepted)(vlapic, vector));

	/*
	 * clear the ready bit for vector being accepted in irr
	 * and set the vector as in service in isr.
	 */
	idx = (vector / 32) * 4;

	irrptr = &lapic->irr0;
	atomic_clear_int(&irrptr[idx], 1 << (vector % 32));

	isrptr = &lapic->isr0;
	isrptr[idx] |= 1 << (vector % 32);

	/*
	 * The only way a fresh vector could be accepted into ISR is if it was
	 * of a higher priority than the current PPR.  With that vector now
	 * in-service, the PPR must be raised.
	 */
	vlapic_raise_ppr(vlapic, vector);
}

void
vlapic_svr_write_handler(struct vlapic *vlapic)
{
	struct LAPIC *lapic;
	uint32_t old, new, changed;

	lapic = vlapic->apic_page;

	new = lapic->svr;
	old = vlapic->svr_last;
	vlapic->svr_last = new;

	changed = old ^ new;
	if ((changed & APIC_SVR_ENABLE) != 0) {
		if ((new & APIC_SVR_ENABLE) == 0) {
			/*
			 * The apic is now disabled so stop the apic timer
			 * and mask all the LVT entries.
			 */
			VLAPIC_TIMER_LOCK(vlapic);
			callout_stop(&vlapic->callout);
			VLAPIC_TIMER_UNLOCK(vlapic);
			vlapic_mask_lvts(vlapic);
		} else {
			/*
			 * The apic is now enabled so restart the apic timer
			 * if it is configured in periodic mode.
			 */
			if (vlapic_periodic_timer(vlapic))
				vlapic_icrtmr_write_handler(vlapic);
		}
	}
}
static bool
vlapic_read(struct vlapic *vlapic, uint16_t offset, uint32_t *outp)
{
	struct LAPIC *lapic = vlapic->apic_page;
	uint32_t *reg;
	int i;

	ASSERT3U(offset & 0x3, ==, 0);
	ASSERT3U(offset, <, PAGESIZE);
	ASSERT3P(outp, !=, NULL);

	uint32_t data = 0;
	switch (offset) {
	case APIC_OFFSET_ID:
		data = lapic->id;
		break;
	case APIC_OFFSET_VER:
		data = lapic->version;
		break;
	case APIC_OFFSET_TPR:
		data = lapic->tpr;
		break;
	case APIC_OFFSET_APR:
		data = lapic->apr;
		break;
	case APIC_OFFSET_PPR:
		data = lapic->ppr;
		break;
	case APIC_OFFSET_LDR:
		data = lapic->ldr;
		break;
	case APIC_OFFSET_DFR:
		data = lapic->dfr;
		break;
	case APIC_OFFSET_SVR:
		data = lapic->svr;
		break;
	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
		i = (offset - APIC_OFFSET_ISR0) >> 2;
		reg = &lapic->isr0;
		data = *(reg + i);
		break;
	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
		i = (offset - APIC_OFFSET_TMR0) >> 2;
		reg = &lapic->tmr0;
		data = *(reg + i);
		break;
	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
		i = (offset - APIC_OFFSET_IRR0) >> 2;
		reg = &lapic->irr0;
		data = atomic_load_acq_int(reg + i);
		break;
	case APIC_OFFSET_ESR:
		data = lapic->esr;
		break;
	case APIC_OFFSET_ICR_LOW:
		data = lapic->icr_lo;
		break;
	case APIC_OFFSET_ICR_HI:
		data = lapic->icr_hi;
		break;
	case APIC_OFFSET_CMCI_LVT:
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		data = vlapic_get_lvt(vlapic, offset);
#ifdef INVARIANTS
		reg = vlapic_get_lvtptr(vlapic, offset);
		ASSERT3U(data, ==, *reg);
#endif
		break;
	case APIC_OFFSET_TIMER_ICR:
		data = lapic->icr_timer;
		break;
	case APIC_OFFSET_TIMER_CCR:
		data = vlapic_get_ccr(vlapic);
		break;
	case APIC_OFFSET_TIMER_DCR:
		data = lapic->dcr_timer;
		break;
	case APIC_OFFSET_RRR:
		data = 0;
		break;

	case APIC_OFFSET_SELF_IPI:
	case APIC_OFFSET_EOI:
		/* Write-only register */
		*outp = 0;
		return (false);

	default:
		/* Invalid register */
		*outp = 0;
		return (false);
	}

	*outp = data;
	return (true);
}
static bool
vlapic_write(struct vlapic *vlapic, uint16_t offset, uint32_t data)
{
	struct LAPIC *lapic = vlapic->apic_page;
	uint32_t *regptr;

	ASSERT3U(offset & 0xf, ==, 0);
	ASSERT3U(offset, <, PAGESIZE);

	switch (offset) {
	case APIC_OFFSET_ID:
		lapic->id = data;
		vlapic_id_write_handler(vlapic);
		break;
	case APIC_OFFSET_TPR:
		vlapic_set_tpr(vlapic, data & 0xff);
		break;
	case APIC_OFFSET_EOI:
		vlapic_process_eoi(vlapic);
		break;
	case APIC_OFFSET_LDR:
		lapic->ldr = data;
		vlapic_ldr_write_handler(vlapic);
		break;
	case APIC_OFFSET_DFR:
		lapic->dfr = data;
		vlapic_dfr_write_handler(vlapic);
		break;
	case APIC_OFFSET_SVR:
		lapic->svr = data;
		vlapic_svr_write_handler(vlapic);
		break;
	case APIC_OFFSET_ICR_LOW:
		lapic->icr_lo = data;
		vlapic_icrlo_write_handler(vlapic);
		break;
	case APIC_OFFSET_ICR_HI:
		lapic->icr_hi = data;
		break;
	case APIC_OFFSET_CMCI_LVT:
	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
		regptr = vlapic_get_lvtptr(vlapic, offset);
		*regptr = data;
		vlapic_lvt_write_handler(vlapic, offset);
		break;
	case APIC_OFFSET_TIMER_ICR:
		lapic->icr_timer = data;
		vlapic_icrtmr_write_handler(vlapic);
		break;

	case APIC_OFFSET_TIMER_DCR:
		lapic->dcr_timer = data;
		vlapic_dcr_write_handler(vlapic);
		break;

	case APIC_OFFSET_ESR:
		vlapic_esr_write_handler(vlapic);
		break;

	case APIC_OFFSET_SELF_IPI:
		if (vlapic_x2mode(vlapic))
			vlapic_self_ipi_handler(vlapic, data);
		break;

	case APIC_OFFSET_VER:
	case APIC_OFFSET_APR:
	case APIC_OFFSET_PPR:
	case APIC_OFFSET_RRR:
	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
	case APIC_OFFSET_TIMER_CCR:
		/* Read-only register */
		return (false);

	default:
		/* Invalid register */
		return (false);
	}

	return (true);
}
void
vlapic_reset(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;
	uint32_t *isrptr, *tmrptr, *irrptr;

	/* Reset any timer-related state first */
	VLAPIC_TIMER_LOCK(vlapic);
	callout_stop(&vlapic->callout);
	lapic->icr_timer = 0;
	lapic->ccr_timer = 0;
	lapic->dcr_timer = 0;
	vlapic_update_divider(vlapic);
	VLAPIC_TIMER_UNLOCK(vlapic);

	/*
	 * Sync any APIC acceleration (APICv/AVIC) state into the APIC page so
	 * it is not leftover after the reset.  This is performed after the
	 * APIC timer has been stopped, in case it happened to fire just prior
	 * to being deactivated.
	 */
	if (vlapic->ops.sync_state) {
		(*vlapic->ops.sync_state)(vlapic);
	}

	vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
	if (vlapic->vcpuid == 0)
		vlapic->msr_apicbase |= APICBASE_BSP;

	lapic->id = vlapic_get_id(vlapic);
	lapic->version = VLAPIC_VERSION;
	lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);

	lapic->tpr = 0;
	lapic->apr = 0;
	lapic->ppr = 0;

	lapic->eoi = 0;
	lapic->ldr = 0;
	lapic->dfr = 0xffffffff;
	lapic->svr = APIC_SVR_VECTOR;
	vlapic->svr_last = lapic->svr;

	isrptr = &lapic->isr0;
	tmrptr = &lapic->tmr0;
	irrptr = &lapic->irr0;
	for (uint_t i = 0; i < 8; i++) {
		atomic_store_rel_int(&isrptr[i * 4], 0);
		atomic_store_rel_int(&tmrptr[i * 4], 0);
		atomic_store_rel_int(&irrptr[i * 4], 0);
	}

	lapic->esr = 0;
	vlapic->esr_pending = 0;
	lapic->icr_lo = 0;
	lapic->icr_hi = 0;

	lapic->lvt_cmci = 0;
	lapic->lvt_timer = 0;
	lapic->lvt_thermal = 0;
	lapic->lvt_pcint = 0;
	lapic->lvt_lint0 = 0;
	lapic->lvt_lint1 = 0;
	lapic->lvt_error = 0;
	vlapic_mask_lvts(vlapic);
}

void
vlapic_init(struct vlapic *vlapic)
{
	KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
	KASSERT(vlapic->vcpuid >= 0 &&
	    vlapic->vcpuid < vm_get_maxcpus(vlapic->vm),
	    ("vlapic_init: vcpuid is not initialized"));
	KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
	    "initialized"));

	/*
	 * If the vlapic is configured in x2apic mode then it will be
	 * accessed in the critical section via the MSR emulation code.
	 *
	 * Therefore the timer mutex must be a spinlock because blockable
	 * mutexes cannot be acquired in a critical section.
	 */
	mutex_init(&vlapic->timer_lock, NULL, MUTEX_ADAPTIVE, NULL);
	callout_init(&vlapic->callout, 1);

	vlapic_reset(vlapic);
}
void
vlapic_cleanup(struct vlapic *vlapic)
{
	callout_drain(&vlapic->callout);
	mutex_destroy(&vlapic->timer_lock);
}

int
vlapic_mmio_read(struct vlapic *vlapic, uint64_t gpa, uint64_t *valp,
    uint_t size)
{
	ASSERT3U(gpa, >=, DEFAULT_APIC_BASE);
	ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE);

	/* Ignore MMIO accesses when in x2APIC mode or hardware disabled */
	if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) {
		*valp = UINT64_MAX;
		return (0);
	}

	const uint16_t off = gpa - DEFAULT_APIC_BASE;
	uint32_t raw = 0;
	(void) vlapic_read(vlapic, off & ~0xf, &raw);

	/* Shift and mask reads which are small and/or unaligned */
	const uint8_t align = off & 0xf;
	if (align < 4) {
		*valp = (uint64_t)raw << (align * 8);
	} else {
		*valp = 0;
	}

	return (0);
}

int
vlapic_mmio_write(struct vlapic *vlapic, uint64_t gpa, uint64_t val,
    uint_t size)
{
	ASSERT3U(gpa, >=, DEFAULT_APIC_BASE);
	ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE);

	/* Ignore MMIO accesses when in x2APIC mode or hardware disabled */
	if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) {
		return (0);
	}

	const uint16_t off = gpa - DEFAULT_APIC_BASE;
	/* Ignore writes which are not 32-bits wide and 16-byte aligned */
	if ((off & 0xf) != 0 || size != 4) {
		return (0);
	}

	(void) vlapic_write(vlapic, off, (uint32_t)val);
	return (0);
}
/* Should attempts to change the APIC base address be rejected with a #GP? */
int vlapic_gp_on_addr_change = 1;

static vm_msr_result_t
vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
{
	const uint64_t diff = vlapic->msr_apicbase ^ val;

	/*
	 * Until the LAPIC emulation for switching between xAPIC and x2APIC
	 * modes is more polished, it will remain off-limits from being
	 * altered by the guest.
	 */
	const uint64_t reserved_bits = APICBASE_RESERVED | APICBASE_X2APIC |
	    APICBASE_BSP;
	if ((diff & reserved_bits) != 0) {
		return (VMR_GP);
	}

	/* We do not presently allow the LAPIC access address to be modified. */
	if ((diff & APICBASE_ADDR_MASK) != 0) {
		/*
		 * Explicitly rebuffing such requests with a #GP is the most
		 * straightforward way to handle the situation, but certain
		 * consumers (such as the KVM unit tests) may balk at the
		 * otherwise unexpected exception.
		 */
		if (vlapic_gp_on_addr_change) {
			return (VMR_GP);
		}

		/* If silence is required, just ignore the address change. */
		val = (val & ~APICBASE_ADDR_MASK) | DEFAULT_APIC_BASE;
	}

	vlapic->msr_apicbase = val;
	return (VMR_OK);
}
static __inline uint16_t
vlapic_msr_to_regoff(uint32_t msr)
{
	ASSERT3U(msr, >=, MSR_APIC_000);
	ASSERT3U(msr, <, (MSR_APIC_000 + 0x100));

	return ((msr - MSR_APIC_000) << 4);
}
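
/*
 * For illustration: the x2APIC TPR MSR (0x808) maps to regoff
 * (0x808 - 0x800) << 4 = 0x80, the same APIC_OFFSET_TPR used by the xAPIC
 * MMIO path.
 */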
bool
vlapic_owned_msr(uint32_t msr)
{
	if (msr == MSR_APICBASE) {
		return (true);
	}
	if (msr >= MSR_APIC_000 &&
	    msr < (MSR_APIC_000 + 0x100)) {
		return (true);
	}
	return (false);
}

vm_msr_result_t
vlapic_rdmsr(struct vlapic *vlapic, uint32_t msr, uint64_t *valp)
{
	ASSERT(vlapic_owned_msr(msr));
	ASSERT3P(valp, !=, NULL);

	if (msr == MSR_APICBASE) {
		*valp = vlapic->msr_apicbase;
		return (VMR_OK);
	}

	/* #GP for x2APIC MSR accesses in xAPIC mode */
	if (!vlapic_x2mode(vlapic)) {
		return (VMR_GP);
	}

	uint64_t out = 0;
	const uint16_t reg = vlapic_msr_to_regoff(msr);
	switch (reg) {
	case APIC_OFFSET_ICR_LOW: {
		/* Read from ICR register gets entire (64-bit) value */
		uint32_t low = 0, high = 0;
		bool valid;

		valid = vlapic_read(vlapic, APIC_OFFSET_ICR_HI, &high);
		VERIFY(valid);
		valid = vlapic_read(vlapic, APIC_OFFSET_ICR_LOW, &low);
		VERIFY(valid);

		*valp = ((uint64_t)high << 32) | low;
		return (VMR_OK);
	}
	case APIC_OFFSET_ICR_HI:
		/* Already covered by ICR_LOW */
		return (VMR_GP);
	default:
		break;
	}
	if (!vlapic_read(vlapic, reg, (uint32_t *)&out)) {
		return (VMR_GP);
	}
	*valp = out;
	return (VMR_OK);
}

vm_msr_result_t
vlapic_wrmsr(struct vlapic *vlapic, uint32_t msr, uint64_t val)
{
	ASSERT(vlapic_owned_msr(msr));

	if (msr == MSR_APICBASE) {
		return (vlapic_set_apicbase(vlapic, val));
	}

	/* #GP for x2APIC MSR accesses in xAPIC mode */
	if (!vlapic_x2mode(vlapic)) {
		return (VMR_GP);
	}

	const uint16_t reg = vlapic_msr_to_regoff(msr);
	switch (reg) {
	case APIC_OFFSET_ICR_LOW: {
		/* Write to ICR register sets entire (64-bit) value */
		bool valid;

		valid = vlapic_write(vlapic, APIC_OFFSET_ICR_HI, val >> 32);
		VERIFY(valid);
		valid = vlapic_write(vlapic, APIC_OFFSET_ICR_LOW, val);
		VERIFY(valid);
		return (VMR_OK);
	}
	case APIC_OFFSET_ICR_HI:
		/* Already covered by ICR_LOW */
		return (VMR_GP);
	case APIC_OFFSET_ESR:
		/* Only 0 may be written from x2APIC mode */
		if (val != 0) {
			return (VMR_GP);
		}
		break;
	default:
		break;
	}
	if (!vlapic_write(vlapic, reg, val)) {
		return (VMR_GP);
	}
	return (VMR_OK);
}
void
vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	struct vlapic *vlapic;
	struct LAPIC *lapic;

	vlapic = vm_lapic(vm, vcpuid);

	if (state == X2APIC_DISABLED)
		vlapic->msr_apicbase &= ~APICBASE_X2APIC;
	else
		vlapic->msr_apicbase |= APICBASE_X2APIC;

	/*
	 * Reset the local APIC registers whose values are mode-dependent.
	 *
	 * XXX this works because the APIC mode can be changed only at vcpu
	 * initialization time.
	 */
	lapic = vlapic->apic_page;
	lapic->id = vlapic_get_id(vlapic);
	if (vlapic_x2mode(vlapic)) {
		lapic->ldr = x2apic_ldr(vlapic);
		lapic->dfr = 0;
	} else {
		lapic->ldr = 0;
		lapic->dfr = 0xffffffff;
	}

	if (state == X2APIC_ENABLED) {
		if (vlapic->ops.enable_x2apic_mode)
			(*vlapic->ops.enable_x2apic_mode)(vlapic);
	}
}

void
vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
    int delmode, int vec)
{
	bool lowprio;
	int vcpuid;
	cpuset_t dmask;

	if (delmode != IOART_DELFIXED &&
	    delmode != IOART_DELLOPRI &&
	    delmode != IOART_DELEXINT) {
		/* Invalid delivery mode */
		return;
	}
	lowprio = (delmode == IOART_DELLOPRI);

	/*
	 * We don't provide any virtual interrupt redirection hardware so
	 * all interrupts originating from the ioapic or MSI specify the
	 * 'dest' in the legacy xAPIC format.
	 */
	vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false);

	while ((vcpuid = CPU_FFS(&dmask)) != 0) {
		vcpuid--;
		CPU_CLR(vcpuid, &dmask);
		if (delmode == IOART_DELEXINT) {
			(void) vm_inject_extint(vm, vcpuid);
		} else {
			(void) lapic_set_intr(vm, vcpuid, vec, level);
		}
	}
}

void
vlapic_post_intr(struct vlapic *vlapic, int hostcpu)
{
	/*
	 * Post an interrupt to the vcpu currently running on 'hostcpu'.
	 *
	 * This is done by leveraging features like Posted Interrupts (Intel)
	 * and the Doorbell MSR (AMD AVIC) that avoid a VM exit.
	 *
	 * If neither of these features is available, then fall back to
	 * sending an IPI to 'hostcpu'.
	 */
	if (vlapic->ops.post_intr)
		(*vlapic->ops.post_intr)(vlapic, hostcpu);
	else
		poke_cpu(hostcpu);
}
void
vlapic_localize_resources(struct vlapic *vlapic)
{
	vmm_glue_callout_localize(&vlapic->callout);
}

void
vlapic_pause(struct vlapic *vlapic)
{
	VLAPIC_TIMER_LOCK(vlapic);
	callout_stop(&vlapic->callout);
	VLAPIC_TIMER_UNLOCK(vlapic);
}

void
vlapic_resume(struct vlapic *vlapic)
{
	VLAPIC_TIMER_LOCK(vlapic);
	if (vlapic->timer_fire_when != 0) {
		vlapic_callout_reset(vlapic);
	}
	VLAPIC_TIMER_UNLOCK(vlapic);
}
static int
vlapic_data_read(void *datap, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_LAPIC);
	VERIFY3U(req->vdr_version, ==, 1);
	VERIFY3U(req->vdr_len, >=, sizeof (struct vdi_lapic_v1));

	struct vlapic *vlapic = datap;
	struct vdi_lapic_v1 *out = req->vdr_data;

	VLAPIC_TIMER_LOCK(vlapic);

	if (vlapic->ops.sync_state) {
		(*vlapic->ops.sync_state)(vlapic);
	}

	out->vl_msr_apicbase = vlapic->msr_apicbase;
	out->vl_esr_pending = vlapic->esr_pending;
	if (vlapic->timer_fire_when != 0) {
		out->vl_timer_target =
		    vm_normalize_hrtime(vlapic->vm, vlapic->timer_fire_when);
	} else {
		out->vl_timer_target = 0;
	}

	const struct LAPIC *lapic = vlapic->apic_page;
	struct vdi_lapic_page_v1 *out_page = &out->vl_lapic;

	/*
	 * While this might appear, at first glance, to be missing some fields,
	 * they are intentionally omitted:
	 * - PPR: its contents are always generated at runtime
	 * - EOI: write-only, and contents are ignored after handling
	 * - RRD: (aka RRR) read-only and always 0
	 * - CCR: calculated from underlying timer data
	 */
	out_page->vlp_id = lapic->id;
	out_page->vlp_version = lapic->version;
	out_page->vlp_tpr = lapic->tpr;
	out_page->vlp_apr = lapic->apr;
	out_page->vlp_ldr = lapic->ldr;
	out_page->vlp_dfr = lapic->dfr;
	out_page->vlp_svr = lapic->svr;
	out_page->vlp_esr = lapic->esr;
	out_page->vlp_icr = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;
	out_page->vlp_icr_timer = lapic->icr_timer;
	out_page->vlp_dcr_timer = lapic->dcr_timer;

	out_page->vlp_lvt_cmci = lapic->lvt_cmci;
	out_page->vlp_lvt_timer = lapic->lvt_timer;
	out_page->vlp_lvt_thermal = lapic->lvt_thermal;
	out_page->vlp_lvt_pcint = lapic->lvt_pcint;
	out_page->vlp_lvt_lint0 = lapic->lvt_lint0;
	out_page->vlp_lvt_lint1 = lapic->lvt_lint1;
	out_page->vlp_lvt_error = lapic->lvt_error;

	const uint32_t *isrptr = &lapic->isr0;
	const uint32_t *tmrptr = &lapic->tmr0;
	const uint32_t *irrptr = &lapic->irr0;
	for (uint_t i = 0; i < 8; i++) {
		out_page->vlp_isr[i] = isrptr[i * 4];
		out_page->vlp_tmr[i] = tmrptr[i * 4];
		out_page->vlp_irr[i] = irrptr[i * 4];
	}
	VLAPIC_TIMER_UNLOCK(vlapic);

	return (0);
}
static uint8_t
popc8(uint8_t val)
{
	uint8_t cnt;

	for (cnt = 0; val != 0; val &= (val - 1)) {
		cnt++;
	}
	return (cnt);
}
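
/*
 * The loop above clears the lowest set bit on each iteration (Kernighan's
 * method), so e.g. popc8(0xa5) completes in exactly 4 iterations and returns
 * a population count of 4.
 */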
/*
 * Descriptions for the various failures which can occur when validating
 * to-be-written vlapic state.
 */
enum vlapic_validation_error {
	VVE_OK,
	VVE_BAD_ID,
	VVE_BAD_VERSION,
	VVE_BAD_MSR_BASE,
	VVE_BAD_ESR,
	VVE_BAD_TPR,
	VVE_LOW_VECTOR,
	VVE_ISR_PRIORITY,
};

static enum vlapic_validation_error
vlapic_data_validate(const struct vlapic *vlapic, const vmm_data_req_t *req)
{
	ASSERT(req->vdr_version == 1 &&
	    req->vdr_len >= sizeof (struct vdi_lapic_v1));
	const struct vdi_lapic_v1 *src = req->vdr_data;

	if ((src->vl_esr_pending & ~APIC_VALID_MASK_ESR) != 0 ||
	    (src->vl_lapic.vlp_esr & ~APIC_VALID_MASK_ESR) != 0) {
		return (VVE_BAD_ESR);
	}

	/* Use the same restrictions as the wrmsr accessor for now */
	const uint64_t apicbase_reserved = APICBASE_RESERVED | APICBASE_X2APIC |
	    APICBASE_BSP;
	const uint64_t diff = src->vl_msr_apicbase ^ vlapic->msr_apicbase;
	if ((diff & apicbase_reserved) != 0) {
		return (VVE_BAD_MSR_BASE);
	}

	const struct vdi_lapic_page_v1 *page = &src->vl_lapic;
	/*
	 * Demand that ID match for now.  This can be further updated when
	 * some of the x2apic handling is improved.
	 */
	if (page->vlp_id != vlapic_get_id(vlapic)) {
		return (VVE_BAD_ID);
	}

	if (page->vlp_version != vlapic->apic_page->version) {
		return (VVE_BAD_VERSION);
	}

	if (page->vlp_tpr > 0xff) {
		return (VVE_BAD_TPR);
	}

	/* Vectors 0-15 are not expected to be handled by the lapic */
	if ((page->vlp_isr[0] & 0xffff) != 0 ||
	    (page->vlp_irr[0] & 0xffff) != 0 ||
	    (page->vlp_tmr[0] & 0xffff) != 0) {
		return (VVE_LOW_VECTOR);
	}

	/* Only one interrupt should be in-service for each priority level */
	for (uint_t i = 0; i < 8; i++) {
		if (popc8((uint8_t)page->vlp_isr[i]) > 1 ||
		    popc8((uint8_t)(page->vlp_isr[i] >> 8)) > 1 ||
		    popc8((uint8_t)(page->vlp_isr[i] >> 16)) > 1 ||
		    popc8((uint8_t)(page->vlp_isr[i] >> 24)) > 1) {
			return (VVE_ISR_PRIORITY);
		}
	}

	return (VVE_OK);
}
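
/*
 * For illustration: the per-byte ISR check above accepts an imported
 * vlp_isr[1] of 0x00010100 (at most one bit set in each byte) but rejects
 * 0x00000300, where two vectors in the same byte claim to be in-service
 * simultaneously.
 */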
static int
vlapic_data_write(void *datap, const vmm_data_req_t *req)
{
	VERIFY3U(req->vdr_class, ==, VDC_LAPIC);
	VERIFY3U(req->vdr_version, ==, 1);
	VERIFY3U(req->vdr_len, >=, sizeof (struct vdi_lapic_v1));

	struct vlapic *vlapic = datap;
	if (vlapic_data_validate(vlapic, req) != VVE_OK) {
		return (EINVAL);
	}
	const struct vdi_lapic_v1 *src = req->vdr_data;
	const struct vdi_lapic_page_v1 *page = &src->vl_lapic;
	struct LAPIC *lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);

	/* Already ensured by vlapic_data_validate() */
	VERIFY3U(page->vlp_version, ==, lapic->version);

	vlapic->msr_apicbase = src->vl_msr_apicbase;
	vlapic->esr_pending = src->vl_esr_pending;

	lapic->tpr = page->vlp_tpr;
	lapic->apr = page->vlp_apr;
	lapic->ldr = page->vlp_ldr;
	lapic->dfr = page->vlp_dfr;
	lapic->svr = page->vlp_svr;
	lapic->esr = page->vlp_esr;
	lapic->icr_lo = (uint32_t)page->vlp_icr;
	lapic->icr_hi = (uint32_t)(page->vlp_icr >> 32);

	lapic->icr_timer = page->vlp_icr_timer;
	lapic->dcr_timer = page->vlp_dcr_timer;
	vlapic_update_divider(vlapic);

	/* cleanse LDR/DFR */
	vlapic_ldr_write_handler(vlapic);
	vlapic_dfr_write_handler(vlapic);

	lapic->lvt_cmci = page->vlp_lvt_cmci;
	lapic->lvt_timer = page->vlp_lvt_timer;
	lapic->lvt_thermal = page->vlp_lvt_thermal;
	lapic->lvt_pcint = page->vlp_lvt_pcint;
	lapic->lvt_lint0 = page->vlp_lvt_lint0;
	lapic->lvt_lint1 = page->vlp_lvt_lint1;
	lapic->lvt_error = page->vlp_lvt_error;
	/* cleanse LVTs */
	vlapic_refresh_lvts(vlapic);

	uint32_t *isrptr = &lapic->isr0;
	uint32_t *tmrptr = &lapic->tmr0;
	uint32_t *irrptr = &lapic->irr0;
	for (uint_t i = 0; i < 8; i++) {
		isrptr[i * 4] = page->vlp_isr[i];
		tmrptr[i * 4] = page->vlp_tmr[i];
		irrptr[i * 4] = page->vlp_irr[i];
	}

	if (src->vl_timer_target != 0) {
		vlapic->timer_fire_when =
		    vm_denormalize_hrtime(vlapic->vm, src->vl_timer_target);

		/*
		 * Check to see if timer expiration would result in computed
		 * CCR values in excess of what is configured in ICR/DCR.
		 */
		const hrtime_t now = gethrtime();
		if (vlapic->timer_fire_when > now) {
			const uint32_t ccr = hrt_freq_count(
			    vlapic->timer_fire_when - now,
			    vlapic->timer_cur_freq);

			/*
			 * Until we have a richer event/logging system
			 * available, just note such an overage as a stat.
			 */
			if (ccr > lapic->icr_timer) {
				vlapic->stats.vs_import_timer_overage++;
			}
		}

		if (!vm_is_paused(vlapic->vm)) {
			vlapic_callout_reset(vlapic);
		}
	} else {
		vlapic->timer_fire_when = 0;
	}

	if (vlapic->ops.sync_state) {
		(*vlapic->ops.sync_state)(vlapic);
	}
	VLAPIC_TIMER_UNLOCK(vlapic);

	return (0);
}
static const vmm_data_version_entry_t lapic_v1 = {
	.vdve_class = VDC_LAPIC,
	.vdve_version = 1,
	.vdve_len_expect = sizeof (struct vdi_lapic_v1),
	.vdve_readf = vlapic_data_read,
	.vdve_writef = vlapic_data_write,
};
VMM_DATA_VERSION(lapic_v1);