4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2017 Joyent, Inc.
26 * Copyright (c) 2010, Intel Corporation.
27 * All rights reserved.
31 * PSMI 1.1 extensions are supported only in 2.6 and later versions.
32 * PSMI 1.2 extensions are supported only in 2.7 and later versions.
33 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
34 * PSMI 1.5 extensions are supported in Solaris Nevada.
35 * PSMI 1.6 extensions are supported in Solaris Nevada.
36 * PSMI 1.7 extensions are supported in Solaris Nevada.
40 #include <sys/processor.h>
43 #include <sys/smp_impldefs.h>
44 #include <sys/inttypes.h>
46 #include <sys/acpi/acpi.h>
47 #include <sys/acpica.h>
48 #include <sys/psm_common.h>
50 #include <sys/apic_common.h>
53 #include <sys/sunddi.h>
54 #include <sys/ddi_impldefs.h>
56 #include <sys/promif.h>
57 #include <sys/x86_archext.h>
58 #include <sys/cpc_impl.h>
59 #include <sys/uadmin.h>
60 #include <sys/panic.h>
61 #include <sys/debug.h>
62 #include <sys/archsystm.h>
64 #include <sys/machsystm.h>
65 #include <sys/cpuvar.h>
66 #include <sys/rm_platter.h>
67 #include <sys/privregs.h>
68 #include <sys/cyclic.h>
70 #include <sys/pci_intr_lib.h>
71 #include <sys/sunndi.h>
73 #include <sys/clock.h>
76 * Part of mp_platform_common.c that's used only by pcplusmp & xpv_psm
78 * These functions may be moved to xpv_psm later when apix and pcplusmp
/*
 * NOTE(review): this whole file is a corrupted listing — stray source-line
 * numbers are embedded at the start of each statement, declarations are split
 * mid-token across lines, and interior lines are missing (gaps in the
 * embedded numbering).  It cannot compile as-is; restore from a pristine
 * upstream copy before building.  Comments below describe the visible intent.
 */
/*
 * File-scope declarations: static prototypes for the deferred-reprogramming
 * and vector-translation helpers, externs shared with the rest of the APIC
 * PSM code, and the revectoring / deferred-reprogram bookkeeping globals.
 */
83 * Local Function Prototypes
85 static void apic_mark_vector(uchar_t oldvector
, uchar_t newvector
);
86 static void apic_xlate_vector_free_timeout_handler(void *arg
);
87 static int apic_check_stuck_interrupt(apic_irq_t
*irq_ptr
, int old_bind_cpu
,
88 int new_bind_cpu
, int apicindex
, int intin_no
, int which_irq
,
89 struct ioapic_reprogram_data
*drep
);
/* NOTE(review): prototype below is truncated — trailing parameter(s) and ')' lost. */
90 static int apic_setup_irq_table(dev_info_t
*dip
, int irqno
,
91 struct apic_io_intr
*intrp
, struct intrspec
*ispec
, iflag_t
*intr_flagp
,
93 static void apic_try_deferred_reprogram(int ipl
, int vect
);
94 static void delete_defer_repro_ent(int which_irq
);
/* NOTE(review): prototype below is also truncated mid-parameter-list. */
95 static void apic_ioapic_wait_pending_clear(int ioapicindex
,
98 extern int apic_acpi_translate_pci_irq(dev_info_t
*dip
, int busid
, int devid
,
99 int ipin
, int *pci_irqp
, iflag_t
*intr_flagp
);
100 extern int apic_handle_pci_pci_bridge(dev_info_t
*idip
, int child_devno
,
101 int child_ipin
, struct apic_io_intr
**intrp
);
102 extern uchar_t
acpi_find_ioapic(int irq
);
103 extern struct apic_io_intr
*apic_find_io_intr_w_busid(int irqno
, int busid
);
104 extern int apic_find_bus_id(int bustype
);
105 extern int apic_find_intin(uchar_t ioapic
, uchar_t intin
);
106 extern void apic_record_rdt_entry(apic_irq_t
*irqptr
, int irq
);
/* ACPI SCI (System Control Interrupt) configuration supplied elsewhere. */
108 extern int apic_sci_vect
;
109 extern iflag_t apic_sci_flags
;
110 /* ACPI HPET interrupt configuration; -1 if HPET not used */
111 extern int apic_hpet_vect
;
112 extern iflag_t apic_hpet_flags
;
113 extern int apic_intr_policy
;
114 extern char *psm_name
;
117 * number of bits per byte, from <sys/param.h>
/*
 * NOTE(review): redefining UCHAR_MAX locally shadows the <limits.h> name —
 * presumably intentional upstream, but confirm against the pristine source.
 */
119 #define UCHAR_MAX UINT8_MAX
121 /* Max wait time (in repetitions) for flags to clear in an RDT entry. */
122 extern int apic_max_reps_clear_pending
;
124 /* The irq # is implicit in the array index: */
125 struct ioapic_reprogram_data apic_reprogram_info
[APIC_MAX_VECTOR
+1];
127 * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. ioapic_reprogram_info
128 * is indexed by IRQ number, NOT by vector number.
/* Interrupt-load redistribution tunables/counters (defined elsewhere). */
131 extern int apic_int_busy_mark
;
132 extern int apic_int_free_mark
;
133 extern int apic_diff_for_redistribution
;
134 extern int apic_sample_factor_redistribution
;
135 extern int apic_redist_cpu_skip
;
136 extern int apic_num_imbalance
;
137 extern int apic_num_rebind
;
139 /* timeout for xlate_vector, mark_vector */
140 int apic_revector_timeout
= 16 * 10000; /* 160 millisec */
142 extern int apic_defconf
;
143 extern int apic_irq_translate
;
145 extern int apic_use_acpi_madt_only
; /* 1=ONLY use MADT from ACPI */
147 extern uchar_t apic_io_vectbase
[MAX_IO_APIC
];
149 extern boolean_t ioapic_mask_workaround
[MAX_IO_APIC
];
152 * First available slot to be used as IRQ index into the apic_irq_table
153 * for those interrupts (like MSI/X) that don't have a physical IRQ.
155 extern int apic_first_avail_irq
;
158 * apic_defer_reprogram_lock ensures that only one processor is handling
159 * deferred interrupt programming at *_intr_exit time.
161 static lock_t apic_defer_reprogram_lock
;
164 * The current number of deferred reprogrammings outstanding
166 uint_t apic_reprogram_outstanding
= 0;
170 * Counters that keep track of deferred reprogramming stats
172 uint_t apic_intr_deferrals
= 0;
173 uint_t apic_intr_deliver_timeouts
= 0;
174 uint_t apic_last_ditch_reprogram_failures
= 0;
175 uint_t apic_deferred_setup_failures
= 0;
176 uint_t apic_defer_repro_total_retries
= 0;
177 uint_t apic_defer_repro_successes
= 0;
178 uint_t apic_deferred_spurious_enters
= 0;
181 extern int apic_io_max
;
182 extern struct apic_io_intr
*apic_io_intrp
;
/* vector -> irq reverse map; entries are set to APIC_RESV_IRQ when free. */
184 uchar_t apic_vector_to_irq
[APIC_MAX_VECTOR
+1];
186 extern uint32_t eisa_level_intr_mask
;
187 /* At least MSB will be set if EISA bus */
189 extern int apic_pci_bus_total
;
190 extern uchar_t apic_single_pci_busid
;
193 * Following declarations are for revectoring; used when ISRs at different
196 static lock_t apic_revector_lock
;
197 int apic_revector_pending
= 0;
198 static uchar_t
*apic_oldvec_to_newvec
;
199 static uchar_t
*apic_newvec_to_oldvec
;
201 /* ACPI Interrupt Source Override Structure ptr */
202 extern ACPI_MADT_INTERRUPT_OVERRIDE
*acpi_isop
;
203 extern int acpi_iso_cnt
;
206 * Auto-configuration routines
210 * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable
211 * are also set to NULL. vector->irq is set to a value which cannot map
212 * to a real irq to show that it is free.
/*
 * apic_init_common() — one-time initialization of common APIC state.
 * Visible work: fills apic_ipls[] from apic_vectortoipl[], marks CPU 0
 * online with interrupts enabled, clears per-IRQ level/vector/reprogram
 * state, allocates the reserved-IRQ dummy table entry, and initializes
 * airq_mutex.
 *
 * NOTE(review): this block is a corrupted listing — embedded line numbers,
 * statements split mid-token, and missing lines (e.g. the opening brace and
 * the declarations of i, j, indx, iptr are absent).  Do not edit in place;
 * restore from a pristine upstream copy.
 */
215 apic_init_common(void)
221 * Initialize apic_ipls from apic_vectortoipl. This array is
222 * used in apic_intr_enter to determine the IPL to use for the
223 * corresponding vector. On some systems, due to hardware errata
224 * and interrupt sharing, the IPL may not correspond to the IPL listed
225 * in apic_vectortoipl (see apic_addspl and apic_delspl).
227 for (i
= 0; i
< (APIC_AVAIL_VECTOR
/ APIC_VECTOR_PER_IPL
); i
++) {
228 indx
= i
* APIC_VECTOR_PER_IPL
;
230 for (j
= 0; j
< APIC_VECTOR_PER_IPL
; j
++, indx
++)
231 apic_ipls
[indx
] = apic_vectortoipl
[i
];
234 /* cpu 0 is always up (for now) */
235 apic_cpus
[0].aci_status
= APIC_CPU_ONLINE
| APIC_CPU_INTR_ENABLE
;
237 iptr
= (int *)&apic_irq_table
[0];
238 for (i
= 0; i
<= APIC_MAX_VECTOR
; i
++) {
239 apic_level_intr
[i
] = 0;
241 apic_vector_to_irq
[i
] = APIC_RESV_IRQ
;
243 /* These *must* be initted to B_TRUE! */
244 apic_reprogram_info
[i
].done
= B_TRUE
;
245 apic_reprogram_info
[i
].irqp
= NULL
;
246 apic_reprogram_info
[i
].tries
= 0;
247 apic_reprogram_info
[i
].bindcpu
= 0;
251 * Allocate a dummy irq table entry for the reserved entry.
252 * This takes care of the race between removing an irq and
253 * clock detecting a CPU in that irq during interrupt load
256 apic_irq_table
[APIC_RESV_IRQ
] =
257 kmem_zalloc(sizeof (apic_irq_t
), KM_SLEEP
);
259 mutex_init(&airq_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
/*
 * ioapic_init_intr() — initialize I/O APIC interrupt state.
 * Visible work: clears the revector and deferred-reprogram locks, masks
 * every redirection-table entry on each I/O APIC when mask_apic is set,
 * then sets up apic_irq_table entries for the ACPI SCI and HPET interrupts
 * (which registered their handlers via add_avintr() before the table
 * existed) and programs the I/O APIC for each under apic_ioapic_lock.
 *
 * NOTE(review): corrupted listing — missing lines (loop braces, intin_max
 * mask, error-path braces) and embedded line numbers.  Restore from a
 * pristine upstream copy before editing.
 */
263 ioapic_init_intr(int mask_apic
)
266 struct intrspec ispec
;
271 LOCK_INIT_CLEAR(&apic_revector_lock
);
272 LOCK_INIT_CLEAR(&apic_defer_reprogram_lock
);
274 /* mask interrupt vectors */
275 for (j
= 0; j
< apic_io_max
&& mask_apic
; j
++) {
279 /* Bits 23-16 define the maximum redirection entries */
280 intin_max
= (ioapic_read(ioapic_ix
, APIC_VERS_CMD
) >> 16)
282 for (i
= 0; i
<= intin_max
; i
++)
283 ioapic_write(ioapic_ix
, APIC_RDT_CMD
+ 2 * i
, AV_MASK
);
287 * Hack alert: deal with ACPI SCI interrupt chicken/egg here
289 if (apic_sci_vect
> 0) {
291 * acpica has already done add_avintr(); we just
292 * to finish the job by mimicing translate_irq()
294 * Fake up an intrspec and setup the tables
296 ispec
.intrspec_vec
= apic_sci_vect
;
297 ispec
.intrspec_pri
= SCI_IPL
;
299 if (apic_setup_irq_table(NULL
, apic_sci_vect
, NULL
,
300 &ispec
, &apic_sci_flags
, DDI_INTR_TYPE_FIXED
) < 0) {
301 cmn_err(CE_WARN
, "!apic: SCI setup failed");
304 irqptr
= apic_irq_table
[apic_sci_vect
];
306 iflag
= intr_clear();
307 lock_set(&apic_ioapic_lock
);
309 /* Program I/O APIC */
310 (void) apic_setup_io_intr(irqptr
, apic_sci_vect
, B_FALSE
);
312 lock_clear(&apic_ioapic_lock
);
315 irqptr
->airq_share
++;
319 * Hack alert: deal with ACPI HPET interrupt chicken/egg here.
321 if (apic_hpet_vect
> 0) {
323 * hpet has already done add_avintr(); we just need
324 * to finish the job by mimicing translate_irq()
326 * Fake up an intrspec and setup the tables
328 ispec
.intrspec_vec
= apic_hpet_vect
;
329 ispec
.intrspec_pri
= CBE_HIGH_PIL
;
331 if (apic_setup_irq_table(NULL
, apic_hpet_vect
, NULL
,
332 &ispec
, &apic_hpet_flags
, DDI_INTR_TYPE_FIXED
) < 0) {
333 cmn_err(CE_WARN
, "!apic: HPET setup failed");
336 irqptr
= apic_irq_table
[apic_hpet_vect
];
338 iflag
= intr_clear();
339 lock_set(&apic_ioapic_lock
);
341 /* Program I/O APIC */
342 (void) apic_setup_io_intr(irqptr
, apic_hpet_vect
, B_FALSE
);
344 lock_clear(&apic_ioapic_lock
);
347 irqptr
->airq_share
++;
352 * Add mask bits to disable interrupt vector from happening
353 * at or above IPL. In addition, it should remove mask bits
354 * to enable interrupt vectors below the given IPL.
356 * Both add and delspl are complicated by the fact that different interrupts
357 * may share IRQs. This can happen in two ways.
358 * 1. The same H/W line is shared by more than 1 device
359 * 1a. with interrupts at different IPLs
360 * 1b. with interrupts at same IPL
361 * 2. We ran out of vectors at a given IPL and started sharing vectors.
362 * 1b and 2 should be handled gracefully, except for the fact some ISRs
363 * will get called often when no interrupt is pending for the device.
364 * For 1a, we handle it at the higher IPL.
/*
 * apic_addspl_common() — register an additional handler IPL on an IRQ.
 * Visible work: bumps the share count on the apic_irq_table chain entry,
 * and when the IRQ's current IPL differs from max_ipl either upgrades the
 * vector (apic_allocate_vector + apic_mark_vector + reprogram via
 * apic_setup_io_intr under apic_ioapic_lock) or, on I/O APICs with the
 * mask workaround, just rewrites the apic_ipls[] entry for the vector.
 * Returns PSM_SUCCESS / PSM_FAILURE.
 *
 * NOTE(review): corrupted listing — embedded line numbers, split
 * statements, and missing lines (locals, several if-conditions and
 * braces).  Restore from a pristine upstream copy before editing.
 */
368 apic_addspl_common(int irqno
, int ipl
, int min_ipl
, int max_ipl
)
372 apic_irq_t
*irqptr
, *irqheadptr
;
375 ASSERT(max_ipl
<= UCHAR_MAX
);
376 irqindex
= IRQINDEX(irqno
);
378 if ((irqindex
== -1) || (!apic_irq_table
[irqindex
]))
379 return (PSM_FAILURE
);
381 mutex_enter(&airq_mutex
);
382 irqptr
= irqheadptr
= apic_irq_table
[irqindex
];
384 DDI_INTR_IMPLDBG((CE_CONT
, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
385 "vector=0x%x\n", (void *)irqptr
->airq_dip
,
386 irqptr
->airq_mps_intr_index
, irqno
, irqptr
->airq_vector
));
/* Walk the share chain to the entry whose virtual irq matches irqno. */
389 if (VIRTIRQ(irqindex
, irqptr
->airq_share_id
) == irqno
)
391 irqptr
= irqptr
->airq_next
;
393 irqptr
->airq_share
++;
395 mutex_exit(&airq_mutex
);
397 /* return if it is not hardware interrupt */
398 if (irqptr
->airq_mps_intr_index
== RESERVE_INDEX
)
399 return (PSM_SUCCESS
);
401 /* Or if there are more interupts at a higher IPL */
403 return (PSM_SUCCESS
);
406 * if apic_picinit() has not been called yet, just return.
407 * At the end of apic_picinit(), we will call setup_io_intr().
410 if (!apic_picinit_called
)
411 return (PSM_SUCCESS
);
414 * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate,
417 if (irqptr
->airq_ipl
!= max_ipl
&&
418 !ioapic_mask_workaround
[irqptr
->airq_ioapicindex
]) {
420 vector
= apic_allocate_vector(max_ipl
, irqindex
, 1);
/* NOTE(review): allocation-failure check lost; share is unwound here. */
422 irqptr
->airq_share
--;
423 return (PSM_FAILURE
);
426 apic_mark_vector(irqptr
->airq_vector
, vector
);
428 irqptr
->airq_vector
= vector
;
429 irqptr
->airq_ipl
= (uchar_t
)max_ipl
;
431 * reprogram irq being added and every one else
432 * who is not in the UNINIT state
434 if ((VIRTIRQ(irqindex
, irqptr
->airq_share_id
) ==
435 irqno
) || (irqptr
->airq_temp_cpu
!= IRQ_UNINIT
)) {
436 apic_record_rdt_entry(irqptr
, irqindex
);
438 iflag
= intr_clear();
439 lock_set(&apic_ioapic_lock
);
441 (void) apic_setup_io_intr(irqptr
, irqindex
,
444 lock_clear(&apic_ioapic_lock
);
447 irqptr
= irqptr
->airq_next
;
449 return (PSM_SUCCESS
);
451 } else if (irqptr
->airq_ipl
!= max_ipl
&&
452 ioapic_mask_workaround
[irqptr
->airq_ioapicindex
]) {
454 * We cannot upgrade the vector, but we can change
455 * the IPL that this vector induces.
457 * Note that we subtract APIC_BASE_VECT from the vector
458 * here because this array is used in apic_intr_enter
459 * (no need to add APIC_BASE_VECT in that hot code
460 * path since we can do it in the rarely-executed path
463 apic_ipls
[irqptr
->airq_vector
- APIC_BASE_VECT
] =
468 irqptr
->airq_ipl
= (uchar_t
)max_ipl
;
469 irqptr
= irqptr
->airq_next
;
472 return (PSM_SUCCESS
);
477 iflag
= intr_clear();
478 lock_set(&apic_ioapic_lock
);
480 (void) apic_setup_io_intr(irqptr
, irqindex
, B_FALSE
);
482 lock_clear(&apic_ioapic_lock
);
485 return (PSM_SUCCESS
);
489 * Recompute mask bits for the given interrupt vector.
490 * If there is no interrupt servicing routine for this
491 * vector, this function should disable interrupt vector
492 * from happening at all IPLs. If there are still
493 * handlers using the given vector, this function should
494 * disable the given vector from happening below the lowest
495 * IPL of the remaining handlers.
/*
 * apic_delspl_common() — remove a handler IPL from an IRQ.
 * Visible work: decrements the share count; before picinit simply frees
 * the vector and marks the entry FREE_INDEX/IRQ_UNINIT.  Otherwise it
 * downgrades the vector (or, with the mask workaround, recomputes the
 * apic_ipls[] entry clamped to the vector's inherent hardware priority),
 * and when no sharers remain it disables MSI/MSI-X or masks the I/O APIC
 * RDT entry, frees the interrupt-remap entry, releases the CPU binding
 * accounting, frees the vector, and unlinks/frees shared-chain entries.
 * Returns PSM_SUCCESS / PSM_FAILURE (visible paths all return SUCCESS).
 *
 * NOTE(review): corrupted listing — embedded line numbers, split
 * statements, many missing lines (locals, loop heads, several braces and
 * conditions).  Restore from a pristine upstream copy before editing.
 */
499 apic_delspl_common(int irqno
, int ipl
, int min_ipl
, int max_ipl
)
505 apic_irq_t
*irqptr
, *preirqptr
, *irqheadptr
, *irqp
;
508 mutex_enter(&airq_mutex
);
509 irqindex
= IRQINDEX(irqno
);
510 irqptr
= preirqptr
= irqheadptr
= apic_irq_table
[irqindex
];
512 DDI_INTR_IMPLDBG((CE_CONT
, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
513 "vector=0x%x\n", (void *)irqptr
->airq_dip
,
514 irqptr
->airq_mps_intr_index
, irqno
, irqptr
->airq_vector
));
/* Walk the share chain, remembering the predecessor for later unlink. */
517 if (VIRTIRQ(irqindex
, irqptr
->airq_share_id
) == irqno
)
520 irqptr
= irqptr
->airq_next
;
524 irqptr
->airq_share
--;
526 mutex_exit(&airq_mutex
);
529 * If there are more interrupts at a higher IPL, we don't need
530 * to disable anything.
533 return (PSM_SUCCESS
);
535 /* return if it is not hardware interrupt */
536 if (irqptr
->airq_mps_intr_index
== RESERVE_INDEX
)
537 return (PSM_SUCCESS
);
539 if (!apic_picinit_called
) {
541 * Clear irq_struct. If two devices shared an intpt
542 * line & 1 unloaded before picinit, we are hosed. But, then
543 * we hope the machine survive.
545 irqptr
->airq_mps_intr_index
= FREE_INDEX
;
546 irqptr
->airq_temp_cpu
= IRQ_UNINIT
;
547 apic_free_vector(irqptr
->airq_vector
);
548 return (PSM_SUCCESS
);
551 * Downgrade vector to new max_ipl if needed. If we cannot allocate,
552 * use old IPL. Not very elegant, but it should work.
554 if ((irqptr
->airq_ipl
!= max_ipl
) && (max_ipl
!= PSM_INVALID_IPL
) &&
555 !ioapic_mask_workaround
[irqptr
->airq_ioapicindex
]) {
557 if ((vector
= apic_allocate_vector(max_ipl
, irqno
, 1))) {
558 apic_mark_vector(irqheadptr
->airq_vector
, vector
);
561 irqp
->airq_vector
= vector
;
562 irqp
->airq_ipl
= (uchar_t
)max_ipl
;
563 if (irqp
->airq_temp_cpu
!= IRQ_UNINIT
) {
564 apic_record_rdt_entry(irqp
, irqindex
);
566 iflag
= intr_clear();
567 lock_set(&apic_ioapic_lock
);
569 (void) apic_setup_io_intr(irqp
,
572 lock_clear(&apic_ioapic_lock
);
575 irqp
= irqp
->airq_next
;
579 } else if (irqptr
->airq_ipl
!= max_ipl
&&
580 max_ipl
!= PSM_INVALID_IPL
&&
581 ioapic_mask_workaround
[irqptr
->airq_ioapicindex
]) {
584 * We cannot downgrade the IPL of the vector below the vector's
585 * hardware priority. If we did, it would be possible for a
586 * higher-priority hardware vector to interrupt a CPU running at an IPL
587 * lower than the hardware priority of the interrupting vector (but
588 * higher than the soft IPL of this IRQ). When this happens, we would
589 * then try to drop the IPL BELOW what it was (effectively dropping
590 * below base_spl) which would be potentially catastrophic.
592 * (e.g. Suppose the hardware vector associated with this IRQ is 0x40
593 * (hardware IPL of 4). Further assume that the old IPL of this IRQ
594 * was 4, but the new IPL is 1. If we forced vector 0x40 to result in
595 * an IPL of 1, it would be possible for the processor to be executing
596 * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting
597 * the currently-executing ISR. When apic_intr_enter consults
598 * apic_irqs[], it will return 1, bringing the IPL of the CPU down to 1
599 * so even though the processor was running at IPL 4, an IPL 1
600 * interrupt will have interrupted it, which must not happen)).
602 * Effectively, this means that the hardware priority corresponding to
603 * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's
606 * (In the above example, then, after removal of the IPL 4 device's
607 * interrupt handler, the new IPL will continue to be 4 because the
608 * hardware priority that IPL 1 implies is lower than the hardware
609 * priority of the vector used.)
611 /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
612 const int apic_ipls_index
= irqptr
->airq_vector
-
614 const int vect_inherent_hwpri
= irqptr
->airq_vector
>>
618 * If there are still devices using this IRQ, determine the
621 if (irqptr
->airq_share
) {
622 int vect_desired_hwpri
, hwpri
;
624 ASSERT(max_ipl
< MAXIPL
);
625 vect_desired_hwpri
= apic_ipltopri
[max_ipl
] >>
629 * If the desired IPL's hardware priority is lower
630 * than that of the vector, use the hardware priority
631 * of the vector to determine the new IPL.
633 hwpri
= (vect_desired_hwpri
< vect_inherent_hwpri
) ?
634 vect_inherent_hwpri
: vect_desired_hwpri
;
637 * Now, to get the right index for apic_vectortoipl,
638 * we need to subtract APIC_BASE_VECT from the
639 * hardware-vector-equivalent (in hwpri). Since hwpri
640 * is already shifted, we shift APIC_BASE_VECT before
641 * doing the subtraction.
643 hwpri
-= (APIC_BASE_VECT
>> APIC_IPL_SHIFT
);
646 ASSERT(hwpri
< MAXIPL
);
647 max_ipl
= apic_vectortoipl
[hwpri
];
648 apic_ipls
[apic_ipls_index
] = (uchar_t
)max_ipl
;
652 irqp
->airq_ipl
= (uchar_t
)max_ipl
;
653 irqp
= irqp
->airq_next
;
657 * No more devices on this IRQ, so reset this vector's
658 * element in apic_ipls to the original IPL for this
661 apic_ipls
[apic_ipls_index
] =
662 apic_vectortoipl
[vect_inherent_hwpri
];
667 * If there are still active interrupts, we are done.
669 if (irqptr
->airq_share
)
670 return (PSM_SUCCESS
);
672 iflag
= intr_clear();
673 lock_set(&apic_ioapic_lock
);
675 if (irqptr
->airq_mps_intr_index
== MSI_INDEX
) {
677 * Disable the MSI vector
678 * Make sure we only disable on the last
679 * of the multi-MSI support
681 if (i_ddi_intr_get_current_nenables(irqptr
->airq_dip
) == 1) {
682 apic_pci_msi_disable_mode(irqptr
->airq_dip
,
685 } else if (irqptr
->airq_mps_intr_index
== MSIX_INDEX
) {
687 * Disable the MSI-X vector
688 * needs to clear its mask and addr/data for each MSI-X
690 apic_pci_msi_unconfigure(irqptr
->airq_dip
, DDI_INTR_TYPE_MSIX
,
691 irqptr
->airq_origirq
);
693 * Make sure we only disable on the last MSI-X
695 if (i_ddi_intr_get_current_nenables(irqptr
->airq_dip
) == 1) {
696 apic_pci_msi_disable_mode(irqptr
->airq_dip
,
701 * The assumption here is that this is safe, even for
702 * systems with IOAPICs that suffer from the hardware
703 * erratum because all devices have been quiesced before
704 * they unregister their interrupt handlers. If that
705 * assumption turns out to be false, this mask operation
706 * can induce the same erratum result we're trying to
709 ioapic_ix
= irqptr
->airq_ioapicindex
;
710 intin
= irqptr
->airq_intin_no
;
711 ioapic_write(ioapic_ix
, APIC_RDT_CMD
+ 2 * intin
, AV_MASK
);
714 apic_vt_ops
->apic_intrmap_free_entry(&irqptr
->airq_intrmap_private
);
717 * This irq entry is the only one in the chain.
719 if (irqheadptr
->airq_next
== NULL
) {
720 ASSERT(irqheadptr
== irqptr
);
721 bind_cpu
= irqptr
->airq_temp_cpu
;
722 if (((uint32_t)bind_cpu
!= IRQ_UNBOUND
) &&
723 ((uint32_t)bind_cpu
!= IRQ_UNINIT
)) {
724 ASSERT(apic_cpu_in_range(bind_cpu
));
725 if (bind_cpu
& IRQ_USER_BOUND
) {
726 /* If hardbound, temp_cpu == cpu */
727 bind_cpu
&= ~IRQ_USER_BOUND
;
728 apic_cpus
[bind_cpu
].aci_bound
--;
730 apic_cpus
[bind_cpu
].aci_temp_bound
--;
732 irqptr
->airq_temp_cpu
= IRQ_UNINIT
;
733 irqptr
->airq_mps_intr_index
= FREE_INDEX
;
734 lock_clear(&apic_ioapic_lock
);
736 apic_free_vector(irqptr
->airq_vector
);
737 return (PSM_SUCCESS
);
741 * If we get here, we are sharing the vector and there are more than
742 * one active irq entries in the chain.
744 lock_clear(&apic_ioapic_lock
);
747 mutex_enter(&airq_mutex
);
748 /* Remove the irq entry from the chain */
749 if (irqptr
== irqheadptr
) { /* The irq entry is at the head */
750 apic_irq_table
[irqindex
] = irqptr
->airq_next
;
752 preirqptr
->airq_next
= irqptr
->airq_next
;
754 /* Free the irq entry */
755 kmem_free(irqptr
, sizeof (apic_irq_t
));
756 mutex_exit(&airq_mutex
);
758 return (PSM_SUCCESS
);
762 * apic_introp_xlate() replaces apic_translate_irq() and is
763 * called only from apic_intr_ops(). With the new ADII framework,
764 * the priority can no longer be retrieved through i_ddi_get_intrspec().
765 * It has to be passed in from the caller.
768 * Success: irqno for the given device
/*
 * apic_introp_xlate() — translate a device interrupt spec to an IRQ number.
 * Visible work: determines whether the parent/child are PCI/PCIe from the
 * "device_type"/"compatible" properties; handles MSI/MSI-X via
 * apic_find_irq/apic_setup_irq_table; returns a cached translation from
 * apic_irq_table when one exists; otherwise resolves PCI interrupts via
 * ACPI (_PRT) or MP-table busid lookups, ISA/EISA interrupts via ACPI
 * Interrupt Source Override entries, and finally falls back to the MPS
 * default configuration.  Priority is taken from ispec (see block comment
 * upstream), since i_ddi_get_intrspec() can no longer supply it.
 *
 * NOTE(review): corrupted listing — embedded line numbers, split
 * statements, and many missing lines (locals such as ipin/intr_flag/airqp,
 * several conditions, braces, and return statements).  Restore from a
 * pristine upstream copy before editing.
 */
772 apic_introp_xlate(dev_info_t
*dip
, struct intrspec
*ispec
, int type
)
775 int dev_len
, pci_irq
, newirq
, bustype
, devid
, busid
, i
;
776 int irqno
= ispec
->intrspec_vec
;
777 ddi_acc_handle_t cfg_handle
;
779 struct apic_io_intr
*intrp
;
781 ACPI_SUBTABLE_HEADER
*hp
;
782 ACPI_MADT_INTERRUPT_OVERRIDE
*isop
;
784 int parent_is_pci_or_pciex
= 0;
785 int child_is_pciex
= 0;
787 DDI_INTR_IMPLDBG((CE_CONT
, "apic_introp_xlate: dip=0x%p name=%s "
788 "type=%d irqno=0x%x\n", (void *)dip
, ddi_get_name(dip
), type
,
791 dev_len
= sizeof (dev_type
);
792 if (ddi_getlongprop_buf(DDI_DEV_T_ANY
, ddi_get_parent(dip
),
793 DDI_PROP_DONTPASS
, "device_type", (caddr_t
)dev_type
,
794 &dev_len
) == DDI_PROP_SUCCESS
) {
795 if ((strcmp(dev_type
, "pci") == 0) ||
796 (strcmp(dev_type
, "pciex") == 0))
797 parent_is_pci_or_pciex
= 1;
800 if (ddi_getlongprop_buf(DDI_DEV_T_ANY
, dip
,
801 DDI_PROP_DONTPASS
, "compatible", (caddr_t
)dev_type
,
802 &dev_len
) == DDI_PROP_SUCCESS
) {
803 if (strstr(dev_type
, "pciex"))
807 if (DDI_INTR_IS_MSI_OR_MSIX(type
)) {
808 if ((airqp
= apic_find_irq(dip
, ispec
, type
)) != NULL
) {
809 airqp
->airq_iflag
.bustype
=
810 child_is_pciex
? BUS_PCIE
: BUS_PCI
;
811 return (apic_vector_to_irq
[airqp
->airq_vector
]);
813 return (apic_setup_irq_table(dip
, irqno
, NULL
, ispec
,
819 /* check if we have already translated this irq */
820 mutex_enter(&airq_mutex
);
821 newirq
= apic_min_device_irq
;
822 for (; newirq
<= apic_max_device_irq
; newirq
++) {
823 airqp
= apic_irq_table
[newirq
];
825 if ((airqp
->airq_dip
== dip
) &&
826 (airqp
->airq_origirq
== irqno
) &&
827 (airqp
->airq_mps_intr_index
!= FREE_INDEX
)) {
829 mutex_exit(&airq_mutex
);
830 return (VIRTIRQ(newirq
, airqp
->airq_share_id
));
832 airqp
= airqp
->airq_next
;
835 mutex_exit(&airq_mutex
);
840 if ((dip
== NULL
) || (!apic_irq_translate
&& !apic_enable_acpi
))
843 if (parent_is_pci_or_pciex
) {
845 if (acpica_get_bdf(dip
, &busid
, &devid
, NULL
) != 0)
847 if (busid
== 0 && apic_pci_bus_total
== 1)
848 busid
= (int)apic_single_pci_busid
;
850 if (pci_config_setup(dip
, &cfg_handle
) != DDI_SUCCESS
)
852 ipin
= pci_config_get8(cfg_handle
, PCI_CONF_IPIN
) - PCI_INTA
;
853 pci_config_teardown(&cfg_handle
);
854 if (apic_enable_acpi
&& !apic_use_acpi_madt_only
) {
855 if (apic_acpi_translate_pci_irq(dip
, busid
, devid
,
856 ipin
, &pci_irq
, &intr_flag
) != ACPI_PSM_SUCCESS
)
859 intr_flag
.bustype
= child_is_pciex
? BUS_PCIE
: BUS_PCI
;
860 return (apic_setup_irq_table(dip
, pci_irq
, NULL
, ispec
,
863 pci_irq
= ((devid
& 0x1f) << 2) | (ipin
& 0x3);
864 if ((intrp
= apic_find_io_intr_w_busid(pci_irq
, busid
))
866 if ((pci_irq
= apic_handle_pci_pci_bridge(dip
,
867 devid
, ipin
, &intrp
)) == -1)
870 return (apic_setup_irq_table(dip
, pci_irq
, intrp
, ispec
,
873 } else if (strcmp(dev_type
, "isa") == 0)
875 else if (strcmp(dev_type
, "eisa") == 0)
879 if (apic_enable_acpi
&& !apic_use_acpi_madt_only
) {
880 /* search iso entries first */
881 if (acpi_iso_cnt
!= 0) {
882 hp
= (ACPI_SUBTABLE_HEADER
*)acpi_isop
;
884 while (i
< acpi_iso_cnt
) {
886 ACPI_MADT_TYPE_INTERRUPT_OVERRIDE
) {
888 (ACPI_MADT_INTERRUPT_OVERRIDE
*) hp
;
889 if (isop
->Bus
== 0 &&
890 isop
->SourceIrq
== irqno
) {
891 newirq
= isop
->GlobalIrq
;
894 ACPI_MADT_POLARITY_MASK
;
897 ACPI_MADT_TRIGGER_MASK
)
899 intr_flag
.bustype
= BUS_ISA
;
901 return (apic_setup_irq_table(
902 dip
, newirq
, NULL
, ispec
,
908 hp
= (ACPI_SUBTABLE_HEADER
*)(((char *)hp
) +
912 intr_flag
.intr_po
= INTR_PO_ACTIVE_HIGH
;
913 intr_flag
.intr_el
= INTR_EL_EDGE
;
914 intr_flag
.bustype
= BUS_ISA
;
915 return (apic_setup_irq_table(dip
, irqno
, NULL
, ispec
,
918 if (bustype
== 0) /* not initialized */
919 bustype
= eisa_level_intr_mask
? BUS_EISA
: BUS_ISA
;
920 for (i
= 0; i
< 2; i
++) {
921 if (((busid
= apic_find_bus_id(bustype
)) != -1) &&
922 ((intrp
= apic_find_io_intr_w_busid(irqno
, busid
))
924 if ((newirq
= apic_setup_irq_table(dip
, irqno
,
925 intrp
, ispec
, NULL
, type
)) != -1) {
930 bustype
= (bustype
== BUS_EISA
) ? BUS_ISA
: BUS_EISA
;
934 /* MPS default configuration */
936 newirq
= apic_setup_irq_table(dip
, irqno
, NULL
, ispec
, NULL
, type
);
939 ASSERT(IRQINDEX(newirq
) == irqno
);
940 ASSERT(apic_irq_table
[irqno
]);
945 * Attempt to share vector with someone else
/*
 * apic_share_vector() — attempt to share an existing vector with another
 * IRQ when no free vector is available at the requested IPL.
 * Visible work: builds a dummy apic_irq_t and records its RDT entry, scans
 * the vector range for the IPL for an existing IRQ whose RDT high byte
 * matches and whose share count is lowest, then (under airq_mutex) picks a
 * free-or-larger share id, allocates a new chain entry linked off the
 * chosen IRQ, copies the vector/flags into it, and returns
 * VIRTIRQ(chosen_irq, share_id).
 *
 * NOTE(review): corrupted listing — embedded line numbers, split
 * statements, and missing lines (chosen_irq assignment, loop braces,
 * failure-return path).  Restore from a pristine upstream copy before
 * editing.
 */
948 apic_share_vector(int irqno
, iflag_t
*intr_flagp
, short intr_index
, int ipl
,
949 uchar_t ioapicindex
, uchar_t ipin
, apic_irq_t
**irqptrp
)
952 apic_irq_t
*tmpirqp
= NULL
;
954 apic_irq_t
*irqptr
, dummyirq
;
955 int newirq
, chosen_irq
= -1, share
= 127;
956 int lowest
, highest
, i
;
959 DDI_INTR_IMPLDBG((CE_CONT
, "apic_share_vector: irqno=0x%x "
960 "intr_index=0x%x ipl=0x%x\n", irqno
, intr_index
, ipl
));
962 highest
= apic_ipltopri
[ipl
] + APIC_VECTOR_MASK
;
963 lowest
= apic_ipltopri
[ipl
-1] + APIC_VECTOR_PER_IPL
;
965 if (highest
< lowest
) /* Both ipl and ipl-1 map to same pri */
966 lowest
-= APIC_VECTOR_PER_IPL
;
967 dummyirq
.airq_mps_intr_index
= intr_index
;
968 dummyirq
.airq_ioapicindex
= ioapicindex
;
969 dummyirq
.airq_intin_no
= ipin
;
971 dummyirq
.airq_iflag
= *intr_flagp
;
972 apic_record_rdt_entry(&dummyirq
, irqno
);
/* Scan candidate vectors in this IPL band for a compatible, least-shared IRQ. */
973 for (i
= lowest
; i
<= highest
; i
++) {
974 newirq
= apic_vector_to_irq
[i
];
975 if (newirq
== APIC_RESV_IRQ
)
977 irqptr
= apic_irq_table
[newirq
];
979 if ((dummyirq
.airq_rdt_entry
& 0xFF00) !=
980 (irqptr
->airq_rdt_entry
& 0xFF00))
984 if (irqptr
->airq_share
< share
) {
985 share
= irqptr
->airq_share
;
989 if (chosen_irq
!= -1) {
991 * Assign a share id which is free or which is larger
992 * than the largest one.
995 mutex_enter(&airq_mutex
);
996 irqptr
= apic_irq_table
[chosen_irq
];
998 if (irqptr
->airq_mps_intr_index
== FREE_INDEX
) {
999 share_id
= irqptr
->airq_share_id
;
1002 if (share_id
<= irqptr
->airq_share_id
)
1003 share_id
= irqptr
->airq_share_id
+ 1;
1007 irqptr
= irqptr
->airq_next
;
1010 irqptr
= kmem_zalloc(sizeof (apic_irq_t
), KM_SLEEP
);
1011 irqptr
->airq_temp_cpu
= IRQ_UNINIT
;
1013 apic_irq_table
[chosen_irq
]->airq_next
;
1014 apic_irq_table
[chosen_irq
]->airq_next
= irqptr
;
1016 tmpirqp
= apic_irq_table
[chosen_irq
];
1019 irqptr
->airq_mps_intr_index
= intr_index
;
1020 irqptr
->airq_ioapicindex
= ioapicindex
;
1021 irqptr
->airq_intin_no
= ipin
;
1023 irqptr
->airq_iflag
= *intr_flagp
;
1024 irqptr
->airq_vector
= apic_irq_table
[chosen_irq
]->airq_vector
;
1025 irqptr
->airq_share_id
= share_id
;
1026 apic_record_rdt_entry(irqptr
, irqno
);
1029 /* shuffle the pointers to test apic_delspl path */
1031 tmpirqp
->airq_next
= irqptr
->airq_next
;
1032 irqptr
->airq_next
= apic_irq_table
[chosen_irq
];
1033 apic_irq_table
[chosen_irq
] = irqptr
;
1036 mutex_exit(&airq_mutex
);
1037 return (VIRTIRQ(chosen_irq
, share_id
));
1043 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry
1044 * is used already, we will try to allocate a new irqno.
/*
 * apic_setup_irq_table() — allocate/initialize the apic_irq_table[] entry
 * for an interrupt and bind it to a vector and CPU.
 * Visible work: derives origirq/ipl from ispec; for MSI/MSI-X allocates a
 * fresh irqno (no I/O APIC setup); for MP-table interrupts (intrp) resolves
 * the I/O APIC index and intin; for ACPI interrupts (intr_flagp) resolves
 * the I/O APIC from the global irq; otherwise uses the MPS default mapping.
 * Then allocates a vector (falling back to apic_share_vector, warning when
 * vectors end up shared, then retrying a high-priority allocation), creates
 * or reuses the table entry (relocating to a free irqno if the slot is
 * taken, via apic_modify_vector), fills in the airq_* fields, binds a CPU
 * with apic_bind_intr(), and records the RDT entry for non-MSI interrupts.
 *
 * NOTE(review): corrupted listing — embedded line numbers, split
 * statements, missing lines (locals such as irqptr/sdip/origirq/ipl/major,
 * several returns, braces, and conditions).  Restore from a pristine
 * upstream copy before editing.
 */
1051 apic_setup_irq_table(dev_info_t
*dip
, int irqno
, struct apic_io_intr
*intrp
,
1052 struct intrspec
*ispec
, iflag_t
*intr_flagp
, int type
)
1056 int newirq
, intr_index
;
1057 uchar_t ipin
, ioapic
, ioapicindex
, vector
;
1062 ASSERT(ispec
!= NULL
);
1064 origirq
= ispec
->intrspec_vec
;
1065 ipl
= ispec
->intrspec_pri
;
1067 DDI_INTR_IMPLDBG((CE_CONT
, "apic_setup_irq_table: dip=0x%p type=%d "
1068 "irqno=0x%x origirq=0x%x\n", (void *)dip
, type
, irqno
, origirq
));
1070 major
= (dip
!= NULL
) ? ddi_driver_major(dip
) : 0;
1072 if (DDI_INTR_IS_MSI_OR_MSIX(type
)) {
1073 /* MSI/X doesn't need to setup ioapic stuffs */
1076 ipin
= (uchar_t
)0xff;
1077 intr_index
= (type
== DDI_INTR_TYPE_MSI
) ? MSI_INDEX
:
1079 mutex_enter(&airq_mutex
);
1080 if ((irqno
= apic_allocate_irq(apic_first_avail_irq
)) == -1) {
1081 mutex_exit(&airq_mutex
);
1082 /* need an irq for MSI/X to index into autovect[] */
1083 cmn_err(CE_WARN
, "No interrupt irq: %s instance %d",
1084 ddi_get_name(dip
), ddi_get_instance(dip
));
1087 mutex_exit(&airq_mutex
);
1089 } else if (intrp
!= NULL
) {
1090 intr_index
= (int)(intrp
- apic_io_intrp
);
1091 ioapic
= intrp
->intr_destid
;
1092 ipin
= intrp
->intr_destintin
;
1093 /* Find ioapicindex. If destid was ALL, we will exit with 0. */
1094 for (ioapicindex
= apic_io_max
- 1; ioapicindex
; ioapicindex
--)
1095 if (apic_io_id
[ioapicindex
] == ioapic
)
1097 ASSERT((ioapic
== apic_io_id
[ioapicindex
]) ||
1098 (ioapic
== INTR_ALL_APIC
));
1100 /* check whether this intin# has been used by another irqno */
1101 if ((newirq
= apic_find_intin(ioapicindex
, ipin
)) != -1) {
1105 } else if (intr_flagp
!= NULL
) {
1107 intr_index
= ACPI_INDEX
;
1108 ioapicindex
= acpi_find_ioapic(irqno
);
1109 ASSERT(ioapicindex
!= 0xFF);
1110 ioapic
= apic_io_id
[ioapicindex
];
1111 ipin
= irqno
- apic_io_vectbase
[ioapicindex
];
1112 if (apic_irq_table
[irqno
] &&
1113 apic_irq_table
[irqno
]->airq_mps_intr_index
== ACPI_INDEX
) {
1114 ASSERT(apic_irq_table
[irqno
]->airq_intin_no
== ipin
&&
1115 apic_irq_table
[irqno
]->airq_ioapicindex
==
1121 /* default configuration */
1123 ioapic
= apic_io_id
[ioapicindex
];
1124 ipin
= (uchar_t
)irqno
;
1125 intr_index
= DEFAULT_INDEX
;
1128 if ((vector
= apic_allocate_vector(ipl
, irqno
, 0)) == 0) {
1129 if ((newirq
= apic_share_vector(irqno
, intr_flagp
, intr_index
,
1130 ipl
, ioapicindex
, ipin
, &irqptr
)) != -1) {
1131 irqptr
->airq_ipl
= ipl
;
1132 irqptr
->airq_origirq
= (uchar_t
)origirq
;
1133 irqptr
->airq_dip
= dip
;
1134 irqptr
->airq_major
= major
;
1135 sdip
= apic_irq_table
[IRQINDEX(newirq
)]->airq_dip
;
1136 /* This is OK to do really */
1138 cmn_err(CE_WARN
, "Sharing vectors: %s"
1139 " instance %d and SCI",
1140 ddi_get_name(dip
), ddi_get_instance(dip
));
1142 cmn_err(CE_WARN
, "Sharing vectors: %s"
1143 " instance %d and %s instance %d",
1144 ddi_get_name(sdip
), ddi_get_instance(sdip
),
1145 ddi_get_name(dip
), ddi_get_instance(dip
));
1149 /* try high priority allocation now that share has failed */
1150 if ((vector
= apic_allocate_vector(ipl
, irqno
, 1)) == 0) {
1151 cmn_err(CE_WARN
, "No interrupt vector: %s instance %d",
1152 ddi_get_name(dip
), ddi_get_instance(dip
));
1157 mutex_enter(&airq_mutex
);
1158 if (apic_irq_table
[irqno
] == NULL
) {
1159 irqptr
= kmem_zalloc(sizeof (apic_irq_t
), KM_SLEEP
);
1160 irqptr
->airq_temp_cpu
= IRQ_UNINIT
;
1161 apic_irq_table
[irqno
] = irqptr
;
1163 irqptr
= apic_irq_table
[irqno
];
1164 if (irqptr
->airq_mps_intr_index
!= FREE_INDEX
) {
1166 * The slot is used by another irqno, so allocate
1167 * a free irqno for this interrupt
1169 newirq
= apic_allocate_irq(apic_first_avail_irq
);
1171 mutex_exit(&airq_mutex
);
1175 irqptr
= apic_irq_table
[irqno
];
1176 if (irqptr
== NULL
) {
1177 irqptr
= kmem_zalloc(sizeof (apic_irq_t
),
1179 irqptr
->airq_temp_cpu
= IRQ_UNINIT
;
1180 apic_irq_table
[irqno
] = irqptr
;
1182 vector
= apic_modify_vector(vector
, newirq
);
1185 apic_max_device_irq
= max(irqno
, apic_max_device_irq
);
1186 apic_min_device_irq
= min(irqno
, apic_min_device_irq
);
1187 mutex_exit(&airq_mutex
);
1188 irqptr
->airq_ioapicindex
= ioapicindex
;
1189 irqptr
->airq_intin_no
= ipin
;
1190 irqptr
->airq_ipl
= ipl
;
1191 irqptr
->airq_vector
= vector
;
1192 irqptr
->airq_origirq
= (uchar_t
)origirq
;
1193 irqptr
->airq_share_id
= 0;
1194 irqptr
->airq_mps_intr_index
= (short)intr_index
;
1195 irqptr
->airq_dip
= dip
;
1196 irqptr
->airq_major
= major
;
1197 irqptr
->airq_cpu
= apic_bind_intr(dip
, irqno
, ioapic
, ipin
);
1199 irqptr
->airq_iflag
= *intr_flagp
;
1201 if (!DDI_INTR_IS_MSI_OR_MSIX(type
)) {
1202 /* setup I/O APIC entry for non-MSI/X interrupts */
1203 apic_record_rdt_entry(irqptr
, irqno
);
1209 * return the cpu to which this intr should be bound.
1210 * Check properties or any other mechanism to see if user wants it
1211 * bound to a specific CPU. If so, return the cpu id with high bit set.
1212 * If not, use the policy to choose a cpu and return the id.
1215 apic_bind_intr(dev_info_t
*dip
, int irq
, uchar_t ioapicid
, uchar_t intin
)
1217 int instance
, instno
, prop_len
, bind_cpu
, count
;
1221 char *name
, *drv_name
, *prop_val
, *cptr
;
1226 if (apic_intr_policy
== INTR_LOWEST_PRIORITY
)
1227 return (IRQ_UNBOUND
);
1229 if (apic_nproc
== 1)
1233 * dip may be NULL for interrupts not associated with a device driver,
1234 * such as the ACPI SCI or HPET interrupts. In that case just use the
1235 * next CPU and return.
1238 iflag
= intr_clear();
1239 lock_set(&apic_ioapic_lock
);
1240 bind_cpu
= apic_get_next_bind_cpu();
1241 lock_clear(&apic_ioapic_lock
);
1242 intr_restore(iflag
);
1244 cmn_err(CE_CONT
, "!%s: irq 0x%x "
1245 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1246 psm_name
, irq
, apic_irq_table
[irq
]->airq_vector
, ioapicid
,
1247 intin
, bind_cpu
& ~IRQ_USER_BOUND
);
1249 return ((uint32_t)bind_cpu
);
1252 name
= ddi_get_name(dip
);
1253 major
= ddi_name_to_major(name
);
1254 drv_name
= ddi_major_to_name(major
);
1255 instance
= ddi_get_instance(dip
);
1256 if (apic_intr_policy
== INTR_ROUND_ROBIN_WITH_AFFINITY
) {
1257 i
= apic_min_device_irq
;
1258 for (; i
<= apic_max_device_irq
; i
++) {
1259 if ((i
== irq
) || (apic_irq_table
[i
] == NULL
) ||
1260 (apic_irq_table
[i
]->airq_mps_intr_index
1264 if ((apic_irq_table
[i
]->airq_major
== major
) &&
1265 (!(apic_irq_table
[i
]->airq_cpu
& IRQ_USER_BOUND
))) {
1266 cpu
= apic_irq_table
[i
]->airq_cpu
;
1269 "!%s: %s (%s) instance #%d "
1270 "irq 0x%x vector 0x%x ioapic 0x%x "
1271 "intin 0x%x is bound to cpu %d\n",
1273 name
, drv_name
, instance
, irq
,
1274 apic_irq_table
[irq
]->airq_vector
,
1275 ioapicid
, intin
, cpu
);
1281 * search for "drvname"_intpt_bind_cpus property first, the
1282 * syntax of the property should be "a[,b,c,...]" where
1283 * instance 0 binds to cpu a, instance 1 binds to cpu b,
1284 * instance 3 binds to cpu c...
1285 * ddi_getlongprop() will search /option first, then /
1286 * if "drvname"_intpt_bind_cpus doesn't exist, then find
1287 * intpt_bind_cpus property. The syntax is the same, and
1288 * it applies to all the devices if its "drvname" specific
1289 * property doesn't exist
1291 (void) strcpy(prop_name
, drv_name
);
1292 (void) strcat(prop_name
, "_intpt_bind_cpus");
1293 rc
= ddi_getlongprop(DDI_DEV_T_ANY
, dip
, 0, prop_name
,
1294 (caddr_t
)&prop_val
, &prop_len
);
1295 if (rc
!= DDI_PROP_SUCCESS
) {
1296 rc
= ddi_getlongprop(DDI_DEV_T_ANY
, dip
, 0,
1297 "intpt_bind_cpus", (caddr_t
)&prop_val
, &prop_len
);
1299 if (rc
== DDI_PROP_SUCCESS
) {
1300 for (i
= count
= 0; i
< (prop_len
- 1); i
++)
1301 if (prop_val
[i
] == ',')
1303 if (prop_val
[i
-1] != ',')
1306 * if somehow the binding instances defined in the
1307 * property are not enough for this instno., then
1308 * reuse the pattern for the next instance until
1309 * it reaches the requested instno
1311 instno
= instance
% count
;
1317 bind_cpu
= stoi(&cptr
);
1318 kmem_free(prop_val
, prop_len
);
1319 /* if specific CPU is bogus, then default to next cpu */
1320 if (!apic_cpu_in_range(bind_cpu
)) {
1321 cmn_err(CE_WARN
, "%s: %s=%s: CPU %d not present",
1322 psm_name
, prop_name
, prop_val
, bind_cpu
);
1323 rc
= DDI_PROP_NOT_FOUND
;
1325 /* indicate that we are bound at user request */
1326 bind_cpu
|= IRQ_USER_BOUND
;
1329 * no need to check apic_cpus[].aci_status, if specific CPU is
1330 * not up, then post_cpu_start will handle it.
1334 if (rc
!= DDI_PROP_SUCCESS
) {
1335 iflag
= intr_clear();
1336 lock_set(&apic_ioapic_lock
);
1337 bind_cpu
= apic_get_next_bind_cpu();
1338 lock_clear(&apic_ioapic_lock
);
1339 intr_restore(iflag
);
1342 cmn_err(CE_CONT
, "!%s: %s (%s) instance %d irq 0x%x "
1343 "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1344 psm_name
, name
, drv_name
, instance
, irq
,
1345 apic_irq_table
[irq
]->airq_vector
, ioapicid
, intin
,
1346 bind_cpu
& ~IRQ_USER_BOUND
);
1348 return ((uint32_t)bind_cpu
);
1352 * Mark vector as being in the process of being deleted. Interrupts
1353 * may still come in on some CPU. The moment an interrupt comes with
1354 * the new vector, we know we can free the old one. Called only from
1355 * addspl and delspl with interrupts disabled. Because an interrupt
1356 * can be shared, but no interrupt from either device may come in,
1357 * we also use a timeout mechanism, which we arbitrarily set to
1358 * apic_revector_timeout microseconds.
1361 apic_mark_vector(uchar_t oldvector
, uchar_t newvector
)
1365 iflag
= intr_clear();
1366 lock_set(&apic_revector_lock
);
1367 if (!apic_oldvec_to_newvec
) {
1368 apic_oldvec_to_newvec
=
1369 kmem_zalloc(sizeof (newvector
) * APIC_MAX_VECTOR
* 2,
1372 if (!apic_oldvec_to_newvec
) {
1374 * This failure is not catastrophic.
1375 * But, the oldvec will never be freed.
1377 apic_error
|= APIC_ERR_MARK_VECTOR_FAIL
;
1378 lock_clear(&apic_revector_lock
);
1379 intr_restore(iflag
);
1382 apic_newvec_to_oldvec
= &apic_oldvec_to_newvec
[APIC_MAX_VECTOR
];
1385 /* See if we already did this for drivers which do double addintrs */
1386 if (apic_oldvec_to_newvec
[oldvector
] != newvector
) {
1387 apic_oldvec_to_newvec
[oldvector
] = newvector
;
1388 apic_newvec_to_oldvec
[newvector
] = oldvector
;
1389 apic_revector_pending
++;
1391 lock_clear(&apic_revector_lock
);
1392 intr_restore(iflag
);
1393 (void) timeout(apic_xlate_vector_free_timeout_handler
,
1394 (void *)(uintptr_t)oldvector
, drv_usectohz(apic_revector_timeout
));
1398 * xlate_vector is called from intr_enter if revector_pending is set.
1399 * It will xlate it if needed and mark the old vector as free.
1402 apic_xlate_vector(uchar_t vector
)
1404 uchar_t newvector
, oldvector
= 0;
1406 lock_set(&apic_revector_lock
);
1407 /* Do we really need to do this ? */
1408 if (!apic_revector_pending
) {
1409 lock_clear(&apic_revector_lock
);
1412 if ((newvector
= apic_oldvec_to_newvec
[vector
]) != 0)
1416 * The incoming vector is new . See if a stale entry is
1419 if ((oldvector
= apic_newvec_to_oldvec
[vector
]) != 0)
1424 apic_revector_pending
--;
1425 apic_oldvec_to_newvec
[oldvector
] = 0;
1426 apic_newvec_to_oldvec
[newvector
] = 0;
1427 apic_free_vector(oldvector
);
1428 lock_clear(&apic_revector_lock
);
1429 /* There could have been more than one reprogramming! */
1430 return (apic_xlate_vector(newvector
));
1432 lock_clear(&apic_revector_lock
);
1437 apic_xlate_vector_free_timeout_handler(void *arg
)
1440 uchar_t oldvector
, newvector
;
1442 oldvector
= (uchar_t
)(uintptr_t)arg
;
1443 iflag
= intr_clear();
1444 lock_set(&apic_revector_lock
);
1445 if ((newvector
= apic_oldvec_to_newvec
[oldvector
]) != 0) {
1446 apic_free_vector(oldvector
);
1447 apic_oldvec_to_newvec
[oldvector
] = 0;
1448 apic_newvec_to_oldvec
[newvector
] = 0;
1449 apic_revector_pending
--;
1452 lock_clear(&apic_revector_lock
);
1453 intr_restore(iflag
);
1457 * Bind interrupt corresponding to irq_ptr to bind_cpu.
1458 * Must be called with interrupts disabled and apic_ioapic_lock held
1461 apic_rebind(apic_irq_t
*irq_ptr
, int bind_cpu
,
1462 struct ioapic_reprogram_data
*drep
)
1464 int ioapicindex
, intin_no
;
1465 uint32_t airq_temp_cpu
;
1466 apic_cpus_info_t
*cpu_infop
;
1471 which_irq
= apic_vector_to_irq
[irq_ptr
->airq_vector
];
1473 intin_no
= irq_ptr
->airq_intin_no
;
1474 ioapicindex
= irq_ptr
->airq_ioapicindex
;
1475 airq_temp_cpu
= irq_ptr
->airq_temp_cpu
;
1476 if (airq_temp_cpu
!= IRQ_UNINIT
&& airq_temp_cpu
!= IRQ_UNBOUND
) {
1477 if (airq_temp_cpu
& IRQ_USER_BOUND
)
1478 /* Mask off high bit so it can be used as array index */
1479 airq_temp_cpu
&= ~IRQ_USER_BOUND
;
1481 ASSERT(apic_cpu_in_range(airq_temp_cpu
));
1485 * Can't bind to a CPU that's not accepting interrupts:
1487 cpu_infop
= &apic_cpus
[bind_cpu
& ~IRQ_USER_BOUND
];
1488 if (!(cpu_infop
->aci_status
& APIC_CPU_INTR_ENABLE
))
1492 * If we are about to change the interrupt vector for this interrupt,
1493 * and this interrupt is level-triggered, attached to an IOAPIC,
1494 * has been delivered to a CPU and that CPU has not handled it
1495 * yet, we cannot reprogram the IOAPIC now.
1497 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr
->airq_mps_intr_index
)) {
1499 rdt_entry
= READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex
,
1502 if ((irq_ptr
->airq_vector
!= RDT_VECTOR(rdt_entry
)) &&
1503 apic_check_stuck_interrupt(irq_ptr
, airq_temp_cpu
,
1504 bind_cpu
, ioapicindex
, intin_no
, which_irq
, drep
) != 0) {
1510 * NOTE: We do not unmask the RDT here, as an interrupt MAY
1511 * still come in before we have a chance to reprogram it below.
1512 * The reprogramming below will simultaneously change and
1513 * unmask the RDT entry.
1516 if ((uint32_t)bind_cpu
== IRQ_UNBOUND
) {
1517 irdt
.ir_lo
= AV_LDEST
| AV_LOPRI
|
1518 irq_ptr
->airq_rdt_entry
;
1520 irdt
.ir_hi
= AV_TOALL
>> APIC_ID_BIT_OFFSET
;
1522 apic_vt_ops
->apic_intrmap_alloc_entry(
1523 &irq_ptr
->airq_intrmap_private
, NULL
,
1524 DDI_INTR_TYPE_FIXED
, 1, ioapicindex
);
1525 apic_vt_ops
->apic_intrmap_map_entry(
1526 irq_ptr
->airq_intrmap_private
, (void *)&irdt
,
1527 DDI_INTR_TYPE_FIXED
, 1);
1528 apic_vt_ops
->apic_intrmap_record_rdt(
1529 irq_ptr
->airq_intrmap_private
, &irdt
);
1531 /* Write the RDT entry -- no specific CPU binding */
1532 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex
, intin_no
,
1533 irdt
.ir_hi
| AV_TOALL
);
1535 if (airq_temp_cpu
!= IRQ_UNINIT
&& airq_temp_cpu
!=
1537 apic_cpus
[airq_temp_cpu
].aci_temp_bound
--;
1540 * Write the vector, trigger, and polarity portion of
1543 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex
, intin_no
,
1546 irq_ptr
->airq_temp_cpu
= IRQ_UNBOUND
;
1551 if (bind_cpu
& IRQ_USER_BOUND
) {
1552 cpu_infop
->aci_bound
++;
1554 cpu_infop
->aci_temp_bound
++;
1556 ASSERT(apic_cpu_in_range(bind_cpu
));
1558 if ((airq_temp_cpu
!= IRQ_UNBOUND
) && (airq_temp_cpu
!= IRQ_UNINIT
)) {
1559 apic_cpus
[airq_temp_cpu
].aci_temp_bound
--;
1561 if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr
->airq_mps_intr_index
)) {
1563 irdt
.ir_lo
= AV_PDEST
| AV_FIXED
| irq_ptr
->airq_rdt_entry
;
1564 irdt
.ir_hi
= cpu_infop
->aci_local_id
;
1566 apic_vt_ops
->apic_intrmap_alloc_entry(
1567 &irq_ptr
->airq_intrmap_private
, NULL
, DDI_INTR_TYPE_FIXED
,
1569 apic_vt_ops
->apic_intrmap_map_entry(
1570 irq_ptr
->airq_intrmap_private
,
1571 (void *)&irdt
, DDI_INTR_TYPE_FIXED
, 1);
1572 apic_vt_ops
->apic_intrmap_record_rdt(
1573 irq_ptr
->airq_intrmap_private
, &irdt
);
1575 /* Write the RDT entry -- bind to a specific CPU: */
1576 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex
, intin_no
,
1579 /* Write the vector, trigger, and polarity portion of the RDT */
1580 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex
, intin_no
,
1584 int type
= (irq_ptr
->airq_mps_intr_index
== MSI_INDEX
) ?
1585 DDI_INTR_TYPE_MSI
: DDI_INTR_TYPE_MSIX
;
1586 if (type
== DDI_INTR_TYPE_MSI
) {
1587 if (irq_ptr
->airq_ioapicindex
==
1588 irq_ptr
->airq_origirq
) {
1590 DDI_INTR_IMPLDBG((CE_CONT
, "apic_rebind: call "
1591 "apic_pci_msi_enable_vector\n"));
1592 apic_pci_msi_enable_vector(irq_ptr
,
1593 type
, which_irq
, irq_ptr
->airq_vector
,
1594 irq_ptr
->airq_intin_no
,
1595 cpu_infop
->aci_local_id
);
1597 if ((irq_ptr
->airq_ioapicindex
+
1598 irq_ptr
->airq_intin_no
- 1) ==
1599 irq_ptr
->airq_origirq
) { /* last one */
1600 DDI_INTR_IMPLDBG((CE_CONT
, "apic_rebind: call "
1601 "apic_pci_msi_enable_mode\n"));
1602 apic_pci_msi_enable_mode(irq_ptr
->airq_dip
,
1605 } else { /* MSI-X */
1606 apic_pci_msi_enable_vector(irq_ptr
, type
,
1607 irq_ptr
->airq_origirq
, irq_ptr
->airq_vector
, 1,
1608 cpu_infop
->aci_local_id
);
1609 apic_pci_msi_enable_mode(irq_ptr
->airq_dip
, type
,
1610 irq_ptr
->airq_origirq
);
1613 irq_ptr
->airq_temp_cpu
= (uint32_t)bind_cpu
;
1614 apic_redist_cpu_skip
&= ~(1 << (bind_cpu
& ~IRQ_USER_BOUND
));
1619 apic_last_ditch_clear_remote_irr(int ioapic_ix
, int intin_no
)
1621 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
, intin_no
)
1622 & AV_REMOTE_IRR
) != 0) {
1624 * Trying to clear the bit through normal
1625 * channels has failed. So as a last-ditch
1626 * effort, try to set the trigger mode to
1627 * edge, then to level. This has been
1628 * observed to work on many systems.
1630 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1632 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1633 intin_no
) & ~AV_LEVEL
);
1635 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1637 READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1638 intin_no
) | AV_LEVEL
);
1641 * If the bit's STILL set, this interrupt may
1644 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1645 intin_no
) & AV_REMOTE_IRR
) != 0) {
1647 prom_printf("%s: Remote IRR still "
1648 "not clear for IOAPIC %d intin %d.\n"
1649 "\tInterrupts to this pin may cease "
1650 "functioning.\n", psm_name
, ioapic_ix
,
1653 apic_last_ditch_reprogram_failures
++;
1660 * This function is protected by apic_ioapic_lock coupled with the
1661 * fact that interrupts are disabled.
1664 delete_defer_repro_ent(int which_irq
)
1666 ASSERT(which_irq
>= 0);
1667 ASSERT(which_irq
<= 255);
1668 ASSERT(LOCK_HELD(&apic_ioapic_lock
));
1670 if (apic_reprogram_info
[which_irq
].done
)
1673 apic_reprogram_info
[which_irq
].done
= B_TRUE
;
1676 apic_defer_repro_total_retries
+=
1677 apic_reprogram_info
[which_irq
].tries
;
1679 apic_defer_repro_successes
++;
1682 if (--apic_reprogram_outstanding
== 0) {
1684 setlvlx
= psm_intr_exit_fn();
1690 * Interrupts must be disabled during this function to prevent
1691 * self-deadlock. Interrupts are disabled because this function
1692 * is called from apic_check_stuck_interrupt(), which is called
1693 * from apic_rebind(), which requires its caller to disable interrupts.
1696 add_defer_repro_ent(apic_irq_t
*irq_ptr
, int which_irq
, int new_bind_cpu
)
1698 ASSERT(which_irq
>= 0);
1699 ASSERT(which_irq
<= 255);
1700 ASSERT(!interrupts_enabled());
1703 * On the off-chance that there's already a deferred
1704 * reprogramming on this irq, check, and if so, just update the
1705 * CPU and irq pointer to which the interrupt is targeted, then return.
1707 if (!apic_reprogram_info
[which_irq
].done
) {
1708 apic_reprogram_info
[which_irq
].bindcpu
= new_bind_cpu
;
1709 apic_reprogram_info
[which_irq
].irqp
= irq_ptr
;
1713 apic_reprogram_info
[which_irq
].irqp
= irq_ptr
;
1714 apic_reprogram_info
[which_irq
].bindcpu
= new_bind_cpu
;
1715 apic_reprogram_info
[which_irq
].tries
= 0;
1717 * This must be the last thing set, since we're not
1718 * grabbing any locks, apic_try_deferred_reprogram() will
1719 * make its decision about using this entry iff done
1722 apic_reprogram_info
[which_irq
].done
= B_FALSE
;
1725 * If there were previously no deferred reprogrammings, change
1726 * setlvlx to call apic_try_deferred_reprogram()
1728 if (++apic_reprogram_outstanding
== 1) {
1730 setlvlx
= apic_try_deferred_reprogram
;
1735 apic_try_deferred_reprogram(int prev_ipl
, int irq
)
1739 struct ioapic_reprogram_data
*drep
;
1741 (*psm_intr_exit_fn())(prev_ipl
, irq
);
1743 if (!lock_try(&apic_defer_reprogram_lock
)) {
1748 * Acquire the apic_ioapic_lock so that any other operations that
1749 * may affect the apic_reprogram_info state are serialized.
1750 * It's still possible for the last deferred reprogramming to clear
1751 * between the time we entered this function and the time we get to
1752 * the for loop below. In that case, *setlvlx will have been set
1753 * back to *_intr_exit and drep will be NULL. (There's no way to
1754 * stop that from happening -- we would need to grab a lock before
1755 * calling *setlvlx, which is neither realistic nor prudent).
1757 iflag
= intr_clear();
1758 lock_set(&apic_ioapic_lock
);
1761 * For each deferred RDT entry, try to reprogram it now. Note that
1762 * there is no lock acquisition to read apic_reprogram_info because
1763 * '.done' is set only after the other fields in the structure are set.
1767 for (reproirq
= 0; reproirq
<= APIC_MAX_VECTOR
; reproirq
++) {
1768 if (apic_reprogram_info
[reproirq
].done
== B_FALSE
) {
1769 drep
= &apic_reprogram_info
[reproirq
];
1775 * Either we found a deferred action to perform, or
1776 * we entered this function spuriously, after *setlvlx
1777 * was restored to point to *_intr_exit. Any other
1778 * permutation is invalid.
1780 ASSERT(drep
!= NULL
|| *setlvlx
== psm_intr_exit_fn());
1783 * Though we can't really do anything about errors
1784 * at this point, keep track of them for reporting.
1785 * Note that it is very possible for apic_setup_io_intr
1786 * to re-register this very timeout if the Remote IRR bit
1787 * has not yet cleared.
1792 if (apic_setup_io_intr(drep
, reproirq
, B_TRUE
) != 0) {
1793 apic_deferred_setup_failures
++;
1796 apic_deferred_spurious_enters
++;
1800 (void) apic_setup_io_intr(drep
, reproirq
, B_TRUE
);
1803 lock_clear(&apic_ioapic_lock
);
1804 intr_restore(iflag
);
1806 lock_clear(&apic_defer_reprogram_lock
);
1810 apic_ioapic_wait_pending_clear(int ioapic_ix
, int intin_no
)
1815 * Wait for the delivery pending bit to clear.
1817 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
, intin_no
) &
1818 (AV_LEVEL
|AV_PENDING
)) == (AV_LEVEL
|AV_PENDING
)) {
1821 * If we're still waiting on the delivery of this interrupt,
1822 * continue to wait here until it is delivered (this should be
1823 * a very small amount of time, but include a timeout just in
1826 for (waited
= 0; waited
< apic_max_reps_clear_pending
;
1828 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1829 intin_no
) & AV_PENDING
) == 0) {
1838 * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR
1839 * bit set. Calls functions that modify the function that setlvlx points to,
1840 * so that the reprogramming can be retried very shortly.
1842 * This function will mask the RDT entry if the interrupt is level-triggered.
1843 * (The caller is responsible for unmasking the RDT entry.)
1845 * Returns non-zero if the caller should defer IOAPIC reprogramming.
1848 apic_check_stuck_interrupt(apic_irq_t
*irq_ptr
, int old_bind_cpu
,
1849 int new_bind_cpu
, int ioapic_ix
, int intin_no
, int which_irq
,
1850 struct ioapic_reprogram_data
*drep
)
1857 * Wait for the delivery pending bit to clear.
1862 apic_ioapic_wait_pending_clear(ioapic_ix
, intin_no
);
1865 * Mask the RDT entry, but only if it's a level-triggered
1868 rdt_entry
= READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1870 if ((rdt_entry
& (AV_LEVEL
|AV_MASK
)) == AV_LEVEL
) {
1873 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
, intin_no
,
1874 AV_MASK
| rdt_entry
);
1877 if ((rdt_entry
& AV_LEVEL
) == AV_LEVEL
) {
1879 * If there was a race and an interrupt was injected
1880 * just before we masked, check for that case here.
1881 * Then, unmask the RDT entry and try again. If we're
1882 * on our last try, don't unmask (because we want the
1883 * RDT entry to remain masked for the rest of the
1886 rdt_entry
= READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1888 if ((rdt_entry
& AV_PENDING
) &&
1889 (reps
< apic_max_reps_clear_pending
)) {
1891 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1892 intin_no
, rdt_entry
& ~AV_MASK
);
1896 } while ((rdt_entry
& AV_PENDING
) &&
1897 (reps
< apic_max_reps_clear_pending
));
1900 if (rdt_entry
& AV_PENDING
)
1901 apic_intr_deliver_timeouts
++;
1905 * If the remote IRR bit is set, then the interrupt has been sent
1906 * to a CPU for processing. We have no choice but to wait for
1907 * that CPU to process the interrupt, at which point the remote IRR
1908 * bit will be cleared.
1910 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
, intin_no
) &
1911 (AV_LEVEL
|AV_REMOTE_IRR
)) == (AV_LEVEL
|AV_REMOTE_IRR
)) {
1914 * If the CPU that this RDT is bound to is NOT the current
1915 * CPU, wait until that CPU handles the interrupt and ACKs
1916 * it. If this interrupt is not bound to any CPU (that is,
1917 * if it's bound to the logical destination of "anyone"), it
1918 * may have been delivered to the current CPU so handle that
1919 * case by deferring the reprogramming (below).
1921 if ((old_bind_cpu
!= IRQ_UNBOUND
) &&
1922 (old_bind_cpu
!= IRQ_UNINIT
) &&
1923 (old_bind_cpu
!= psm_get_cpu_id())) {
1924 for (waited
= 0; waited
< apic_max_reps_clear_pending
;
1926 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix
,
1927 intin_no
) & AV_REMOTE_IRR
) == 0) {
1929 delete_defer_repro_ent(which_irq
);
1931 /* Remote IRR has cleared! */
1938 * If we waited and the Remote IRR bit is still not cleared,
1939 * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS
1940 * times for this interrupt, try the last-ditch workaround:
1942 if (drep
&& drep
->tries
>= APIC_REPROGRAM_MAX_TRIES
) {
1944 apic_last_ditch_clear_remote_irr(ioapic_ix
, intin_no
);
1946 /* Mark this one as reprogrammed: */
1947 delete_defer_repro_ent(which_irq
);
1952 apic_intr_deferrals
++;
1956 * If waiting for the Remote IRR bit (above) didn't
1957 * allow it to clear, defer the reprogramming.
1958 * Add a new deferred-programming entry if the
1959 * caller passed a NULL one (and update the existing one
1960 * in case anything changed).
1962 add_defer_repro_ent(irq_ptr
, which_irq
, new_bind_cpu
);
1966 /* Inform caller to defer IOAPIC programming: */
1972 /* Remote IRR is clear */
1973 delete_defer_repro_ent(which_irq
);
1979 * Called to migrate all interrupts at an irq to another cpu.
1980 * Must be called with interrupts disabled and apic_ioapic_lock held
1983 apic_rebind_all(apic_irq_t
*irq_ptr
, int bind_cpu
)
1985 apic_irq_t
*irqptr
= irq_ptr
;
1989 if (irqptr
->airq_temp_cpu
!= IRQ_UNINIT
)
1990 retval
|= apic_rebind(irqptr
, bind_cpu
, NULL
);
1991 irqptr
= irqptr
->airq_next
;
1998 * apic_intr_redistribute does all the messy computations for identifying
1999 * which interrupt to move to which CPU. Currently we do just one interrupt
2000 * at a time. This reduces the time we spent doing all this within clock
2001 * interrupt. When it is done in idle, we could do more than 1.
2002 * First we find the most busy and the most free CPU (time in ISR only)
2003 * skipping those CPUs that has been identified as being ineligible (cpu_skip)
2004 * Then we look for IRQs which are closest to the difference between the
2005 * most busy CPU and the average ISR load. We try to find one whose load
2006 * is less than difference.If none exists, then we chose one larger than the
2007 * difference, provided it does not make the most idle CPU worse than the
2008 * most busy one. In the end, we clear all the busy fields for CPUs. For
2009 * IRQs, they are cleared as they are scanned.
2012 apic_intr_redistribute(void)
2014 int busiest_cpu
, most_free_cpu
;
2015 int cpu_free
, cpu_busy
, max_busy
, min_busy
;
2017 int average_busy
, cpus_online
;
2020 apic_cpus_info_t
*cpu_infop
;
2021 apic_irq_t
*min_busy_irq
= NULL
;
2022 apic_irq_t
*max_busy_irq
= NULL
;
2024 busiest_cpu
= most_free_cpu
= -1;
2025 cpu_free
= cpu_busy
= max_busy
= average_busy
= 0;
2026 min_free
= apic_sample_factor_redistribution
;
2029 * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu
2030 * without ioapic_lock. That is OK as we are just doing statistical
2031 * sampling anyway and any inaccuracy now will get corrected next time
2032 * The call to rebind which actually changes things will make sure
2033 * we are consistent.
2035 for (i
= 0; i
< apic_nproc
; i
++) {
2036 if (apic_cpu_in_range(i
) &&
2037 !(apic_redist_cpu_skip
& (1 << i
)) &&
2038 (apic_cpus
[i
].aci_status
& APIC_CPU_INTR_ENABLE
)) {
2040 cpu_infop
= &apic_cpus
[i
];
2042 * If no unbound interrupts or only 1 total on this
2045 if (!cpu_infop
->aci_temp_bound
||
2046 (cpu_infop
->aci_bound
+ cpu_infop
->aci_temp_bound
)
2048 apic_redist_cpu_skip
|= 1 << i
;
2052 busy
= cpu_infop
->aci_busy
;
2053 average_busy
+= busy
;
2055 if (max_busy
< busy
) {
2059 if (min_free
> busy
) {
2063 if (busy
> apic_int_busy_mark
) {
2066 if (busy
< apic_int_free_mark
)
2071 if ((cpu_busy
&& cpu_free
) ||
2072 (max_busy
>= (min_free
+ apic_diff_for_redistribution
))) {
2074 apic_num_imbalance
++;
2076 if (apic_verbose
& APIC_VERBOSE_IOAPIC_FLAG
) {
2078 "redistribute busy=%x free=%x max=%x min=%x",
2079 cpu_busy
, cpu_free
, max_busy
, min_free
);
2084 average_busy
/= cpus_online
;
2086 diff
= max_busy
- average_busy
;
2087 min_busy
= max_busy
; /* start with the max possible value */
2089 min_busy_irq
= max_busy_irq
= NULL
;
2090 i
= apic_min_device_irq
;
2091 for (; i
<= apic_max_device_irq
; i
++) {
2092 apic_irq_t
*irq_ptr
;
2093 /* Change to linked list per CPU ? */
2094 if ((irq_ptr
= apic_irq_table
[i
]) == NULL
)
2096 /* Check for irq_busy & decide which one to move */
2097 /* Also zero them for next round */
2098 if ((irq_ptr
->airq_temp_cpu
== busiest_cpu
) &&
2099 irq_ptr
->airq_busy
) {
2100 if (irq_ptr
->airq_busy
< diff
) {
2102 * Check for least busy CPU,
2103 * best fit or what ?
2105 if (max_busy
< irq_ptr
->airq_busy
) {
2107 * Most busy within the
2108 * required differential
2110 max_busy
= irq_ptr
->airq_busy
;
2111 max_busy_irq
= irq_ptr
;
2114 if (min_busy
> irq_ptr
->airq_busy
) {
2116 * least busy, but more than
2120 (diff
+ average_busy
-
2123 * Making sure new cpu
2130 min_busy_irq
= irq_ptr
;
2135 irq_ptr
->airq_busy
= 0;
2138 if (max_busy_irq
!= NULL
) {
2140 if (apic_verbose
& APIC_VERBOSE_IOAPIC_FLAG
) {
2141 prom_printf("rebinding %x to %x",
2142 max_busy_irq
->airq_vector
, most_free_cpu
);
2145 iflag
= intr_clear();
2146 if (lock_try(&apic_ioapic_lock
)) {
2147 if (apic_rebind_all(max_busy_irq
,
2148 most_free_cpu
) == 0) {
2149 /* Make change permenant */
2150 max_busy_irq
->airq_cpu
=
2151 (uint32_t)most_free_cpu
;
2153 lock_clear(&apic_ioapic_lock
);
2155 intr_restore(iflag
);
2157 } else if (min_busy_irq
!= NULL
) {
2159 if (apic_verbose
& APIC_VERBOSE_IOAPIC_FLAG
) {
2160 prom_printf("rebinding %x to %x",
2161 min_busy_irq
->airq_vector
, most_free_cpu
);
2165 iflag
= intr_clear();
2166 if (lock_try(&apic_ioapic_lock
)) {
2167 if (apic_rebind_all(min_busy_irq
,
2168 most_free_cpu
) == 0) {
2169 /* Make change permenant */
2170 min_busy_irq
->airq_cpu
=
2171 (uint32_t)most_free_cpu
;
2173 lock_clear(&apic_ioapic_lock
);
2175 intr_restore(iflag
);
2178 if (cpu_busy
!= (1 << busiest_cpu
)) {
2179 apic_redist_cpu_skip
|= 1 << busiest_cpu
;
2181 * We leave cpu_skip set so that next time we
2182 * can choose another cpu
2189 * found nothing. Could be that we skipped over valid CPUs
2190 * or we have balanced everything. If we had a variable
2191 * ticks_for_redistribution, it could be increased here.
2192 * apic_int_busy, int_free etc would also need to be
2195 if (apic_redist_cpu_skip
)
2196 apic_redist_cpu_skip
= 0;
2198 for (i
= 0; i
< apic_nproc
; i
++) {
2199 if (apic_cpu_in_range(i
)) {
2200 apic_cpus
[i
].aci_busy
= 0;
2206 apic_cleanup_busy(void)
2209 apic_irq_t
*irq_ptr
;
2211 for (i
= 0; i
< apic_nproc
; i
++) {
2212 if (apic_cpu_in_range(i
)) {
2213 apic_cpus
[i
].aci_busy
= 0;
2217 for (i
= apic_min_device_irq
; i
<= apic_max_device_irq
; i
++) {
2218 if ((irq_ptr
= apic_irq_table
[i
]) != NULL
)
2219 irq_ptr
->airq_busy
= 0;