4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/types.h>
27 #include <sys/systm.h>
29 #include <sys/sysmacros.h>
30 #include <sys/archsystm.h>
31 #include <sys/vmsystm.h>
32 #include <sys/machparam.h>
33 #include <sys/machsystm.h>
34 #include <sys/machthread.h>
37 #include <sys/elf_SPARC.h>
38 #include <vm/vm_dep.h>
39 #include <vm/hat_sfmmu.h>
40 #include <vm/seg_kpm.h>
41 #include <sys/cpuvar.h>
42 #include <sys/cheetahregs.h>
43 #include <sys/us3_module.h>
44 #include <sys/async.h>
45 #include <sys/cmn_err.h>
46 #include <sys/debug.h>
47 #include <sys/dditypes.h>
48 #include <sys/prom_debug.h>
49 #include <sys/prom_plat.h>
50 #include <sys/cpu_module.h>
51 #include <sys/sysmacros.h>
52 #include <sys/intreg.h>
53 #include <sys/clock.h>
54 #include <sys/platform_module.h>
55 #include <sys/machtrap.h>
56 #include <sys/ontrap.h>
57 #include <sys/panic.h>
58 #include <sys/memlist.h>
59 #include <sys/bootconf.h>
60 #include <sys/ivintr.h>
61 #include <sys/atomic.h>
62 #include <sys/taskq.h>
64 #include <sys/ndifm.h>
65 #include <sys/ddifm.h>
66 #include <sys/fm/protocol.h>
67 #include <sys/fm/util.h>
68 #include <sys/fm/cpu/UltraSPARC-III.h>
69 #include <sys/fpras_impl.h>
70 #include <sys/dtrace.h>
71 #include <sys/watchpoint.h>
72 #include <sys/plat_ecc_unum.h>
73 #include <sys/cyclic.h>
74 #include <sys/errorq.h>
75 #include <sys/errclassify.h>
77 #include <sys/clock_impl.h>
79 #ifdef CHEETAHPLUS_ERRATUM_25
80 #include <sys/xc_impl.h>
81 #endif /* CHEETAHPLUS_ERRATUM_25 */
83 ch_cpu_logout_t clop_before_flush
;
84 ch_cpu_logout_t clop_after_flush
;
85 uint_t flush_retries_done
= 0;
87 * Note that 'Cheetah PRM' refers to:
88 * SPARC V9 JPS1 Implementation Supplement: Sun UltraSPARC-III
92 * Per CPU pointers to physical address of TL>0 logout data areas.
93 * These pointers have to be in the kernel nucleus to avoid MMU
96 uint64_t ch_err_tl1_paddrs
[NCPU
];
99 * One statically allocated structure to use during startup/DR
100 * to prevent unnecessary panics.
102 ch_err_tl1_data_t ch_err_tl1_data
;
105 * Per CPU pending error at TL>0, used by level15 softint handler
107 uchar_t ch_err_tl1_pending
[NCPU
];
110 * For deferred CE re-enable after trap.
112 taskq_t
*ch_check_ce_tq
;
115 * Internal functions.
117 static int cpu_async_log_err(void *flt
, errorq_elem_t
*eqep
);
118 static void cpu_log_diag_info(ch_async_flt_t
*ch_flt
);
119 static void cpu_queue_one_event(ch_async_flt_t
*ch_flt
, char *reason
,
120 ecc_type_to_info_t
*eccp
, ch_diag_data_t
*cdp
);
121 static int cpu_flt_in_memory_one_event(ch_async_flt_t
*ch_flt
,
122 uint64_t t_afsr_bit
);
123 static int clear_ecc(struct async_flt
*ecc
);
124 #if defined(CPU_IMP_ECACHE_ASSOC)
125 static int cpu_ecache_line_valid(ch_async_flt_t
*ch_flt
);
127 int cpu_ecache_set_size(struct cpu
*cp
);
128 static int cpu_ectag_line_invalid(int cachesize
, uint64_t tag
);
129 int cpu_ectag_pa_to_subblk(int cachesize
, uint64_t subaddr
);
130 uint64_t cpu_ectag_to_pa(int setsize
, uint64_t tag
);
131 int cpu_ectag_pa_to_subblk_state(int cachesize
,
132 uint64_t subaddr
, uint64_t tag
);
133 static void cpu_flush_ecache_line(ch_async_flt_t
*ch_flt
);
134 static int afsr_to_afar_status(uint64_t afsr
, uint64_t afsr_bit
);
135 static int afsr_to_esynd_status(uint64_t afsr
, uint64_t afsr_bit
);
136 static int afsr_to_msynd_status(uint64_t afsr
, uint64_t afsr_bit
);
137 static int afsr_to_synd_status(uint_t cpuid
, uint64_t afsr
, uint64_t afsr_bit
);
138 static int synd_to_synd_code(int synd_status
, ushort_t synd
, uint64_t afsr_bit
);
139 static int cpu_get_mem_unum_synd(int synd_code
, struct async_flt
*, char *buf
);
140 static void cpu_uninit_ecache_scrub_dr(struct cpu
*cp
);
141 static void cpu_scrubphys(struct async_flt
*aflt
);
142 static void cpu_payload_add_aflt(struct async_flt
*, nvlist_t
*, nvlist_t
*,
144 static void cpu_payload_add_ecache(struct async_flt
*, nvlist_t
*);
145 static void cpu_ereport_init(struct async_flt
*aflt
);
146 static int cpu_check_secondary_errors(ch_async_flt_t
*, uint64_t, uint64_t);
147 static uint8_t cpu_flt_bit_to_plat_error(struct async_flt
*aflt
);
148 static void cpu_log_fast_ecc_error(caddr_t tpc
, int priv
, int tl
, uint64_t ceen
,
149 uint64_t nceen
, ch_cpu_logout_t
*clop
);
150 static int cpu_ce_delayed_ec_logout(uint64_t);
151 static int cpu_matching_ecache_line(uint64_t, void *, int, int *);
152 static int cpu_error_is_ecache_data(int, uint64_t);
153 static void cpu_fmri_cpu_set(nvlist_t
*, int);
154 static int cpu_error_to_resource_type(struct async_flt
*aflt
);
156 #ifdef CHEETAHPLUS_ERRATUM_25
157 static int mondo_recover_proc(uint16_t, int);
158 static void cheetah_nudge_init(void);
159 static void cheetah_nudge_onln(void *arg
, cpu_t
*cpu
, cyc_handler_t
*hdlr
,
161 static void cheetah_nudge_buddy(void);
162 #endif /* CHEETAHPLUS_ERRATUM_25 */
164 #if defined(CPU_IMP_L1_CACHE_PARITY)
165 static void cpu_dcache_parity_info(ch_async_flt_t
*ch_flt
);
166 static void cpu_dcache_parity_check(ch_async_flt_t
*ch_flt
, int index
);
167 static void cpu_record_dc_data_parity(ch_async_flt_t
*ch_flt
,
168 ch_dc_data_t
*dest_dcp
, ch_dc_data_t
*src_dcp
, int way
, int word
);
169 static void cpu_icache_parity_info(ch_async_flt_t
*ch_flt
);
170 static void cpu_icache_parity_check(ch_async_flt_t
*ch_flt
, int index
);
171 static void cpu_pcache_parity_info(ch_async_flt_t
*ch_flt
);
172 static void cpu_pcache_parity_check(ch_async_flt_t
*ch_flt
, int index
);
173 static void cpu_payload_add_dcache(struct async_flt
*, nvlist_t
*);
174 static void cpu_payload_add_icache(struct async_flt
*, nvlist_t
*);
175 #endif /* CPU_IMP_L1_CACHE_PARITY */
177 int (*p2get_mem_info
)(int synd_code
, uint64_t paddr
,
178 uint64_t *mem_sizep
, uint64_t *seg_sizep
, uint64_t *bank_sizep
,
179 int *segsp
, int *banksp
, int *mcidp
);
182 * This table is used to determine which bit(s) is(are) bad when an ECC
183 * error occurs. The array is indexed by an 9-bit syndrome. The entries
184 * of this array have the following semantics:
186 * 00-127 The number of the bad bit, when only one bit is bad.
187 * 128 ECC bit C0 is bad.
188 * 129 ECC bit C1 is bad.
189 * 130 ECC bit C2 is bad.
190 * 131 ECC bit C3 is bad.
191 * 132 ECC bit C4 is bad.
192 * 133 ECC bit C5 is bad.
193 * 134 ECC bit C6 is bad.
194 * 135 ECC bit C7 is bad.
195 * 136 ECC bit C8 is bad.
196 * 137-143 reserved for Mtag Data and ECC.
197 * 144(M2) Two bits are bad within a nibble.
198 * 145(M3) Three bits are bad within a nibble.
199 * 146(M3) Four bits are bad within a nibble.
200 * 147(M) Multiple bits (5 or more) are bad.
201 * 148 NO bits are bad.
202 * Based on "Cheetah Programmer's Reference Manual" rev 1.1, Tables 11-4,11-5.
214 #define MT0 137 /* Mtag Data bit 0 */
217 #define MTC0 140 /* Mtag Check bit 0 */
226 #if defined(JALAPENO) || defined(SERRANO)
227 #define S003 149 /* Syndrome 0x003 => likely from CPU/EDU:ST/FRU/BP */
228 #define S003MEM 150 /* Syndrome 0x003 => likely from WDU/WBP */
229 #define SLAST S003MEM /* last special syndrome */
230 #else /* JALAPENO || SERRANO */
231 #define S003 149 /* Syndrome 0x003 => likely from EDU:ST */
232 #define S071 150 /* Syndrome 0x071 => likely from WDU/CPU */
233 #define S11C 151 /* Syndrome 0x11c => likely from BERR/DBERR */
234 #define SLAST S11C /* last special syndrome */
235 #endif /* JALAPENO || SERRANO */
236 #if defined(JALAPENO) || defined(SERRANO)
237 #define BPAR0 152 /* syndrom 152 through 167 for bus parity */
239 #endif /* JALAPENO || SERRANO */
241 static uint8_t ecc_syndrome_tab
[] =
243 NA
, C0
, C1
, S003
, C2
, M2
, M3
, 47, C3
, M2
, M2
, 53, M2
, 41, 29, M
,
244 C4
, M
, M
, 50, M2
, 38, 25, M2
, M2
, 33, 24, M2
, 11, M
, M2
, 16,
245 C5
, M
, M
, 46, M2
, 37, 19, M2
, M
, 31, 32, M
, 7, M2
, M2
, 10,
246 M2
, 40, 13, M2
, 59, M
, M2
, 66, M
, M2
, M2
, 0, M2
, 67, 71, M
,
247 C6
, M
, M
, 43, M
, 36, 18, M
, M2
, 49, 15, M
, 63, M2
, M2
, 6,
248 M2
, 44, 28, M2
, M
, M2
, M2
, 52, 68, M2
, M2
, 62, M2
, M3
, M3
, M4
,
249 M2
, 26, 106, M2
, 64, M
, M2
, 2, 120, M
, M2
, M3
, M
, M3
, M3
, M4
,
250 #if defined(JALAPENO) || defined(SERRANO)
251 116, M2
, M2
, M3
, M2
, M3
, M
, M4
, M2
, 58, 54, M2
, M
, M4
, M4
, M3
,
252 #else /* JALAPENO || SERRANO */
253 116, S071
, M2
, M3
, M2
, M3
, M
, M4
, M2
, 58, 54, M2
, M
, M4
, M4
, M3
,
254 #endif /* JALAPENO || SERRANO */
255 C7
, M2
, M
, 42, M
, 35, 17, M2
, M
, 45, 14, M2
, 21, M2
, M2
, 5,
256 M
, 27, M
, M
, 99, M
, M
, 3, 114, M2
, M2
, 20, M2
, M3
, M3
, M
,
257 M2
, 23, 113, M2
, 112, M2
, M
, 51, 95, M
, M2
, M3
, M2
, M3
, M3
, M2
,
258 103, M
, M2
, M3
, M2
, M3
, M3
, M4
, M2
, 48, M
, M
, 73, M2
, M
, M3
,
259 M2
, 22, 110, M2
, 109, M2
, M
, 9, 108, M2
, M
, M3
, M2
, M3
, M3
, M
,
260 102, M2
, M
, M
, M2
, M3
, M3
, M
, M2
, M3
, M3
, M2
, M
, M4
, M
, M3
,
261 98, M
, M2
, M3
, M2
, M
, M3
, M4
, M2
, M3
, M3
, M4
, M3
, M
, M
, M
,
262 M2
, M3
, M3
, M
, M3
, M
, M
, M
, 56, M4
, M
, M3
, M4
, M
, M
, M
,
263 C8
, M
, M2
, 39, M
, 34, 105, M2
, M
, 30, 104, M
, 101, M
, M
, 4,
264 #if defined(JALAPENO) || defined(SERRANO)
265 M
, M
, 100, M
, 83, M
, M2
, 12, 87, M
, M
, 57, M2
, M
, M3
, M
,
266 #else /* JALAPENO || SERRANO */
267 M
, M
, 100, M
, 83, M
, M2
, 12, 87, M
, M
, 57, S11C
, M
, M3
, M
,
268 #endif /* JALAPENO || SERRANO */
269 M2
, 97, 82, M2
, 78, M2
, M2
, 1, 96, M
, M
, M
, M
, M
, M3
, M2
,
270 94, M
, M2
, M3
, M2
, M
, M3
, M
, M2
, M
, 79, M
, 69, M
, M4
, M
,
271 M2
, 93, 92, M
, 91, M
, M2
, 8, 90, M2
, M2
, M
, M
, M
, M
, M4
,
272 89, M
, M
, M3
, M2
, M3
, M3
, M
, M
, M
, M3
, M2
, M3
, M2
, M
, M3
,
273 86, M
, M2
, M3
, M2
, M
, M3
, M
, M2
, M
, M3
, M
, M3
, M
, M
, M3
,
274 M
, M
, M3
, M2
, M3
, M2
, M4
, M
, 60, M
, M2
, M3
, M4
, M
, M
, M2
,
275 M2
, 88, 85, M2
, 84, M
, M2
, 55, 81, M2
, M2
, M3
, M2
, M3
, M3
, M4
,
276 77, M
, M
, M
, M2
, M3
, M
, M
, M2
, M3
, M3
, M4
, M3
, M2
, M
, M
,
277 74, M
, M2
, M3
, M
, M
, M3
, M
, M
, M
, M3
, M
, M3
, M
, M4
, M3
,
278 M2
, 70, 107, M4
, 65, M2
, M2
, M
, 127, M
, M
, M
, M2
, M3
, M3
, M
,
279 80, M2
, M2
, 72, M
, 119, 118, M
, M2
, 126, 76, M
, 125, M
, M4
, M3
,
280 M2
, 115, 124, M
, 75, M
, M
, M3
, 61, M
, M4
, M
, M4
, M
, M
, M
,
281 M
, 123, 122, M4
, 121, M4
, M
, M3
, 117, M2
, M2
, M3
, M4
, M3
, M
, M
,
282 111, M
, M
, M
, M4
, M3
, M3
, M
, M
, M
, M3
, M
, M3
, M2
, M
, M
285 #define ESYND_TBL_SIZE (sizeof (ecc_syndrome_tab) / sizeof (uint8_t))
287 #if !(defined(JALAPENO) || defined(SERRANO))
289 * This table is used to determine which bit(s) is(are) bad when a Mtag
290 * error occurs. The array is indexed by an 4-bit ECC syndrome. The entries
291 * of this array have the following semantics:
293 * -1 Invalid mtag syndrome.
294 * 137 Mtag Data 0 is bad.
295 * 138 Mtag Data 1 is bad.
296 * 139 Mtag Data 2 is bad.
297 * 140 Mtag ECC 0 is bad.
298 * 141 Mtag ECC 1 is bad.
299 * 142 Mtag ECC 2 is bad.
300 * 143 Mtag ECC 3 is bad.
301 * Based on "Cheetah Programmer's Reference Manual" rev 1.1, Tables 11-6.
303 short mtag_syndrome_tab
[] =
305 NA
, MTC0
, MTC1
, M2
, MTC2
, M2
, M2
, MT0
, MTC3
, M2
, M2
, MT1
, M2
, MT2
, M2
, M2
308 #define MSYND_TBL_SIZE (sizeof (mtag_syndrome_tab) / sizeof (short))
310 #else /* !(JALAPENO || SERRANO) */
312 #define BSYND_TBL_SIZE 16
314 #endif /* !(JALAPENO || SERRANO) */
317 * Virtual Address bit flag in the data cache. This is actually bit 2 in the
320 #define VA13 INT64_C(0x0000000000000002)
323 * Types returned from cpu_error_to_resource_type()
325 #define ERRTYPE_UNKNOWN 0
326 #define ERRTYPE_CPU 1
327 #define ERRTYPE_MEMORY 2
328 #define ERRTYPE_ECACHE_DATA 3
331 * CE initial classification and subsequent action lookup table
333 static ce_dispact_t ce_disp_table
[CE_INITDISPTBL_SIZE
];
334 static int ce_disp_inited
;
337 * Set to disable leaky and partner check for memory correctables
342 * The following are not incremented atomically so are indicative only
344 static int ce_xdiag_drops
;
345 static int ce_xdiag_lkydrops
;
346 static int ce_xdiag_ptnrdrops
;
347 static int ce_xdiag_bad
;
350 * CE leaky check callback structure
353 struct async_flt
*lkycb_aflt
;
355 errorq_elem_t
*lkycb_eqep
;
359 * defines for various ecache_flush_flag's
361 #define ECACHE_FLUSH_LINE 1
362 #define ECACHE_FLUSH_ALL 2
367 #define STICK_ITERATION 10
375 static int64_t stick_iter
= STICK_ITERATION
;
376 static int64_t stick_tsk
= MAX_TSKEW
;
385 static volatile event_cmd_t stick_sync_cmd
= EVENT_NULL
;
386 static int64_t timestamp
[EVENTS
];
387 static volatile int slave_done
;
390 #define DSYNC_ATTEMPTS 64
392 int64_t skew_val
[DSYNC_ATTEMPTS
];
395 ss_t stick_sync_stats
[NCPU
];
398 uint_t cpu_impl_dual_pgsz
= 0;
399 #if defined(CPU_IMP_DUAL_PAGESIZE)
400 uint_t disable_dual_pgsz
= 0;
401 #endif /* CPU_IMP_DUAL_PAGESIZE */
404 * Save the cache bootup state for use when internal
405 * caches are to be re-enabled after an error occurs.
407 uint64_t cache_boot_state
;
410 * PA[22:0] represent Displacement in Safari configuration space.
412 uint_t root_phys_addr_lo_mask
= 0x7fffffu
;
414 bus_config_eclk_t bus_config_eclk
[] = {
415 #if defined(JALAPENO) || defined(SERRANO)
416 {JBUS_CONFIG_ECLK_1_DIV
, JBUS_CONFIG_ECLK_1
},
417 {JBUS_CONFIG_ECLK_2_DIV
, JBUS_CONFIG_ECLK_2
},
418 {JBUS_CONFIG_ECLK_32_DIV
, JBUS_CONFIG_ECLK_32
},
419 #else /* JALAPENO || SERRANO */
420 {SAFARI_CONFIG_ECLK_1_DIV
, SAFARI_CONFIG_ECLK_1
},
421 {SAFARI_CONFIG_ECLK_2_DIV
, SAFARI_CONFIG_ECLK_2
},
422 {SAFARI_CONFIG_ECLK_32_DIV
, SAFARI_CONFIG_ECLK_32
},
423 #endif /* JALAPENO || SERRANO */
428 * Interval for deferred CEEN reenable
430 int cpu_ceen_delay_secs
= CPU_CEEN_DELAY_SECS
;
433 * set in /etc/system to control logging of user BERR/TO's
435 int cpu_berr_to_verbose
= 0;
438 * set to 0 in /etc/system to defer CEEN reenable for all CEs
440 uint64_t cpu_ce_not_deferred
= CPU_CE_NOT_DEFERRED
;
441 uint64_t cpu_ce_not_deferred_ext
= CPU_CE_NOT_DEFERRED_EXT
;
444 * Set of all offline cpus
446 cpuset_t cpu_offline_set
;
448 static void cpu_delayed_check_ce_errors(void *);
449 static void cpu_check_ce_errors(void *);
450 void cpu_error_ecache_flush(ch_async_flt_t
*);
451 static int cpu_error_ecache_flush_required(ch_async_flt_t
*);
452 static void cpu_log_and_clear_ce(ch_async_flt_t
*);
453 void cpu_ce_detected(ch_cpu_errors_t
*, int);
456 * CE Leaky check timeout in microseconds. This is chosen to be twice the
457 * memory refresh interval of current DIMMs (64ms). After initial fix that
458 * gives at least one full refresh cycle in which the cell can leak
459 * (whereafter further refreshes simply reinforce any incorrect bit value).
461 clock_t cpu_ce_lkychk_timeout_usec
= 128000;
464 * CE partner check partner caching period in seconds
466 int cpu_ce_ptnr_cachetime_sec
= 60;
469 * Sets trap table entry ttentry by overwriting eight instructions from ttlabel
471 #define CH_SET_TRAP(ttentry, ttlabel) \
472 bcopy((const void *)&ttlabel, &ttentry, 32); \
473 flush_instr_mem((caddr_t)&ttentry, 32);
475 static int min_ecache_size
;
476 static uint_t priv_hcl_1
;
477 static uint_t priv_hcl_2
;
478 static uint_t priv_hcl_4
;
479 static uint_t priv_hcl_8
;
485 extern int cpc_has_overflow_intr
;
488 * Setup chip-specific trap handlers.
492 cache
|= (CACHE_VAC
| CACHE_PTAG
| CACHE_IOCOHERENT
);
494 at_flags
= EF_SPARC_32PLUS
| EF_SPARC_SUN_US1
| EF_SPARC_SUN_US3
;
497 * save the cache bootup state.
499 cache_boot_state
= get_dcu() & DCU_CACHE
;
502 * Due to the number of entries in the fully-associative tlb
503 * this may have to be tuned lower than in spitfire.
505 pp_slots
= MIN(8, MAXPP_SLOTS
);
508 * Block stores do not invalidate all pages of the d$, pagecopy
509 * et. al. need virtual translations with virtual coloring taken
510 * into consideration. prefetch/ldd will pollute the d$ on the
513 pp_consistent_coloring
= PPAGE_STORE_VCOLORING
| PPAGE_LOADS_POLLUTE
;
515 if (use_page_coloring
) {
520 "sparcv9+vis2 sparcv9+vis sparcv9 "
521 "sparcv8plus+vis2 sparcv8plus+vis sparcv8plus "
522 "sparcv8 sparcv8-fsmuld sparcv7 sparc";
525 * On Panther-based machines, this should
526 * also include AV_SPARC_POPC too
528 cpu_hwcap_flags
= AV_SPARC_VIS
| AV_SPARC_VIS2
;
531 * On cheetah, there's no hole in the virtual address space
533 hole_start
= hole_end
= 0;
536 * The kpm mapping window.
538 * The size of a single kpm range.
539 * The overall size will be: kpm_size * vac_colors.
541 * The virtual start address of the kpm range within the kernel
542 * virtual address space. kpm_vbase has to be kpm_size aligned.
544 kpm_size
= (size_t)(8ull * 1024 * 1024 * 1024 * 1024); /* 8TB */
546 kpm_vbase
= (caddr_t
)0x8000000000000000ull
; /* 8EB */
550 * The traptrace code uses either %tick or %stick for
551 * timestamping. We have %stick so we can use it.
553 traptrace_use_stick
= 1;
556 * Cheetah has a performance counter overflow interrupt
558 cpc_has_overflow_intr
= 1;
560 #if defined(CPU_IMP_DUAL_PAGESIZE)
562 * Use Cheetah+ and later dual page size support.
564 if (!disable_dual_pgsz
) {
565 cpu_impl_dual_pgsz
= 1;
567 #endif /* CPU_IMP_DUAL_PAGESIZE */
570 * Declare that this architecture/cpu combination does fpRAS.
572 fpras_implemented
= 1;
575 * Setup CE lookup table
577 CE_INITDISPTBL_POPULATE(ce_disp_table
);
582 * Called by setcpudelay
585 cpu_init_tick_freq(void)
588 * For UltraSPARC III and beyond we want to use the
589 * system clock rate as the basis for low level timing,
590 * due to support of mixed speed CPUs and power managment.
592 if (system_clock_freq
== 0)
593 cmn_err(CE_PANIC
, "setcpudelay: invalid system_clock_freq");
595 sys_tick_freq
= system_clock_freq
;
598 #ifdef CHEETAHPLUS_ERRATUM_25
602 int cheetah_bpe_off
= 0;
603 int cheetah_sendmondo_recover
= 1;
604 int cheetah_sendmondo_fullscan
= 0;
605 int cheetah_sendmondo_recover_delay
= 5;
607 #define CHEETAH_LIVELOCK_MIN_DELAY 1
610 * Recovery Statistics
612 typedef struct cheetah_livelock_entry
{
613 int cpuid
; /* fallen cpu */
614 int buddy
; /* cpu that ran recovery */
615 clock_t lbolt
; /* when recovery started */
616 hrtime_t recovery_time
; /* time spent in recovery */
617 } cheetah_livelock_entry_t
;
619 #define CHEETAH_LIVELOCK_NENTRY 32
621 cheetah_livelock_entry_t cheetah_livelock_hist
[CHEETAH_LIVELOCK_NENTRY
];
622 int cheetah_livelock_entry_nxt
;
624 #define CHEETAH_LIVELOCK_ENTRY_NEXT(statp) { \
625 statp = cheetah_livelock_hist + cheetah_livelock_entry_nxt; \
626 if (++cheetah_livelock_entry_nxt >= CHEETAH_LIVELOCK_NENTRY) { \
627 cheetah_livelock_entry_nxt = 0; \
631 #define CHEETAH_LIVELOCK_ENTRY_SET(statp, item, val) statp->item = val
634 hrtime_t hrt
; /* maximum recovery time */
635 int recovery
; /* recovered */
636 int full_claimed
; /* maximum pages claimed in full recovery */
637 int proc_entry
; /* attempted to claim TSB */
638 int proc_tsb_scan
; /* tsb scanned */
639 int proc_tsb_partscan
; /* tsb partially scanned */
640 int proc_tsb_fullscan
; /* whole tsb scanned */
641 int proc_claimed
; /* maximum pages claimed in tsb scan */
642 int proc_user
; /* user thread */
643 int proc_kernel
; /* kernel thread */
644 int proc_onflt
; /* bad stack */
645 int proc_cpu
; /* null cpu */
646 int proc_thread
; /* null thread */
647 int proc_proc
; /* null proc */
648 int proc_as
; /* null as */
649 int proc_hat
; /* null hat */
650 int proc_hat_inval
; /* hat contents don't make sense */
651 int proc_hat_busy
; /* hat is changing TSBs */
652 int proc_tsb_reloc
; /* TSB skipped because being relocated */
653 int proc_cnum_bad
; /* cnum out of range */
654 int proc_cnum
; /* last cnum processed */
655 tte_t proc_tte
; /* last tte processed */
656 } cheetah_livelock_stat
;
658 #define CHEETAH_LIVELOCK_STAT(item) cheetah_livelock_stat.item++
660 #define CHEETAH_LIVELOCK_STATSET(item, value) \
661 cheetah_livelock_stat.item = value
663 #define CHEETAH_LIVELOCK_MAXSTAT(item, value) { \
664 if (value > cheetah_livelock_stat.item) \
665 cheetah_livelock_stat.item = value; \
669 * Attempt to recover a cpu by claiming every cache line as saved
670 * in the TSB that the non-responsive cpu is using. Since we can't
671 * grab any adaptive lock, this is at best an attempt to do so. Because
672 * we don't grab any locks, we must operate under the protection of
675 * Return 1 if cpuid could be recovered, 0 if failed.
678 mondo_recover_proc(uint16_t cpuid
, int bn
)
687 struct tsb_info
*tsbinfop
;
693 u_longlong_t pahi
, palo
;
694 int pages_claimed
= 0;
696 int tried_kernel_tsb
= 0;
699 CHEETAH_LIVELOCK_STAT(proc_entry
);
701 if (on_fault(&ljb
)) {
702 CHEETAH_LIVELOCK_STAT(proc_onflt
);
706 if ((cp
= cpu
[cpuid
]) == NULL
) {
707 CHEETAH_LIVELOCK_STAT(proc_cpu
);
711 if ((t
= cp
->cpu_thread
) == NULL
) {
712 CHEETAH_LIVELOCK_STAT(proc_thread
);
716 if ((p
= ttoproc(t
)) == NULL
) {
717 CHEETAH_LIVELOCK_STAT(proc_proc
);
721 if ((as
= p
->p_as
) == NULL
) {
722 CHEETAH_LIVELOCK_STAT(proc_as
);
726 if ((hat
= as
->a_hat
) == NULL
) {
727 CHEETAH_LIVELOCK_STAT(proc_hat
);
731 if (hat
!= ksfmmup
) {
732 CHEETAH_LIVELOCK_STAT(proc_user
);
733 if (hat
->sfmmu_flags
& (HAT_BUSY
| HAT_SWAPPED
| HAT_SWAPIN
)) {
734 CHEETAH_LIVELOCK_STAT(proc_hat_busy
);
737 tsbinfop
= hat
->sfmmu_tsb
;
738 if (tsbinfop
== NULL
) {
739 CHEETAH_LIVELOCK_STAT(proc_hat_inval
);
742 tsbp
= tsbinfop
->tsb_va
;
743 end_tsbp
= tsbp
+ TSB_BYTES(tsbinfop
->tsb_szc
);
745 CHEETAH_LIVELOCK_STAT(proc_kernel
);
748 end_tsbp
= tsbp
+ TSB_BYTES(ktsb_sz
);
752 if (hat
->sfmmu_as
!= as
) {
753 CHEETAH_LIVELOCK_STAT(proc_hat_inval
);
757 mmu_ctxp
= CPU_MMU_CTXP(cp
);
759 cnum
= hat
->sfmmu_ctxs
[mmu_ctxp
->mmu_idx
].cnum
;
760 CHEETAH_LIVELOCK_STATSET(proc_cnum
, cnum
);
762 if ((cnum
< 0) || (cnum
== INVALID_CONTEXT
) ||
763 (cnum
>= mmu_ctxp
->mmu_nctxs
)) {
764 CHEETAH_LIVELOCK_STAT(proc_cnum_bad
);
769 CHEETAH_LIVELOCK_STAT(proc_tsb_scan
);
772 * Skip TSBs being relocated. This is important because
773 * we want to avoid the following deadlock scenario:
775 * 1) when we came in we set ourselves to "in recover" state.
776 * 2) when we try to touch TSB being relocated the mapping
777 * will be in the suspended state so we'll spin waiting
778 * for it to be unlocked.
779 * 3) when the CPU that holds the TSB mapping locked tries to
780 * unlock it it will send a xtrap which will fail to xcall
781 * us or the CPU we're trying to recover, and will in turn
782 * enter the mondo code.
783 * 4) since we are still spinning on the locked mapping
784 * no further progress will be made and the system will
785 * inevitably hard hang.
787 * A TSB not being relocated can't begin being relocated
788 * while we're accessing it because we check
789 * sendmondo_in_recover before relocating TSBs.
791 if (hat
!= ksfmmup
&&
792 (tsbinfop
->tsb_flags
& TSB_RELOC_FLAG
) != 0) {
793 CHEETAH_LIVELOCK_STAT(proc_tsb_reloc
);
797 for (tsbep
= (struct tsbe
*)tsbp
;
798 tsbep
< (struct tsbe
*)end_tsbp
; tsbep
++) {
799 tsbe_tte
= tsbep
->tte_data
;
801 if (tsbe_tte
.tte_val
== 0) {
807 if (tsbe_tte
.tte_se
) {
809 * Don't want device registers
813 if (tsbe_tte
.tte_cp
== 0) {
815 * Must be cached in E$
819 if (tsbep
->tte_tag
.tag_invalid
!= 0) {
821 * Invalid tag, ingnore this entry.
825 CHEETAH_LIVELOCK_STATSET(proc_tte
, tsbe_tte
);
827 if ((idsr
& (IDSR_NACK_BIT(bn
) |
828 IDSR_BUSY_BIT(bn
))) == 0) {
829 CHEETAH_LIVELOCK_STAT(proc_tsb_partscan
);
832 pahi
= tsbe_tte
.tte_pahi
;
833 palo
= tsbe_tte
.tte_palo
;
834 paddr
= (uint64_t)((pahi
<< 32) |
835 (palo
<< MMU_PAGESHIFT
));
836 claimlines(paddr
, TTEBYTES(TTE_CSZ(&tsbe_tte
)),
837 CH_ECACHE_SUBBLK_SIZE
);
838 if ((idsr
& IDSR_BUSY_BIT(bn
)) == 0) {
844 if (tsbinfop
!= NULL
)
845 tsbinfop
= tsbinfop
->tsb_next
;
846 if (tsbinfop
!= NULL
) {
847 tsbp
= tsbinfop
->tsb_va
;
848 end_tsbp
= tsbp
+ TSB_BYTES(tsbinfop
->tsb_szc
);
849 } else if (tsbp
== ktsb_base
) {
850 tried_kernel_tsb
= 1;
851 } else if (!tried_kernel_tsb
) {
853 end_tsbp
= tsbp
+ TSB_BYTES(ktsb_sz
);
857 } while (tsbinfop
!= NULL
||
858 ((tsbp
== ktsb_base
) && !tried_kernel_tsb
));
860 CHEETAH_LIVELOCK_STAT(proc_tsb_fullscan
);
861 CHEETAH_LIVELOCK_MAXSTAT(proc_claimed
, pages_claimed
);
864 if ((idsr
& (IDSR_NACK_BIT(bn
) |
865 IDSR_BUSY_BIT(bn
))) == 0) {
873 CHEETAH_LIVELOCK_MAXSTAT(proc_claimed
, pages_claimed
);
882 * Attempt to claim ownership, temporarily, of every cache line that a
883 * non-responsive cpu might be using. This might kick that cpu out of
886 * The return value indicates to the caller if we have exhausted all recovery
887 * techniques. If 1 is returned, it is useless to call this function again
888 * even for a different target CPU.
891 mondo_recover(uint16_t cpuid
, int bn
)
894 uint64_t begin_pa
, end_pa
, cur_pa
;
895 hrtime_t begin_hrt
, end_hrt
;
897 int pages_claimed
= 0;
898 cheetah_livelock_entry_t
*histp
;
901 if (atomic_cas_32(&sendmondo_in_recover
, 0, 1) != 0) {
903 * Wait while recovery takes place
905 while (sendmondo_in_recover
) {
909 * Assume we didn't claim the whole memory. If
910 * the target of this caller is not recovered,
916 CHEETAH_LIVELOCK_ENTRY_NEXT(histp
);
917 CHEETAH_LIVELOCK_ENTRY_SET(histp
, lbolt
, LBOLT_WAITFREE
);
918 CHEETAH_LIVELOCK_ENTRY_SET(histp
, cpuid
, cpuid
);
919 CHEETAH_LIVELOCK_ENTRY_SET(histp
, buddy
, CPU
->cpu_id
);
921 begin_hrt
= gethrtime_waitfree();
923 * First try to claim the lines in the TSB the target
924 * may have been using.
926 if (mondo_recover_proc(cpuid
, bn
) == 1) {
928 * Didn't claim the whole memory
934 * We tried using the TSB. The target is still
935 * not recovered. Check if complete memory scan is
938 if (cheetah_sendmondo_fullscan
== 0) {
940 * Full memory scan is disabled.
947 * Try claiming the whole memory.
949 for (seg
= memsegs
; seg
; seg
= seg
->next
) {
950 begin_pa
= (uint64_t)(seg
->pages_base
) << MMU_PAGESHIFT
;
951 end_pa
= (uint64_t)(seg
->pages_end
) << MMU_PAGESHIFT
;
952 for (cur_pa
= begin_pa
; cur_pa
< end_pa
;
953 cur_pa
+= MMU_PAGESIZE
) {
955 if ((idsr
& (IDSR_NACK_BIT(bn
) |
956 IDSR_BUSY_BIT(bn
))) == 0) {
958 * Didn't claim all memory
962 claimlines(cur_pa
, MMU_PAGESIZE
,
963 CH_ECACHE_SUBBLK_SIZE
);
964 if ((idsr
& IDSR_BUSY_BIT(bn
)) == 0) {
972 * We did all we could.
980 end_hrt
= gethrtime_waitfree();
981 CHEETAH_LIVELOCK_STAT(recovery
);
982 CHEETAH_LIVELOCK_MAXSTAT(hrt
, (end_hrt
- begin_hrt
));
983 CHEETAH_LIVELOCK_MAXSTAT(full_claimed
, pages_claimed
);
984 CHEETAH_LIVELOCK_ENTRY_SET(histp
, recovery_time
, \
985 (end_hrt
- begin_hrt
));
987 while (atomic_cas_32(&sendmondo_in_recover
, 1, 0) != 1)
994 * This is called by the cyclic framework when this CPU becomes online
998 cheetah_nudge_onln(void *arg
, cpu_t
*cpu
, cyc_handler_t
*hdlr
, cyc_time_t
*when
)
1001 hdlr
->cyh_func
= (cyc_func_t
)cheetah_nudge_buddy
;
1002 hdlr
->cyh_level
= CY_LOW_LEVEL
;
1003 hdlr
->cyh_arg
= NULL
;
1006 * Stagger the start time
1008 when
->cyt_when
= cpu
->cpu_id
* (NANOSEC
/ NCPU
);
1009 if (cheetah_sendmondo_recover_delay
< CHEETAH_LIVELOCK_MIN_DELAY
) {
1010 cheetah_sendmondo_recover_delay
= CHEETAH_LIVELOCK_MIN_DELAY
;
1012 when
->cyt_interval
= cheetah_sendmondo_recover_delay
* NANOSEC
;
1016 * Create a low level cyclic to send a xtrap to the next cpu online.
1017 * However, there's no need to have this running on a uniprocessor system.
1020 cheetah_nudge_init(void)
1022 cyc_omni_handler_t hdlr
;
1024 if (max_ncpus
== 1) {
1028 hdlr
.cyo_online
= cheetah_nudge_onln
;
1029 hdlr
.cyo_offline
= NULL
;
1030 hdlr
.cyo_arg
= NULL
;
1032 mutex_enter(&cpu_lock
);
1033 (void) cyclic_add_omni(&hdlr
);
1034 mutex_exit(&cpu_lock
);
1038 * Cyclic handler to wake up buddy
1041 cheetah_nudge_buddy(void)
1044 * Disable kernel preemption to protect the cpu list
1047 if ((CPU
->cpu_next_onln
!= CPU
) && (sendmondo_in_recover
== 0)) {
1048 xt_one(CPU
->cpu_next_onln
->cpu_id
, (xcfunc_t
*)xt_sync_tl1
,
1054 #endif /* CHEETAHPLUS_ERRATUM_25 */
1056 #ifdef SEND_MONDO_STATS
1057 uint32_t x_one_stimes
[64];
1058 uint32_t x_one_ltimes
[16];
1059 uint32_t x_set_stimes
[64];
1060 uint32_t x_set_ltimes
[16];
1061 uint32_t x_set_cpus
[NCPU
];
1062 uint32_t x_nack_stimes
[64];
1066 * Note: A version of this function is used by the debugger via the KDI,
1067 * and must be kept in sync with this version. Any changes made to this
1068 * function to support new chips or to accomodate errata must also be included
1069 * in the KDI-specific version. See us3_kdi.c.
1072 send_one_mondo(int cpuid
)
1075 uint64_t idsr
, starttick
, endtick
, tick
, lasttick
;
1077 #ifdef CHEETAHPLUS_ERRATUM_25
1081 CPU_STATS_ADDQ(CPU
, sys
, xcalls
, 1);
1082 starttick
= lasttick
= gettick();
1084 endtick
= starttick
+ xc_tick_limit
;
1086 #if defined(JALAPENO) || defined(SERRANO)
1088 * Lower 2 bits of the agent ID determine which BUSY/NACK pair
1089 * will be used for dispatching interrupt. For now, assume
1090 * there are no more than IDSR_BN_SETS CPUs, hence no aliasing
1091 * issues with respect to BUSY/NACK pair usage.
1093 busymask
= IDSR_BUSY_BIT(cpuid
);
1094 #else /* JALAPENO || SERRANO */
1095 busymask
= IDSR_BUSY
;
1096 #endif /* JALAPENO || SERRANO */
1104 * If there is a big jump between the current tick
1105 * count and lasttick, we have probably hit a break
1106 * point. Adjust endtick accordingly to avoid panic.
1108 if (tick
> (lasttick
+ xc_tick_jump_limit
))
1109 endtick
+= (tick
- lasttick
);
1111 if (tick
> endtick
) {
1114 #ifdef CHEETAHPLUS_ERRATUM_25
1115 if (cheetah_sendmondo_recover
&& recovered
== 0) {
1116 if (mondo_recover(cpuid
, 0)) {
1118 * We claimed the whole memory or
1119 * full scan is disabled.
1124 endtick
= tick
+ xc_tick_limit
;
1131 #endif /* CHEETAHPLUS_ERRATUM_25 */
1133 cmn_err(CE_PANIC
, "send mondo timeout "
1134 "(target 0x%x) [%d NACK %d BUSY]",
1139 if (idsr
& busymask
) {
1148 #ifdef SEND_MONDO_STATS
1150 int n
= gettick() - starttick
;
1152 x_one_stimes
[n
>> 7]++;
1154 x_one_ltimes
[(n
>> 13) & 0xf]++;
1165 * Return processor specific async error structure
1171 return (sizeof (ch_async_flt_t
));
1175 * Tunable to disable the checking of other cpu logout areas during panic for
1176 * potential syndrome 71 generating errors.
1178 int enable_check_other_cpus_logout
= 1;
1181 * Check other cpus logout area for potential synd 71 generating
1185 cpu_check_cpu_logout(int cpuid
, caddr_t tpc
, int tl
, int ecc_type
,
1186 ch_cpu_logout_t
*clop
)
1188 struct async_flt
*aflt
;
1189 ch_async_flt_t ch_flt
;
1190 uint64_t t_afar
, t_afsr
, t_afsr_ext
, t_afsr_errs
;
1192 if (clop
== NULL
|| clop
->clo_data
.chd_afar
== LOGOUT_INVALID
) {
1196 bzero(&ch_flt
, sizeof (ch_async_flt_t
));
1198 t_afar
= clop
->clo_data
.chd_afar
;
1199 t_afsr
= clop
->clo_data
.chd_afsr
;
1200 t_afsr_ext
= clop
->clo_data
.chd_afsr_ext
;
1201 #if defined(SERRANO)
1202 ch_flt
.afar2
= clop
->clo_data
.chd_afar2
;
1203 #endif /* SERRANO */
1206 * In order to simplify code, we maintain this afsr_errs
1207 * variable which holds the aggregate of AFSR and AFSR_EXT
1210 t_afsr_errs
= (t_afsr_ext
& C_AFSR_EXT_ALL_ERRS
) |
1211 (t_afsr
& C_AFSR_ALL_ERRS
);
1213 /* Setup the async fault structure */
1214 aflt
= (struct async_flt
*)&ch_flt
;
1215 aflt
->flt_id
= gethrtime_waitfree();
1216 ch_flt
.afsr_ext
= t_afsr_ext
;
1217 ch_flt
.afsr_errs
= t_afsr_errs
;
1218 aflt
->flt_stat
= t_afsr
;
1219 aflt
->flt_addr
= t_afar
;
1220 aflt
->flt_bus_id
= cpuid
;
1221 aflt
->flt_inst
= cpuid
;
1223 aflt
->flt_prot
= AFLT_PROT_NONE
;
1224 aflt
->flt_class
= CPU_FAULT
;
1225 aflt
->flt_priv
= ((t_afsr
& C_AFSR_PRIV
) != 0);
1227 aflt
->flt_status
= ecc_type
;
1228 aflt
->flt_panic
= C_AFSR_PANIC(t_afsr_errs
);
1231 * Queue events on the async event queue, one event per error bit.
1232 * If no events are queued, queue an event to complain.
1234 if (cpu_queue_events(&ch_flt
, NULL
, t_afsr_errs
, clop
) == 0) {
1235 ch_flt
.flt_type
= CPU_INV_AFSR
;
1236 cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_INVALID_AFSR
,
1237 (void *)&ch_flt
, sizeof (ch_async_flt_t
), ue_queue
,
1242 * Zero out + invalidate CPU logout.
1244 bzero(clop
, sizeof (ch_cpu_logout_t
));
1245 clop
->clo_data
.chd_afar
= LOGOUT_INVALID
;
1249 * Check the logout areas of all other cpus for unlogged errors.
1252 cpu_check_other_cpus_logout(void)
1257 ch_err_tl1_data_t
*cl1p
;
1260 for (i
= 0; i
< NCPU
; i
++) {
1263 if ((cp
== NULL
) || !(cp
->cpu_flags
& CPU_EXISTS
) ||
1264 (cp
->cpu_id
== myid
) || (CPU_PRIVATE(cp
) == NULL
)) {
1269 * Check each of the tl>0 logout areas
1271 cl1p
= CPU_PRIVATE_PTR(cp
, chpr_tl1_err_data
[0]);
1272 for (j
= 0; j
< CH_ERR_TL1_TLMAX
; j
++, cl1p
++) {
1273 if (cl1p
->ch_err_tl1_flags
== 0)
1276 cpu_check_cpu_logout(i
, (caddr_t
)cl1p
->ch_err_tl1_tpc
,
1277 1, ECC_F_TRAP
, &cl1p
->ch_err_tl1_logout
);
1281 * Check each of the remaining logout areas
1283 cpu_check_cpu_logout(i
, NULL
, 0, ECC_F_TRAP
,
1284 CPU_PRIVATE_PTR(cp
, chpr_fecctl0_logout
));
1285 cpu_check_cpu_logout(i
, NULL
, 0, ECC_C_TRAP
,
1286 CPU_PRIVATE_PTR(cp
, chpr_cecc_logout
));
1287 cpu_check_cpu_logout(i
, NULL
, 0, ECC_D_TRAP
,
1288 CPU_PRIVATE_PTR(cp
, chpr_async_logout
));
1293 * The fast_ecc_err handler transfers control here for UCU, UCC events.
1294 * Note that we flush Ecache twice, once in the fast_ecc_err handler to
1295 * flush the error that caused the UCU/UCC, then again here at the end to
1296 * flush the TL=1 trap handler code out of the Ecache, so we can minimize
1297 * the probability of getting a TL>1 Fast ECC trap when we're fielding
1298 * another Fast ECC trap.
1300 * Cheetah+ also handles: TSCE: No additional processing required.
1301 * Panther adds L3_UCU and L3_UCC which are reported in AFSR_EXT.
1303 * Note that the p_clo_flags input is only valid in cases where the
1304 * cpu_private struct is not yet initialized (since that is the only
1305 * time that information cannot be obtained from the logout struct.)
1309 cpu_fast_ecc_error(struct regs
*rp
, ulong_t p_clo_flags
)
1311 ch_cpu_logout_t
*clop
;
1312 uint64_t ceen
, nceen
;
1315 * Get the CPU log out info. If we can't find our CPU private
1316 * pointer, then we will have to make due without any detailed
1317 * logout information.
1319 if (CPU_PRIVATE(CPU
) == NULL
) {
1321 ceen
= p_clo_flags
& EN_REG_CEEN
;
1322 nceen
= p_clo_flags
& EN_REG_NCEEN
;
1324 clop
= CPU_PRIVATE_PTR(CPU
, chpr_fecctl0_logout
);
1325 ceen
= clop
->clo_flags
& EN_REG_CEEN
;
1326 nceen
= clop
->clo_flags
& EN_REG_NCEEN
;
1329 cpu_log_fast_ecc_error((caddr_t
)rp
->r_pc
,
1330 (rp
->r_tstate
& TSTATE_PRIV
) ? 1 : 0, 0, ceen
, nceen
, clop
);
1334 * Log fast ecc error, called from either Fast ECC at TL=0 or Fast
1335 * ECC at TL>0. Need to supply either a error register pointer or a
1336 * cpu logout structure pointer.
1339 cpu_log_fast_ecc_error(caddr_t tpc
, int priv
, int tl
, uint64_t ceen
,
1340 uint64_t nceen
, ch_cpu_logout_t
*clop
)
1342 struct async_flt
*aflt
;
1343 ch_async_flt_t ch_flt
;
1344 uint64_t t_afar
, t_afsr
, t_afsr_ext
, t_afsr_errs
;
1345 char pr_reason
[MAX_REASON_STRING
];
1346 ch_cpu_errors_t cpu_error_regs
;
1348 bzero(&ch_flt
, sizeof (ch_async_flt_t
));
1350 * If no cpu logout data, then we will have to make due without
1351 * any detailed logout information.
1354 ch_flt
.flt_diag_data
.chd_afar
= LOGOUT_INVALID
;
1355 get_cpu_error_state(&cpu_error_regs
);
1356 set_cpu_error_state(&cpu_error_regs
);
1357 t_afar
= cpu_error_regs
.afar
;
1358 t_afsr
= cpu_error_regs
.afsr
;
1359 t_afsr_ext
= cpu_error_regs
.afsr_ext
;
1360 #if defined(SERRANO)
1361 ch_flt
.afar2
= cpu_error_regs
.afar2
;
1362 #endif /* SERRANO */
1364 t_afar
= clop
->clo_data
.chd_afar
;
1365 t_afsr
= clop
->clo_data
.chd_afsr
;
1366 t_afsr_ext
= clop
->clo_data
.chd_afsr_ext
;
1367 #if defined(SERRANO)
1368 ch_flt
.afar2
= clop
->clo_data
.chd_afar2
;
1369 #endif /* SERRANO */
1373 * In order to simplify code, we maintain this afsr_errs
1374 * variable which holds the aggregate of AFSR and AFSR_EXT
1377 t_afsr_errs
= (t_afsr_ext
& C_AFSR_EXT_ALL_ERRS
) |
1378 (t_afsr
& C_AFSR_ALL_ERRS
);
1379 pr_reason
[0] = '\0';
1381 /* Setup the async fault structure */
1382 aflt
= (struct async_flt
*)&ch_flt
;
1383 aflt
->flt_id
= gethrtime_waitfree();
1384 ch_flt
.afsr_ext
= t_afsr_ext
;
1385 ch_flt
.afsr_errs
= t_afsr_errs
;
1386 aflt
->flt_stat
= t_afsr
;
1387 aflt
->flt_addr
= t_afar
;
1388 aflt
->flt_bus_id
= getprocessorid();
1389 aflt
->flt_inst
= CPU
->cpu_id
;
1391 aflt
->flt_prot
= AFLT_PROT_NONE
;
1392 aflt
->flt_class
= CPU_FAULT
;
1393 aflt
->flt_priv
= priv
;
1395 aflt
->flt_status
= ECC_F_TRAP
;
1396 aflt
->flt_panic
= C_AFSR_PANIC(t_afsr_errs
);
1399 * XXXX - Phenomenal hack to get around Solaris not getting all the
1400 * cmn_err messages out to the console. The situation is a UCU (in
1401 * priv mode) which causes a WDU which causes a UE (on the retry).
1402 * The messages for the UCU and WDU are enqueued and then pulled off
1403 * the async queue via softint and syslogd starts to process them
1404 * but doesn't get them to the console. The UE causes a panic, but
1405 * since the UCU/WDU messages are already in transit, those aren't
1406 * on the async queue. The hack is to check if we have a matching
1407 * WDU event for the UCU, and if it matches, we're more than likely
1408 * going to panic with a UE, unless we're under protection. So, we
1409 * check to see if we got a matching WDU event and if we're under
1412 * For Cheetah/Cheetah+/Jaguar/Jalapeno, the sequence we care about
1415 * For Panther, it could look like either of these:
1416 * UCU---->WDU->L3_WDU->UE
1417 * L3_UCU->WDU->L3_WDU->UE
1419 if ((t_afsr_errs
& (C_AFSR_UCU
| C_AFSR_L3_UCU
)) &&
1420 aflt
->flt_panic
== 0 && aflt
->flt_priv
!= 0 &&
1421 curthread
->t_ontrap
== NULL
&& curthread
->t_lofault
== NULL
) {
1422 get_cpu_error_state(&cpu_error_regs
);
1423 if (IS_PANTHER(cpunodes
[CPU
->cpu_id
].implementation
)) {
1425 ((cpu_error_regs
.afsr
& C_AFSR_WDU
) &&
1426 (cpu_error_regs
.afsr_ext
& C_AFSR_L3_WDU
) &&
1427 (cpu_error_regs
.afar
== t_afar
));
1428 aflt
->flt_panic
|= ((clop
== NULL
) &&
1429 (t_afsr_errs
& C_AFSR_WDU
) &&
1430 (t_afsr_errs
& C_AFSR_L3_WDU
));
1433 ((cpu_error_regs
.afsr
& C_AFSR_WDU
) &&
1434 (cpu_error_regs
.afar
== t_afar
));
1435 aflt
->flt_panic
|= ((clop
== NULL
) &&
1436 (t_afsr_errs
& C_AFSR_WDU
));
1441 * Queue events on the async event queue, one event per error bit.
1442 * If no events are queued or no Fast ECC events are on in the AFSR,
1443 * queue an event to complain.
1445 if (cpu_queue_events(&ch_flt
, pr_reason
, t_afsr_errs
, clop
) == 0 ||
1446 ((t_afsr_errs
& (C_AFSR_FECC_ERRS
| C_AFSR_EXT_FECC_ERRS
)) == 0)) {
1447 ch_flt
.flt_type
= CPU_INV_AFSR
;
1448 cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_INVALID_AFSR
,
1449 (void *)&ch_flt
, sizeof (ch_async_flt_t
), ue_queue
,
1454 * Zero out + invalidate CPU logout.
1457 bzero(clop
, sizeof (ch_cpu_logout_t
));
1458 clop
->clo_data
.chd_afar
= LOGOUT_INVALID
;
1462 * We carefully re-enable NCEEN and CEEN and then check if any deferred
1463 * or disrupting errors have happened. We do this because if a
1464 * deferred or disrupting error had occurred with NCEEN/CEEN off, the
1465 * trap will not be taken when NCEEN/CEEN is re-enabled. Note that
1466 * CEEN works differently on Cheetah than on Spitfire. Also, we enable
1467 * NCEEN/CEEN *before* checking the AFSR to avoid the small window of a
1468 * deferred or disrupting error happening between checking the AFSR and
1469 * enabling NCEEN/CEEN.
1471 * Note: CEEN and NCEEN are only reenabled if they were on when trap
1474 set_error_enable(get_error_enable() | (nceen
| ceen
));
1475 if (clear_errors(&ch_flt
)) {
1476 aflt
->flt_panic
|= ((ch_flt
.afsr_errs
&
1477 (C_AFSR_EXT_ASYNC_ERRS
| C_AFSR_ASYNC_ERRS
)) != 0);
1478 (void) cpu_queue_events(&ch_flt
, pr_reason
, ch_flt
.afsr_errs
,
1483 * Panic here if aflt->flt_panic has been set. Enqueued errors will
1484 * be logged as part of the panic flow.
1486 if (aflt
->flt_panic
)
1487 fm_panic("%sError(s)", pr_reason
);
1490 * Flushing the Ecache here gets the part of the trap handler that
1491 * is run at TL=1 out of the Ecache.
1497 * This is called via sys_trap from pil15_interrupt code if the
1498 * corresponding entry in ch_err_tl1_pending is set. Checks the
1499 * various ch_err_tl1_data structures for valid entries based on the bit
1500 * settings in the ch_err_tl1_flags entry of the structure.
1504 cpu_tl1_error(struct regs
*rp
, int panic
)
1506 ch_err_tl1_data_t
*cl1p
, cl1
;
1509 uint64_t ceen
, nceen
;
1511 if (ch_err_tl1_paddrs
[CPU
->cpu_id
] == 0) {
1512 cl1p
= &ch_err_tl1_data
;
1514 } else if (CPU_PRIVATE(CPU
) != NULL
) {
1515 cl1p
= CPU_PRIVATE_PTR(CPU
, chpr_tl1_err_data
[0]);
1516 ncl1ps
= CH_ERR_TL1_TLMAX
;
1521 for (i
= 0; i
< ncl1ps
; i
++, cl1p
++) {
1522 if (cl1p
->ch_err_tl1_flags
== 0)
1526 * Grab a copy of the logout data and invalidate
1530 bzero(cl1p
, sizeof (ch_err_tl1_data_t
));
1531 cl1p
->ch_err_tl1_logout
.clo_data
.chd_afar
= LOGOUT_INVALID
;
1532 me_flags
= CH_ERR_ME_FLAGS(cl1
.ch_err_tl1_flags
);
1535 * Log "first error" in ch_err_tl1_data.
1537 if (cl1
.ch_err_tl1_flags
& CH_ERR_FECC
) {
1538 ceen
= get_error_enable() & EN_REG_CEEN
;
1539 nceen
= get_error_enable() & EN_REG_NCEEN
;
1540 cpu_log_fast_ecc_error((caddr_t
)cl1
.ch_err_tl1_tpc
, 1,
1541 1, ceen
, nceen
, &cl1
.ch_err_tl1_logout
);
1543 #if defined(CPU_IMP_L1_CACHE_PARITY)
1544 if (cl1
.ch_err_tl1_flags
& (CH_ERR_IPE
| CH_ERR_DPE
)) {
1545 cpu_parity_error(rp
, cl1
.ch_err_tl1_flags
,
1546 (caddr_t
)cl1
.ch_err_tl1_tpc
);
1548 #endif /* CPU_IMP_L1_CACHE_PARITY */
1551 * Log "multiple events" in ch_err_tl1_data. Note that
1552 * we don't read and clear the AFSR/AFAR in the TL>0 code
1553 * if the structure is busy, we just do the cache flushing
1554 * we have to do and then do the retry. So the AFSR/AFAR
1555 * at this point *should* have some relevant info. If there
1556 * are no valid errors in the AFSR, we'll assume they've
1557 * already been picked up and logged. For I$/D$ parity,
1558 * we just log an event with an "Unknown" (NULL) TPC.
1560 if (me_flags
& CH_ERR_FECC
) {
1561 ch_cpu_errors_t cpu_error_regs
;
1562 uint64_t t_afsr_errs
;
1565 * Get the error registers and see if there's
1566 * a pending error. If not, don't bother
1567 * generating an "Invalid AFSR" error event.
1569 get_cpu_error_state(&cpu_error_regs
);
1570 t_afsr_errs
= (cpu_error_regs
.afsr_ext
&
1571 C_AFSR_EXT_ALL_ERRS
) |
1572 (cpu_error_regs
.afsr
& C_AFSR_ALL_ERRS
);
1573 if (t_afsr_errs
!= 0) {
1574 ceen
= get_error_enable() & EN_REG_CEEN
;
1575 nceen
= get_error_enable() & EN_REG_NCEEN
;
1576 cpu_log_fast_ecc_error(NULL
, 1,
1577 1, ceen
, nceen
, NULL
);
1580 #if defined(CPU_IMP_L1_CACHE_PARITY)
1581 if (me_flags
& (CH_ERR_IPE
| CH_ERR_DPE
)) {
1582 cpu_parity_error(rp
, me_flags
, NULL
);
1584 #endif /* CPU_IMP_L1_CACHE_PARITY */
1589 * Called from Fast ECC TL>0 handler in case of fatal error.
1590 * cpu_tl1_error should always find an associated ch_err_tl1_data structure,
1591 * but if we don't, we'll panic with something reasonable.
1595 cpu_tl1_err_panic(struct regs
*rp
, ulong_t flags
)
1597 cpu_tl1_error(rp
, 1);
1599 * Should never return, but just in case.
1601 fm_panic("Unsurvivable ECC Error at TL>0");
1605 * The ce_err/ce_err_tl1 handlers transfer control here for CE, EMC, EDU:ST,
1606 * EDC, WDU, WDC, CPU, CPC, IVU, IVC events.
1607 * Disrupting errors controlled by NCEEN: EDU:ST, WDU, CPU, IVU
1608 * Disrupting errors controlled by CEEN: CE, EMC, EDC, WDC, CPC, IVC
1610 * Cheetah+ also handles (No additional processing required):
1611 * DUE, DTO, DBERR (NCEEN controlled)
1612 * THCE (CEEN and ET_ECC_en controlled)
1613 * TUE (ET_ECC_en controlled)
1615 * Panther further adds:
1616 * IMU, L3_EDU, L3_WDU, L3_CPU (NCEEN controlled)
1617 * IMC, L3_EDC, L3_WDC, L3_CPC, L3_THCE (CEEN controlled)
1618 * TUE_SH, TUE (NCEEN and L2_tag_ECC_en controlled)
1619 * L3_TUE, L3_TUE_SH (NCEEN and ET_ECC_en controlled)
1620 * THCE (CEEN and L2_tag_ECC_en controlled)
1621 * L3_THCE (CEEN and ET_ECC_en controlled)
1623 * Note that the p_clo_flags input is only valid in cases where the
1624 * cpu_private struct is not yet initialized (since that is the only
1625 * time that information cannot be obtained from the logout struct.)
1629 cpu_disrupting_error(struct regs
*rp
, ulong_t p_clo_flags
)
1631 struct async_flt
*aflt
;
1632 ch_async_flt_t ch_flt
;
1633 char pr_reason
[MAX_REASON_STRING
];
1634 ch_cpu_logout_t
*clop
;
1635 uint64_t t_afar
, t_afsr
, t_afsr_ext
, t_afsr_errs
;
1636 ch_cpu_errors_t cpu_error_regs
;
1638 bzero(&ch_flt
, sizeof (ch_async_flt_t
));
1640 * Get the CPU log out info. If we can't find our CPU private
1641 * pointer, then we will have to make due without any detailed
1642 * logout information.
1644 if (CPU_PRIVATE(CPU
) == NULL
) {
1646 ch_flt
.flt_diag_data
.chd_afar
= LOGOUT_INVALID
;
1647 get_cpu_error_state(&cpu_error_regs
);
1648 set_cpu_error_state(&cpu_error_regs
);
1649 t_afar
= cpu_error_regs
.afar
;
1650 t_afsr
= cpu_error_regs
.afsr
;
1651 t_afsr_ext
= cpu_error_regs
.afsr_ext
;
1652 #if defined(SERRANO)
1653 ch_flt
.afar2
= cpu_error_regs
.afar2
;
1654 #endif /* SERRANO */
1656 clop
= CPU_PRIVATE_PTR(CPU
, chpr_cecc_logout
);
1657 t_afar
= clop
->clo_data
.chd_afar
;
1658 t_afsr
= clop
->clo_data
.chd_afsr
;
1659 t_afsr_ext
= clop
->clo_data
.chd_afsr_ext
;
1660 #if defined(SERRANO)
1661 ch_flt
.afar2
= clop
->clo_data
.chd_afar2
;
1662 #endif /* SERRANO */
1666 * In order to simplify code, we maintain this afsr_errs
1667 * variable which holds the aggregate of AFSR and AFSR_EXT
1670 t_afsr_errs
= (t_afsr_ext
& C_AFSR_EXT_ALL_ERRS
) |
1671 (t_afsr
& C_AFSR_ALL_ERRS
);
1673 pr_reason
[0] = '\0';
1674 /* Setup the async fault structure */
1675 aflt
= (struct async_flt
*)&ch_flt
;
1676 ch_flt
.afsr_ext
= t_afsr_ext
;
1677 ch_flt
.afsr_errs
= t_afsr_errs
;
1678 aflt
->flt_stat
= t_afsr
;
1679 aflt
->flt_addr
= t_afar
;
1680 aflt
->flt_pc
= (caddr_t
)rp
->r_pc
;
1681 aflt
->flt_priv
= (rp
->r_tstate
& TSTATE_PRIV
) ? 1 : 0;
1683 aflt
->flt_panic
= C_AFSR_PANIC(t_afsr_errs
);
1686 * If this trap is a result of one of the errors not masked
1687 * by cpu_ce_not_deferred, we don't reenable CEEN. Instead
1688 * indicate that a timeout is to be set later.
1690 if (!(t_afsr_errs
& (cpu_ce_not_deferred
| cpu_ce_not_deferred_ext
)) &&
1692 ch_flt
.flt_trapped_ce
= CE_CEEN_DEFER
| CE_CEEN_TRAPPED
;
1694 ch_flt
.flt_trapped_ce
= CE_CEEN_NODEFER
| CE_CEEN_TRAPPED
;
1697 * log the CE and clean up
1699 cpu_log_and_clear_ce(&ch_flt
);
1702 * We re-enable CEEN (if required) and check if any disrupting errors
1703 * have happened. We do this because if a disrupting error had occurred
1704 * with CEEN off, the trap will not be taken when CEEN is re-enabled.
1705 * Note that CEEN works differently on Cheetah than on Spitfire. Also,
1706 * we enable CEEN *before* checking the AFSR to avoid the small window
1707 * of a error happening between checking the AFSR and enabling CEEN.
1709 if (ch_flt
.flt_trapped_ce
& CE_CEEN_NODEFER
)
1710 set_error_enable(get_error_enable() | EN_REG_CEEN
);
1711 if (clear_errors(&ch_flt
)) {
1712 (void) cpu_queue_events(&ch_flt
, pr_reason
, ch_flt
.afsr_errs
,
1717 * Panic here if aflt->flt_panic has been set. Enqueued errors will
1718 * be logged as part of the panic flow.
1720 if (aflt
->flt_panic
)
1721 fm_panic("%sError(s)", pr_reason
);
1725 * The async_err handler transfers control here for UE, EMU, EDU:BLD,
1726 * L3_EDU:BLD, TO, and BERR events.
1727 * Deferred errors controlled by NCEEN: UE, EMU, EDU:BLD, L3_EDU:BLD, TO, BERR
1729 * Cheetah+: No additional errors handled.
1731 * Note that the p_clo_flags input is only valid in cases where the
1732 * cpu_private struct is not yet initialized (since that is the only
1733 * time that information cannot be obtained from the logout struct.)
1737 cpu_deferred_error(struct regs
*rp
, ulong_t p_clo_flags
)
1740 ch_async_flt_t ch_flt
;
1741 struct async_flt
*aflt
;
1742 int trampolined
= 0;
1743 char pr_reason
[MAX_REASON_STRING
];
1744 ch_cpu_logout_t
*clop
;
1745 uint64_t ceen
, clo_flags
;
1747 uint64_t t_afar
, t_afsr
, t_afsr_ext
, t_afsr_errs
;
1748 ch_cpu_errors_t cpu_error_regs
;
1749 int expected
= DDI_FM_ERR_UNEXPECTED
;
1753 * We need to look at p_flag to determine if the thread detected an
1754 * error while dumping core. We can't grab p_lock here, but it's ok
1755 * because we just need a consistent snapshot and we know that everyone
1756 * else will store a consistent set of bits while holding p_lock. We
1757 * don't have to worry about a race because SDOCORE is set once prior
1758 * to doing i/o from the process's address space and is never cleared.
1760 uint_t pflag
= ttoproc(curthread
)->p_flag
;
1762 bzero(&ch_flt
, sizeof (ch_async_flt_t
));
1764 * Get the CPU log out info. If we can't find our CPU private
1765 * pointer then we will have to make due without any detailed
1766 * logout information.
1768 if (CPU_PRIVATE(CPU
) == NULL
) {
1770 ch_flt
.flt_diag_data
.chd_afar
= LOGOUT_INVALID
;
1771 get_cpu_error_state(&cpu_error_regs
);
1772 set_cpu_error_state(&cpu_error_regs
);
1773 t_afar
= cpu_error_regs
.afar
;
1774 t_afsr
= cpu_error_regs
.afsr
;
1775 t_afsr_ext
= cpu_error_regs
.afsr_ext
;
1776 #if defined(SERRANO)
1777 ch_flt
.afar2
= cpu_error_regs
.afar2
;
1778 #endif /* SERRANO */
1779 clo_flags
= p_clo_flags
;
1781 clop
= CPU_PRIVATE_PTR(CPU
, chpr_async_logout
);
1782 t_afar
= clop
->clo_data
.chd_afar
;
1783 t_afsr
= clop
->clo_data
.chd_afsr
;
1784 t_afsr_ext
= clop
->clo_data
.chd_afsr_ext
;
1785 #if defined(SERRANO)
1786 ch_flt
.afar2
= clop
->clo_data
.chd_afar2
;
1787 #endif /* SERRANO */
1788 clo_flags
= clop
->clo_flags
;
1792 * In order to simplify code, we maintain this afsr_errs
1793 * variable which holds the aggregate of AFSR and AFSR_EXT
1796 t_afsr_errs
= (t_afsr_ext
& C_AFSR_EXT_ALL_ERRS
) |
1797 (t_afsr
& C_AFSR_ALL_ERRS
);
1798 pr_reason
[0] = '\0';
1801 * Grab information encoded into our clo_flags field.
1803 ceen
= clo_flags
& EN_REG_CEEN
;
1804 tl
= (clo_flags
& CLO_FLAGS_TL_MASK
) >> CLO_FLAGS_TL_SHIFT
;
1805 ttype
= (clo_flags
& CLO_FLAGS_TT_MASK
) >> CLO_FLAGS_TT_SHIFT
;
1808 * handle the specific error
1810 aflt
= (struct async_flt
*)&ch_flt
;
1811 aflt
->flt_id
= gethrtime_waitfree();
1812 aflt
->flt_bus_id
= getprocessorid();
1813 aflt
->flt_inst
= CPU
->cpu_id
;
1814 ch_flt
.afsr_ext
= t_afsr_ext
;
1815 ch_flt
.afsr_errs
= t_afsr_errs
;
1816 aflt
->flt_stat
= t_afsr
;
1817 aflt
->flt_addr
= t_afar
;
1818 aflt
->flt_pc
= (caddr_t
)rp
->r_pc
;
1819 aflt
->flt_prot
= AFLT_PROT_NONE
;
1820 aflt
->flt_class
= CPU_FAULT
;
1821 aflt
->flt_priv
= (rp
->r_tstate
& TSTATE_PRIV
) ? 1 : 0;
1822 aflt
->flt_tl
= (uchar_t
)tl
;
1823 aflt
->flt_panic
= ((tl
!= 0) || (aft_testfatal
!= 0) ||
1824 C_AFSR_PANIC(t_afsr_errs
));
1825 aflt
->flt_core
= (pflag
& SDOCORE
) ? 1 : 0;
1826 aflt
->flt_status
= ((ttype
== T_DATA_ERROR
) ? ECC_D_TRAP
: ECC_I_TRAP
);
1829 * If the trap occurred in privileged mode at TL=0, we need to check to
1830 * see if we were executing in the kernel under on_trap() or t_lofault
1831 * protection. If so, modify the saved registers so that we return
1832 * from the trap to the appropriate trampoline routine.
1834 if (aflt
->flt_priv
&& tl
== 0) {
1835 if (curthread
->t_ontrap
!= NULL
) {
1836 on_trap_data_t
*otp
= curthread
->t_ontrap
;
1838 if (otp
->ot_prot
& OT_DATA_EC
) {
1839 aflt
->flt_prot
= AFLT_PROT_EC
;
1840 otp
->ot_trap
|= OT_DATA_EC
;
1841 rp
->r_pc
= otp
->ot_trampoline
;
1842 rp
->r_npc
= rp
->r_pc
+ 4;
1846 if ((t_afsr
& (C_AFSR_TO
| C_AFSR_BERR
)) &&
1847 (otp
->ot_prot
& OT_DATA_ACCESS
)) {
1848 aflt
->flt_prot
= AFLT_PROT_ACCESS
;
1849 otp
->ot_trap
|= OT_DATA_ACCESS
;
1850 rp
->r_pc
= otp
->ot_trampoline
;
1851 rp
->r_npc
= rp
->r_pc
+ 4;
1854 * for peeks and caut_gets errors are expected
1856 hp
= (ddi_acc_hdl_t
*)otp
->ot_handle
;
1858 expected
= DDI_FM_ERR_PEEK
;
1859 else if (hp
->ah_acc
.devacc_attr_access
==
1861 expected
= DDI_FM_ERR_EXPECTED
;
1864 } else if (curthread
->t_lofault
) {
1865 aflt
->flt_prot
= AFLT_PROT_COPY
;
1867 rp
->r_pc
= curthread
->t_lofault
;
1868 rp
->r_npc
= rp
->r_pc
+ 4;
1874 * If we're in user mode or we're doing a protected copy, we either
1875 * want the ASTON code below to send a signal to the user process
1876 * or we want to panic if aft_panic is set.
1878 * If we're in privileged mode and we're not doing a copy, then we
1879 * need to check if we've trampolined. If we haven't trampolined,
1882 if (!aflt
->flt_priv
|| aflt
->flt_prot
== AFLT_PROT_COPY
) {
1884 ((C_AFSR_ASYNC_ERRS
| C_AFSR_EXT_ASYNC_ERRS
) &
1885 ~(C_AFSR_BERR
| C_AFSR_TO
)))
1886 aflt
->flt_panic
|= aft_panic
;
1887 } else if (!trampolined
) {
1888 aflt
->flt_panic
= 1;
1892 * If we've trampolined due to a privileged TO or BERR, or if an
1893 * unprivileged TO or BERR occurred, we don't want to enqueue an
1894 * event for that TO or BERR. Queue all other events (if any) besides
1895 * the TO/BERR. Since we may not be enqueing any events, we need to
1896 * ignore the number of events queued. If we haven't trampolined due
1897 * to a TO or BERR, just enqueue events normally.
1899 log_afsr
= t_afsr_errs
;
1901 log_afsr
&= ~(C_AFSR_TO
| C_AFSR_BERR
);
1902 } else if (!aflt
->flt_priv
) {
1904 * User mode, suppress messages if
1905 * cpu_berr_to_verbose is not set.
1907 if (!cpu_berr_to_verbose
)
1908 log_afsr
&= ~(C_AFSR_TO
| C_AFSR_BERR
);
1912 * Log any errors that occurred
1915 ((C_AFSR_ALL_ERRS
| C_AFSR_EXT_ALL_ERRS
) & ~C_AFSR_ME
)) &&
1916 cpu_queue_events(&ch_flt
, pr_reason
, log_afsr
, clop
) == 0) ||
1917 (t_afsr_errs
& (C_AFSR_ASYNC_ERRS
| C_AFSR_EXT_ASYNC_ERRS
)) == 0) {
1918 ch_flt
.flt_type
= CPU_INV_AFSR
;
1919 cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_INVALID_AFSR
,
1920 (void *)&ch_flt
, sizeof (ch_async_flt_t
), ue_queue
,
1925 * Zero out + invalidate CPU logout.
1928 bzero(clop
, sizeof (ch_cpu_logout_t
));
1929 clop
->clo_data
.chd_afar
= LOGOUT_INVALID
;
1932 #if defined(JALAPENO) || defined(SERRANO)
1934 * UE/RUE/BERR/TO: Call our bus nexus friends to check for
1935 * IO errors that may have resulted in this trap.
1937 if (t_afsr
& (C_AFSR_UE
|C_AFSR_RUE
|C_AFSR_TO
|C_AFSR_BERR
)) {
1938 cpu_run_bus_error_handlers(aflt
, expected
);
1942 * UE/RUE: If UE or RUE is in memory, we need to flush the bad
1943 * line from the Ecache. We also need to query the bus nexus for
1944 * fatal errors. Attempts to do diagnostic read on caches may
1945 * introduce more errors (especially when the module is bad).
1947 if (t_afsr
& (C_AFSR_UE
|C_AFSR_RUE
)) {
1949 * Ask our bus nexus friends if they have any fatal errors. If
1950 * so, they will log appropriate error messages.
1952 if (bus_func_invoke(BF_TYPE_UE
) == BF_FATAL
)
1953 aflt
->flt_panic
= 1;
1956 * We got a UE or RUE and are panicking, save the fault PA in
1957 * a known location so that the platform specific panic code
1958 * can check for copyback errors.
1960 if (aflt
->flt_panic
&& cpu_flt_in_memory(&ch_flt
, C_AFSR_UE
)) {
1966 * Flush Ecache line or entire Ecache
1968 if (t_afsr
& (C_AFSR_UE
| C_AFSR_RUE
| C_AFSR_EDU
| C_AFSR_BERR
))
1969 cpu_error_ecache_flush(&ch_flt
);
1970 #else /* JALAPENO || SERRANO */
1972 * UE/BERR/TO: Call our bus nexus friends to check for
1973 * IO errors that may have resulted in this trap.
1975 if (t_afsr
& (C_AFSR_UE
|C_AFSR_TO
|C_AFSR_BERR
)) {
1976 cpu_run_bus_error_handlers(aflt
, expected
);
1980 * UE: If the UE is in memory, we need to flush the bad
1981 * line from the Ecache. We also need to query the bus nexus for
1982 * fatal errors. Attempts to do diagnostic read on caches may
1983 * introduce more errors (especially when the module is bad).
1985 if (t_afsr
& C_AFSR_UE
) {
1987 * Ask our legacy bus nexus friends if they have any fatal
1988 * errors. If so, they will log appropriate error messages.
1990 if (bus_func_invoke(BF_TYPE_UE
) == BF_FATAL
)
1991 aflt
->flt_panic
= 1;
1994 * We got a UE and are panicking, save the fault PA in a known
1995 * location so that the platform specific panic code can check
1996 * for copyback errors.
1998 if (aflt
->flt_panic
&& cpu_flt_in_memory(&ch_flt
, C_AFSR_UE
)) {
2004 * Flush Ecache line or entire Ecache
2007 (C_AFSR_UE | C_AFSR_EDU | C_AFSR_BERR | C_AFSR_L3_EDU))
2008 cpu_error_ecache_flush(&ch_flt);
2009 #endif /* JALAPENO || SERRANO */
2012 * We carefully re-enable NCEEN and CEEN and then check if any deferred
2013 * or disrupting errors have happened. We do this because if a
2014 * deferred or disrupting error had occurred with NCEEN/CEEN off, the
2015 * trap will not be taken when NCEEN/CEEN is re-enabled. Note that
2016 * CEEN works differently on Cheetah than on Spitfire. Also, we enable
2017 * NCEEN/CEEN *before* checking the AFSR to avoid the small window of a
2018 * deferred or disrupting error happening between checking the AFSR and
2019 * enabling NCEEN/CEEN.
2021 * Note: CEEN reenabled only if it was on when trap taken.
2023 set_error_enable(get_error_enable() | (EN_REG_NCEEN | ceen));
2024 if (clear_errors(&ch_flt)) {
2026 * Check for secondary errors, and avoid panicking if we
2029 if (cpu_check_secondary_errors(&ch_flt, t_afsr_errs,
2031 aflt->flt_panic |= ((ch_flt.afsr_errs &
2032 (C_AFSR_ASYNC_ERRS | C_AFSR_EXT_ASYNC_ERRS)) != 0);
2034 (void) cpu_queue_events(&ch_flt, pr_reason, ch_flt.afsr_errs,
2039 * Panic here if aflt->flt_panic has been set. Enqueued errors will
2040 * be logged as part of the panic flow.
2042 if (aflt->flt_panic)
2043 fm_panic("%sError(s)", pr_reason);
2046 * If we queued an error and we are going to return from the trap and
2047 * the error was in user mode or inside of a copy routine, set AST flag
2048 * so the queue will be drained before returning to user mode. The
2049 * AST processing will also act on our failure policy.
2051 if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY) {
2055 (C_AFSR_ASYNC_ERRS | C_AFSR_EXT_ASYNC_ERRS &
2056 ~(C_AFSR_BERR | C_AFSR_TO)))
2057 pcb_flag |= ASYNC_HWERR;
2059 if (t_afsr & C_AFSR_BERR)
2060 pcb_flag |= ASYNC_BERR;
2062 if (t_afsr & C_AFSR_TO)
2063 pcb_flag |= ASYNC_BTO;
2065 ttolwp(curthread)->lwp_pcb.pcb_flags |= pcb_flag;
2070 #if defined(CPU_IMP_L1_CACHE_PARITY)
2072 * Handling of data and instruction parity errors (traps 0x71, 0x72).
2074 * For Panther, P$ data parity errors during floating point load hits
2075 * are also detected (reported as TT 0x71) and handled by this trap
2078 * AFSR/AFAR are not set for parity errors, only TPC (a virtual address)
2083 cpu_parity_error(struct regs
*rp
, uint_t flags
, caddr_t tpc
)
2085 ch_async_flt_t ch_flt
;
2086 struct async_flt
*aflt
;
2087 uchar_t tl
= ((flags
& CH_ERR_TL
) != 0);
2088 uchar_t iparity
= ((flags
& CH_ERR_IPE
) != 0);
2089 uchar_t panic
= ((flags
& CH_ERR_PANIC
) != 0);
2091 int index
, way
, word
;
2092 ch_dc_data_t tmp_dcp
;
2093 int dc_set_size
= dcache_size
/ CH_DCACHE_NWAY
;
2094 uint64_t parity_bits
, pbits
;
2095 /* parity_bits_popc[] gives the parity (number of set bits mod 2) of a 2-bit value */
2096 static int parity_bits_popc
[] = { 0, 1, 1, 0 };
2100 * For icache parity errors the fault address is the trap PC.
2101 * For dcache/pcache parity errors the instruction would have to
2102 * be decoded to determine the address and that isn't possible
2105 bzero(&ch_flt, sizeof (ch_async_flt_t));
2106 aflt = (struct async_flt *)&ch_flt;
2107 aflt->flt_id = gethrtime_waitfree();
2108 aflt->flt_bus_id = getprocessorid();
2109 aflt->flt_inst = CPU->cpu_id;
2111 aflt->flt_addr = iparity ? (uint64_t)tpc : AFLT_INV_ADDR;
2112 aflt->flt_prot = AFLT_PROT_NONE;
2113 aflt->flt_class = CPU_FAULT;
2114 aflt->flt_priv = (tl || (rp->r_tstate & TSTATE_PRIV)) ? 1 : 0;
2116 aflt->flt_panic = panic;
2117 aflt->flt_status = iparity ? ECC_IP_TRAP : ECC_DP_TRAP;
2118 ch_flt.flt_type = iparity ? CPU_IC_PARITY : CPU_DC_PARITY;
2121 cpu_icache_parity_info(&ch_flt
);
2122 if (ch_flt
.parity_data
.ipe
.cpl_off
!= -1)
2123 error_class
= FM_EREPORT_CPU_USIII_IDSPE
;
2124 else if (ch_flt
.parity_data
.ipe
.cpl_way
!= -1)
2125 error_class
= FM_EREPORT_CPU_USIII_ITSPE
;
2127 error_class
= FM_EREPORT_CPU_USIII_IPE
;
2128 aflt
->flt_payload
= FM_EREPORT_PAYLOAD_ICACHE_PE
;
2130 cpu_dcache_parity_info(&ch_flt
);
2131 if (ch_flt
.parity_data
.dpe
.cpl_off
!= -1) {
2133 * If not at TL 0 and running on a Jalapeno processor,
2134 * then process as a true ddspe. A true
2135 * ddspe error can only occur if the way == 0
2137 way
= ch_flt
.parity_data
.dpe
.cpl_way
;
2138 if ((tl
== 0) && (way
!= 0) &&
2139 IS_JALAPENO(cpunodes
[CPU
->cpu_id
].implementation
)) {
2140 for (index
= 0; index
< dc_set_size
;
2141 index
+= dcache_linesize
) {
2142 get_dcache_dtag(index
+ way
*
2144 (uint64_t *)&tmp_dcp
);
2146 * Check data array for even parity.
2147 * The 8 parity bits are grouped into
2148 * 4 pairs each of which covers a 64-bit
2149 * word. The endianness is reversed
2150 * -- the low-order parity bits cover
2151 * the high-order data words.
2153 parity_bits
= tmp_dcp
.dc_utag
>> 8;
2154 for (word
= 0; word
< 4; word
++) {
2155 pbits
= (parity_bits
>>
2156 (6 - word
* 2)) & 3;
2158 tmp_dcp
.dc_data
[word
]) +
2159 parity_bits_popc
[pbits
]) &
2160 1) && (tmp_dcp
.dc_tag
&
2163 correct_dcache_parity(
2166 if (cache_boot_state
&
2177 } /* (tl == 0) && (way != 0) && IS JALAPENO */
2178 error_class
= FM_EREPORT_CPU_USIII_DDSPE
;
2179 } else if (ch_flt
.parity_data
.dpe
.cpl_way
!= -1)
2180 error_class
= FM_EREPORT_CPU_USIII_DTSPE
;
2182 error_class
= FM_EREPORT_CPU_USIII_DPE
;
2183 aflt
->flt_payload
= FM_EREPORT_PAYLOAD_DCACHE_PE
;
2185 * For panther we also need to check the P$ for parity errors.
2187 if (IS_PANTHER(cpunodes
[CPU
->cpu_id
].implementation
)) {
2188 cpu_pcache_parity_info(&ch_flt
);
2189 if (ch_flt
.parity_data
.dpe
.cpl_cache
== CPU_PC_PARITY
) {
2190 error_class
= FM_EREPORT_CPU_USIII_PDSPE
;
2192 FM_EREPORT_PAYLOAD_PCACHE_PE
;
2197 cpu_errorq_dispatch(error_class
, (void *)&ch_flt
,
2198 sizeof (ch_async_flt_t
), ue_queue
, aflt
->flt_panic
);
2202 * Invalidate entire I$.
2203 * This is required due to the use of diagnostic ASI
2204 * accesses that may result in a loss of I$ coherency.
2206 if (cache_boot_state
& DCU_IC
) {
2210 * According to section P.3.1 of the Panther PRM, we
2211 * need to do a little more for recovery on those
2212 * CPUs after encountering an I$ parity error.
2214 if (IS_PANTHER(cpunodes
[CPU
->cpu_id
].implementation
)) {
2216 correct_dcache_parity(dcache_size
,
2222 * Since the valid bit is ignored when checking parity the
2223 * D$ data and tag must also be corrected. Set D$ data bits
2224 * to zero and set utag to 0, 1, 2, 3.
2226 correct_dcache_parity(dcache_size
, dcache_linesize
);
2229 * According to section P.3.3 of the Panther PRM, we
2230 * need to do a little more for recovery on those
2231 * CPUs after encountering a D$ or P$ parity error.
2233 * As far as clearing P$ parity errors, it is enough to
2234 * simply invalidate all entries in the P$ since P$ parity
2235 * error traps are only generated for floating point load
2238 if (IS_PANTHER(cpunodes
[CPU
->cpu_id
].implementation
)) {
2246 * Invalidate entire D$ if it was enabled.
2247 * This is done to avoid stale data in the D$ which might
2248 * occur with the D$ disabled and the trap handler doing
2249 * stores affecting lines already in the D$.
2251 if (cache_boot_state
& DCU_DC
) {
2256 * Restore caches to their bootup state.
2258 set_dcu(get_dcu() | cache_boot_state
);
2261 * Panic here if aflt->flt_panic has been set. Enqueued errors will
2262 * be logged as part of the panic flow.
2264 if (aflt
->flt_panic
)
2265 fm_panic("%sError(s)", iparity
? "IPE " : "DPE ");
2268 * If this error occurred at TL>0 then flush the E$ here to reduce
2269 * the chance of getting an unrecoverable Fast ECC error. This
2270 * flush will evict the part of the parity trap handler that is run
2279 * On an I$ parity error, mark the appropriate entries in the ch_async_flt_t
2280 * to indicate which portions of the captured data should be in the ereport.
2283 cpu_async_log_ic_parity_err(ch_async_flt_t
*ch_flt
)
2285 int way
= ch_flt
->parity_data
.ipe
.cpl_way
;
2286 int offset
= ch_flt
->parity_data
.ipe
.cpl_off
;
2288 struct async_flt
*aflt
= (struct async_flt
*)ch_flt
;
2291 if ((offset
!= -1) || (way
!= -1)) {
2293 * Parity error in I$ tag or data
2295 tag_index
= ch_flt
->parity_data
.ipe
.cpl_ic
[way
].ic_idx
;
2296 if (IS_PANTHER(cpunodes
[aflt
->flt_inst
].implementation
))
2297 ch_flt
->parity_data
.ipe
.cpl_ic
[way
].ic_way
=
2298 PN_ICIDX_TO_WAY(tag_index
);
2300 ch_flt
->parity_data
.ipe
.cpl_ic
[way
].ic_way
=
2301 CH_ICIDX_TO_WAY(tag_index
);
2302 ch_flt
->parity_data
.ipe
.cpl_ic
[way
].ic_logflag
=
2306 * Parity error was not identified.
2307 * Log tags and data for all ways.
2309 for (way
= 0; way
< CH_ICACHE_NWAY
; way
++) {
2310 tag_index
= ch_flt
->parity_data
.ipe
.cpl_ic
[way
].ic_idx
;
2311 if (IS_PANTHER(cpunodes
[aflt
->flt_inst
].implementation
))
2312 ch_flt
->parity_data
.ipe
.cpl_ic
[way
].ic_way
=
2313 PN_ICIDX_TO_WAY(tag_index
);
2315 ch_flt
->parity_data
.ipe
.cpl_ic
[way
].ic_way
=
2316 CH_ICIDX_TO_WAY(tag_index
);
2317 ch_flt
->parity_data
.ipe
.cpl_ic
[way
].ic_logflag
=
2324 * On a D$ parity error, mark the appropriate entries in the ch_async_flt_t
2325 * to indicate which portions of the captured data should be in the ereport.
2328 cpu_async_log_dc_parity_err(ch_async_flt_t
*ch_flt
)
2330 int way
= ch_flt
->parity_data
.dpe
.cpl_way
;
2331 int offset
= ch_flt
->parity_data
.dpe
.cpl_off
;
2336 * Parity error in D$ or P$ data array.
2338 * First check to see whether the parity error is in D$ or P$
2339 * since P$ data parity errors are reported in Panther using
2342 if (ch_flt
->parity_data
.dpe
.cpl_cache
== CPU_PC_PARITY
) {
2343 tag_index
= ch_flt
->parity_data
.dpe
.cpl_pc
[way
].pc_idx
;
2344 ch_flt
->parity_data
.dpe
.cpl_pc
[way
].pc_way
=
2345 CH_PCIDX_TO_WAY(tag_index
);
2346 ch_flt
->parity_data
.dpe
.cpl_pc
[way
].pc_logflag
=
2349 tag_index
= ch_flt
->parity_data
.dpe
.cpl_dc
[way
].dc_idx
;
2350 ch_flt
->parity_data
.dpe
.cpl_dc
[way
].dc_way
=
2351 CH_DCIDX_TO_WAY(tag_index
);
2352 ch_flt
->parity_data
.dpe
.cpl_dc
[way
].dc_logflag
=
2355 } else if (way
!= -1) {
2357 * Parity error in D$ tag.
2359 tag_index
= ch_flt
->parity_data
.dpe
.cpl_dc
[way
].dc_idx
;
2360 ch_flt
->parity_data
.dpe
.cpl_dc
[way
].dc_way
=
2361 CH_DCIDX_TO_WAY(tag_index
);
2362 ch_flt
->parity_data
.dpe
.cpl_dc
[way
].dc_logflag
=
2366 #endif /* CPU_IMP_L1_CACHE_PARITY */
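/*
 * Editor's note: an illustrative, self-contained sketch (not part of the
 * original driver) of the D$ data parity check described in
 * cpu_parity_error() above for non-Panther cpus: the eight parity bits in
 * the utag form four 2-bit pairs, the low-order pairs cover the high-order
 * data words, and the popcount of each word plus the parity of its pair
 * must be even.
 */
#if 0	/* illustrative sketch only */
/* parity (popcount mod 2) of a 2-bit value: 00->0, 01->1, 10->1, 11->0 */
static const int example_pair_parity[] = { 0, 1, 1, 0 };

/* returns nonzero if data word 'word' (0..3) fails the even-parity check */
static int
example_dc_data_parity_bad(const uint64_t dc_data[4], uint64_t dc_utag,
    int word)
{
	uint64_t parity_bits = dc_utag >> 8;	/* 8 parity bits, 4 pairs */
	uint64_t pbits;

	/* low-order pairs cover high-order words, hence the (6 - word * 2) */
	pbits = (parity_bits >> (6 - word * 2)) & 3;

	return ((popc64(dc_data[word]) + example_pair_parity[pbits]) & 1);
}
#endif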
2369 * The cpu_async_log_err() function is called via the [uc]e_drain() function to
2370 * post-process CPU events that are dequeued. As such, it can be invoked
2371 * from softint context, from AST processing in the trap() flow, or from the
2372 * panic flow. We decode the CPU-specific data, and take appropriate actions.
2373 * Historically this entry point was used to log the actual cmn_err(9F) text;
2374 * now with FMA it is used to prepare 'flt' to be converted into an ereport.
2375 * With FMA this function now also returns a flag which indicates to the
2376 * caller whether the ereport should be posted (1) or suppressed (0).
2379 cpu_async_log_err(void *flt
, errorq_elem_t
*eqep
)
2381 ch_async_flt_t
*ch_flt
= (ch_async_flt_t
*)flt
;
2382 struct async_flt
*aflt
= (struct async_flt
*)flt
;
2384 extern void memscrub_induced_error(void);
2386 switch (ch_flt
->flt_type
) {
2389 * If it is a disrupting trap and the AFSR is zero, then
2390 * the event has probably already been noted. Do not post
2393 if ((aflt
->flt_status
& ECC_C_TRAP
) &&
2394 (!(aflt
->flt_stat
& C_AFSR_MASK
)))
2404 case CPU_UE_ECACHE_RETIRE
:
2406 cpu_page_retire(ch_flt
);
2410 * Cases where we may want to suppress logging or perform
2411 * extended diagnostics.
2416 * We want to skip logging and further classification
2417 * only if ALL the following conditions are true:
2419 * 1. There is only one error
2420 * 2. That error is a correctable memory error
2421 * 3. The error is caused by the memory scrubber (in
2422 * which case the error will have occurred under
2423 * on_trap protection)
2424 * 4. The error is on a retired page
2426 * Note: AFLT_PROT_EC is used in places other than the memory
2427 * scrubber. However, none of those errors should occur
2428 * on a retired page.
2430 if ((ch_flt
->afsr_errs
&
2431 (C_AFSR_ALL_ERRS
| C_AFSR_EXT_ALL_ERRS
)) == C_AFSR_CE
&&
2432 aflt
->flt_prot
== AFLT_PROT_EC
) {
2434 if (page_retire_check(aflt
->flt_addr
, NULL
) == 0) {
2435 if (ch_flt
->flt_trapped_ce
& CE_CEEN_DEFER
) {
2438 * Since we're skipping logging, we'll need
2439 * to schedule the re-enabling of CEEN
2441 (void) timeout(cpu_delayed_check_ce_errors
,
2442 (void *)(uintptr_t)aflt
->flt_inst
,
2443 drv_usectohz((clock_t)cpu_ceen_delay_secs
2448 * Inform memscrubber - scrubbing induced
2449 * CE on a retired page.
2451 memscrub_induced_error();
2457 * Perform/schedule further classification actions, but
2458 * only if the page is healthy (we don't want bad
2459 * pages inducing too much diagnostic activity). If we could
2460 * not find a page pointer then we also skip this. If
2461 * ce_scrub_xdiag_recirc returns nonzero then it has chosen
2462 * to copy and recirculate the event (for further diagnostics)
2463 * and we should not proceed to log it here.
2465 * This must be the last step here before the cpu_log_err()
2466 * below - if an event recirculates cpu_ce_log_err() will
2467 * not call the current function but just proceed directly
2468 * to cpu_ereport_post after the cpu_log_err() avoided below.
2470 * Note: Check cpu_impl_async_log_err if changing this
2472 if (page_retire_check(aflt
->flt_addr
, &errors
) == EINVAL
) {
2473 CE_XDIAG_SETSKIPCODE(aflt
->flt_disp
,
2474 CE_XDIAG_SKIP_NOPP
);
2476 if (errors
!= PR_OK
) {
2477 CE_XDIAG_SETSKIPCODE(aflt
->flt_disp
,
2478 CE_XDIAG_SKIP_PAGEDET
);
2479 } else if (ce_scrub_xdiag_recirc(aflt
, ce_queue
, eqep
,
2480 offsetof(ch_async_flt_t
, cmn_asyncflt
))) {
2487 * Cases where we just want to report the error and continue.
2497 * Cases where we want to fall through to handle panicking.
2501 * We want to skip logging in the same conditions as the
2502 * CE case. In addition, we want to make sure we're not
2505 if (!panicstr
&& (ch_flt
->afsr_errs
&
2506 (C_AFSR_ALL_ERRS
| C_AFSR_EXT_ALL_ERRS
)) == C_AFSR_UE
&&
2507 aflt
->flt_prot
== AFLT_PROT_EC
) {
2508 if (page_retire_check(aflt
->flt_addr
, NULL
) == 0) {
2509 /* Zero the address to clear the error */
2510 softcall(ecc_page_zero
, (void *)aflt
->flt_addr
);
2512 * Inform memscrubber - scrubbing induced
2513 * UE on a retired page.
2515 memscrub_induced_error();
2524 * If the us3_common.c code doesn't know the flt_type, it may
2525 * be an implementation-specific code. Call into the impldep
2526 * backend to find out what to do: if it tells us to continue,
2527 * break and handle as if falling through from a UE; if not,
2528 * the impldep backend has handled the error and we're done.
2530 switch (cpu_impl_async_log_err(flt
, eqep
)) {
2531 case CH_ASYNC_LOG_DONE
:
2533 case CH_ASYNC_LOG_RECIRC
:
2535 case CH_ASYNC_LOG_CONTINUE
:
2536 break; /* continue on to handle UE-like error */
2538 cmn_err(CE_WARN
, "discarding error 0x%p with "
2539 "invalid fault type (0x%x)",
2540 (void *)aflt
, ch_flt
->flt_type
);
2545 /* ... fall through from the UE case */
2547 if (aflt
->flt_addr
!= AFLT_INV_ADDR
&& aflt
->flt_in_memory
) {
2549 cpu_page_retire(ch_flt
);
2552 * Clear UEs on panic so that we don't
2553 * get haunted by them during panic or
2556 cpu_clearphys(aflt
);
2557 (void) clear_errors(NULL
);
2565 * Retire the bad page that may contain the flushed error.
2568 cpu_page_retire(ch_async_flt_t
*ch_flt
)
2570 struct async_flt
*aflt
= (struct async_flt
*)ch_flt
;
2571 (void) page_retire(aflt
->flt_addr
, PR_UE
);
2575 * Return true if the error specified in the AFSR indicates
2576 * an E$ data error (L2$ for Cheetah/Cheetah+/Jaguar, L3$
2577 * for Panther, none for Jalapeno/Serrano).
2581 cpu_error_is_ecache_data(int cpuid
, uint64_t t_afsr
)
2583 #if defined(JALAPENO) || defined(SERRANO)
2585 #elif defined(CHEETAH_PLUS)
2586 if (IS_PANTHER(cpunodes
[cpuid
].implementation
))
2587 return ((t_afsr
& C_AFSR_EXT_L3_DATA_ERRS
) != 0);
2588 return ((t_afsr
& C_AFSR_EC_DATA_ERRS
) != 0);
2589 #else /* CHEETAH_PLUS */
2590 return ((t_afsr
& C_AFSR_EC_DATA_ERRS
) != 0);
2595 * The cpu_log_err() function is called by cpu_async_log_err() to perform the
2596 * generic event post-processing for correctable and uncorrectable memory,
2597 * E$, and MTag errors. Historically this entry point was used to log bits of
2598 * common cmn_err(9F) text; now with FMA it is used to prepare 'flt' to be
2599 * converted into an ereport. In addition, it transmits the error to any
2600 * platform-specific service-processor FRU logging routines, if available.
2603 cpu_log_err(struct async_flt
*aflt
)
2605 char unum
[UNUM_NAMLEN
];
2606 int synd_status
, synd_code
, afar_status
;
2607 ch_async_flt_t
*ch_flt
= (ch_async_flt_t
*)aflt
;
2609 if (cpu_error_is_ecache_data(aflt
->flt_inst
, ch_flt
->flt_bit
))
2610 aflt
->flt_status
|= ECC_ECACHE
;
2612 aflt
->flt_status
&= ~ECC_ECACHE
;
2614 * Determine syndrome status.
2616 synd_status
= afsr_to_synd_status(aflt
->flt_inst
,
2617 ch_flt
->afsr_errs
, ch_flt
->flt_bit
);
2620 * Determine afar status.
2622 if (pf_is_memory(aflt
->flt_addr
>> MMU_PAGESHIFT
))
2623 afar_status
= afsr_to_afar_status(ch_flt
->afsr_errs
,
2626 afar_status
= AFLT_STAT_INVALID
;
2628 synd_code
= synd_to_synd_code(synd_status
,
2629 aflt
->flt_synd
, ch_flt
->flt_bit
);
2632 * If afar status is not invalid do a unum lookup.
2634 if (afar_status
!= AFLT_STAT_INVALID
) {
2635 (void) cpu_get_mem_unum_synd(synd_code
, aflt
, unum
);
2641 * Do not send the fruid message (plat_ecc_error_data_t)
2642 * to the SC if it can handle the enhanced error information
2643 * (plat_ecc_error2_data_t) or when the tunable
2644 * ecc_log_fruid_enable is set to 0.
2647 if (&plat_ecc_capability_sc_get
&&
2648 plat_ecc_capability_sc_get(PLAT_ECC_ERROR_MESSAGE
)) {
2649 if (&plat_log_fruid_error
)
2650 plat_log_fruid_error(synd_code
, aflt
, unum
,
2654 if (aflt
->flt_func
!= NULL
)
2655 aflt
->flt_func(aflt
, unum
);
2657 if (afar_status
!= AFLT_STAT_INVALID
)
2658 cpu_log_diag_info(ch_flt
);
2661 * If we have a CEEN error, we do not re-enable CEEN until after
2662 * we exit the trap handler. Otherwise, another error may
2663 * occur causing the handler to be entered recursively.
2664 * We set a timeout to trigger in cpu_ceen_delay_secs seconds,
2665 * to try and ensure that the CPU makes progress in the face
2668 if (ch_flt
->flt_trapped_ce
& CE_CEEN_DEFER
) {
2669 (void) timeout(cpu_delayed_check_ce_errors
,
2670 (void *)(uintptr_t)aflt
->flt_inst
,
2671 drv_usectohz((clock_t)cpu_ceen_delay_secs
* MICROSEC
));
2676 * Invoked by error_init() early in startup and therefore before
2677 * startup_errorq() is called to drain any error Q -
2685 * start_other_cpus()
2687 * The purpose of this routine is to create error-related taskqs. Taskqs
2688 * are used for this purpose because cpu_lock can't be grabbed from interrupt
2692 cpu_error_init(int items
)
2695 * Create taskq(s) to reenable CE
2697 ch_check_ce_tq
= taskq_create("cheetah_check_ce", 1, minclsyspri
,
2698 items
, items
, TASKQ_PREPOPULATE
);
2702 cpu_ce_log_err(struct async_flt
*aflt
, errorq_elem_t
*eqep
)
2704 char unum
[UNUM_NAMLEN
];
2707 switch (aflt
->flt_class
) {
2709 cpu_ereport_init(aflt
);
2710 if (cpu_async_log_err(aflt
, eqep
))
2711 cpu_ereport_post(aflt
);
2715 if (aflt
->flt_func
!= NULL
) {
2716 (void) cpu_get_mem_unum_aflt(AFLT_STAT_VALID
, aflt
,
2717 unum
, UNUM_NAMLEN
, &len
);
2718 aflt
->flt_func(aflt
, unum
);
2722 case RECIRC_CPU_FAULT
:
2723 aflt
->flt_class
= CPU_FAULT
;
2725 cpu_ereport_post(aflt
);
2728 case RECIRC_BUS_FAULT
:
2729 ASSERT(aflt
->flt_class
!= RECIRC_BUS_FAULT
);
2732 cmn_err(CE_WARN
, "discarding CE error 0x%p with invalid "
2733 "fault class (0x%x)", (void *)aflt
, aflt
->flt_class
);
2739 * Scrub and classify a CE. This function must not modify the
2740 * fault structure passed to it but instead should return the classification
2745 cpu_ce_scrub_mem_err_common(struct async_flt
*ecc
, boolean_t logout_tried
)
2747 uchar_t disp
= CE_XDIAG_EXTALG
;
2750 ch_cpu_logout_t
*clop
;
2753 * Clear CEEN. CPU CE TL > 0 trap handling will already have done
2754 * this, but our other callers have not. Disable preemption to
2755 * avoid CPU migration so that we restore CEEN on the correct
2758 * CEEN is cleared so that further CEs that our instruction and
2759 * data footprint induce do not cause us to either creep down
2760 * kernel stack to the point of overflow, or do so much CE
2761 * notification as to make little real forward progress.
2763 * NCEEN must not be cleared. However it is possible that
2764 * our accesses to the flt_addr may provoke a bus error or timeout
2765 * if the offending address has just been unconfigured as part of
2766 * a DR action. So we must operate under on_trap protection.
2769 orig_err
= get_error_enable();
2770 if (orig_err
& EN_REG_CEEN
)
2771 set_error_enable(orig_err
& ~EN_REG_CEEN
);
2774 * Our classification algorithm includes the line state before
2775 * the scrub; we'd like this captured after the detection and
2776 * before the algorithm below - the earlier the better.
2778 * If we've come from a cpu CE trap then this info already exists
2779 * in the cpu logout area.
2781 * For a CE detected by memscrub for which there was no trap
2782 * (running with CEEN off) cpu_log_and_clear_ce has called
2783 * cpu_ce_delayed_ec_logout to capture some cache data, and
2784 * marked the fault structure as incomplete as a flag to later
2787 * If called directly from an IO detected CE there has been
2788 * no line data capture. In this case we logout to the cpu logout
2789 * area - that's appropriate since it's the cpu cache data we need
2790 * for classification. We thus borrow the cpu logout area for a
2791 * short time, and cpu_ce_delayed_ec_logout will mark it as busy in
2792 * this time (we will invalidate it again below).
2794 * If called from the partner check xcall handler then this cpu
2795 * (the partner) has not necessarily experienced a CE at this
2796 * address. But we want to capture line state before its scrub
2797 * attempt since we use that in our classification.
2799 if (logout_tried
== B_FALSE
) {
2800 if (!cpu_ce_delayed_ec_logout(ecc
->flt_addr
))
2801 disp
|= CE_XDIAG_NOLOGOUT
;
2805 * Scrub memory, then check AFSR for errors. The AFAR we scrub may
2806 * no longer be valid (if DR'd since the initial event) so we
2807 * perform this scrub under on_trap protection. If this access is
2808 * ok then further accesses below will also be ok - DR cannot
2809 * proceed while this thread is active (preemption is disabled);
2810 * to be safe we'll nonetheless use on_trap again below.
2812 if (!on_trap(&otd
, OT_DATA_ACCESS
)) {
2816 if (orig_err
& EN_REG_CEEN
)
2817 set_error_enable(orig_err
);
2824 * Did the casx read of the scrub log a CE that matches the AFAR?
2825 * Note that it's quite possible that the read sourced the data from
2829 disp
|= CE_XDIAG_CE1
;
2832 * Read the data again. This time the read is very likely to
2833 * come from memory since the scrub induced a writeback to memory.
2835 if (!on_trap(&otd
, OT_DATA_ACCESS
)) {
2836 (void) lddphys(P2ALIGN(ecc
->flt_addr
, 8));
2839 if (orig_err
& EN_REG_CEEN
)
2840 set_error_enable(orig_err
);
2846 /* Did that read induce a CE that matches the AFAR? */
2848 disp
|= CE_XDIAG_CE2
;
2851 * Look at the logout information and record whether we found the
2852 * line in l2/l3 cache. For Panther we are interested in whether
2853 * we found it in either cache (it won't reside in both but
2854 * it is possible to read it that way given the moving target).
2856 clop
= CPU_PRIVATE(CPU
) ? CPU_PRIVATE_PTR(CPU
, chpr_cecc_logout
) : NULL
;
2857 if (!(disp
& CE_XDIAG_NOLOGOUT
) && clop
&&
2858 clop
->clo_data
.chd_afar
!= LOGOUT_INVALID
) {
2865 * If hit is nonzero then a match was found and hit will
2866 * be one greater than the index which hit. For Panther we
2867 * also need to pay attention to level to see which of l2$ or
2870 hit
= cpu_matching_ecache_line(ecc
->flt_addr
, &clop
->clo_data
,
2875 disp
|= CE_XDIAG_AFARMATCH
;
2877 if (IS_PANTHER(cpunodes
[CPU
->cpu_id
].implementation
)) {
2879 ecp
= &clop
->clo_data
.chd_l2_data
[hit
];
2881 ecp
= &clop
->clo_data
.chd_ec_data
[hit
];
2884 ecp
= &clop
->clo_data
.chd_ec_data
[hit
];
2886 totalsize
= cpunodes
[CPU
->cpu_id
].ecache_size
;
2887 state
= cpu_ectag_pa_to_subblk_state(totalsize
,
2888 ecc
->flt_addr
, ecp
->ec_tag
);
2891 * Cheetah variants use different state encodings -
2892 * the CH_ECSTATE_* defines vary depending on the
2893 * module we're compiled for. Translate into our
2894 * one true version. Conflate Owner-Shared state
2895 * of SSM mode with Owner as victimisation of such
2896 * lines may cause a writeback.
2899 case CH_ECSTATE_MOD
:
2903 case CH_ECSTATE_OWN
:
2904 case CH_ECSTATE_OWS
:
2908 case CH_ECSTATE_EXL
:
2912 case CH_ECSTATE_SHR
:
2923 * If we initiated the delayed logout then we are responsible
2924 * for invalidating the logout area.
2926 if (logout_tried
== B_FALSE
) {
2927 bzero(clop, sizeof (ch_cpu_logout_t));
2928 clop->clo_data.chd_afar = LOGOUT_INVALID;
2933 * Re-enable CEEN if we turned it off.
2935 if (orig_err & EN_REG_CEEN)
2936 set_error_enable(orig_err);
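/*
 * Editor's note: an illustrative sketch (not part of the original driver)
 * of the CEEN discipline used by cpu_ce_scrub_mem_err_common() above:
 * CEEN is cleared only if it was on, NCEEN is left alone, the physical
 * access runs under on_trap() protection in case the page has been DR'd
 * out, and CEEN is restored on the way out.  Whether preemption is
 * disabled here or by the caller is an assumption of this sketch.
 */
#if 0	/* illustrative sketch only */
static void
example_scrub_with_ceen_masked(uint64_t pa, int ec_set_size)
{
	uint64_t orig_err;
	on_trap_data_t otd;

	kpreempt_disable();			/* stay on this cpu */

	orig_err = get_error_enable();
	if (orig_err & EN_REG_CEEN)		/* mask CE traps, never NCEEN */
		set_error_enable(orig_err & ~EN_REG_CEEN);

	if (!on_trap(&otd, OT_DATA_ACCESS)) {
		/* the access may fault if the address was unconfigured */
		scrubphys(pa, ec_set_size);
	}
	no_trap();

	if (orig_err & EN_REG_CEEN)		/* restore CEEN if it was on */
		set_error_enable(orig_err);

	kpreempt_enable();
}
#endif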
2943 * Scrub a correctable memory error and collect data for classification
2944 * of CE type. This function is called in the detection path, ie tl0 handling
2945 * of a correctable error trap (cpus) or interrupt (IO) at high PIL.
2948 cpu_ce_scrub_mem_err(struct async_flt
*ecc
, boolean_t logout_tried
)
2951 * Cheetah CE classification does not set any bits in flt_status.
2952 * Instead we will record classification datapoints in flt_disp.
2954 ecc
->flt_status
&= ~(ECC_INTERMITTENT
| ECC_PERSISTENT
| ECC_STICKY
);
2957 * To check if the error detected by IO is persistent, sticky or
2958 * intermittent. This is noticed by clear_ecc().
2960 if (ecc
->flt_status
& ECC_IOBUS
)
2961 ecc
->flt_stat
= C_AFSR_MEMORY
;
2964 * Record information from this first part of the algorithm in
2967 ecc
->flt_disp
= cpu_ce_scrub_mem_err_common(ecc
, logout_tried
);
2971 * Select a partner to perform a further CE classification check from.
2972 * Must be called with kernel preemption disabled (to stop the cpu list
2973 * from changing). The detecting cpu we are partnering has cpuid
2974 * aflt->flt_inst; we might not be running on the detecting cpu.
2976 * Restrict choice to active cpus in the same cpu partition as ourselves in
2977 * an effort to stop bad cpus in one partition causing other partitions to
2978 * perform excessive diagnostic activity. Actually since the errorq drain
2979 * is run from a softint most of the time and that is a global mechanism
2980 * this isolation is only partial. Return NULL if we fail to find a
2983 * We prefer a partner that is in a different latency group to ourselves as
2984 * we will share fewer datapaths. If such a partner is unavailable then
2985 * choose one in the same lgroup but prefer a different chip and only allow
2986 * a sibling core if flags includes PTNR_SIBLINGOK. If all else fails and
2987 * flags includes PTNR_SELFOK then permit selection of the original detector.
2989 * We keep a cache of the last partner selected for a cpu, and we'll try to
2990 * use that previous partner if no more than cpu_ce_ptnr_cachetime_sec seconds
2991 * have passed since that selection was made. This provides the benefit
2992 * of the point-of-view of different partners over time but without
2993 * requiring frequent cpu list traversals.
2996 #define PTNR_SIBLINGOK 0x1 /* Allow selection of sibling core */
2997 #define PTNR_SELFOK 0x2 /* Allow selection of cpu to "partner" itself */
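/*
 * Editor's note: a condensed sketch (not part of the original driver) of
 * the selection policy ce_ptnr_select() implements below.  Only the
 * preference order is shown -- remote lgroup first, then same-lgroup
 * different chip, then a sibling core if PTNR_SIBLINGOK, then the detector
 * itself if PTNR_SELFOK; the candidate pointers are assumed to have been
 * gathered by a scan of the detector's cpu partition.
 */
#if 0	/* illustrative sketch only */
static cpu_t *
example_pick_partner(cpu_t *dtcr, cpu_t *remote, cpu_t *locptnr,
    cpu_t *sibptnr, int flags, int *typep)
{
	if (remote != NULL) {			/* different latency group */
		*typep = CE_XDIAG_PTNR_REMOTE;
		return (remote);
	}
	if (locptnr != NULL) {			/* same lgroup, different chip */
		*typep = CE_XDIAG_PTNR_LOCAL;
		return (locptnr);
	}
	if (sibptnr != NULL && (flags & PTNR_SIBLINGOK)) {
		*typep = CE_XDIAG_PTNR_SIBLING;	/* core on the same chip */
		return (sibptnr);
	}
	if (flags & PTNR_SELFOK) {
		*typep = CE_XDIAG_PTNR_SELF;	/* detector checks itself */
		return (dtcr);
	}
	return (NULL);				/* no acceptable partner */
}
#endif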
3000 ce_ptnr_select(struct async_flt
*aflt
, int flags
, int *typep
)
3002 cpu_t
*sp
, *dtcr
, *ptnr
, *locptnr
, *sibptnr
;
3003 hrtime_t lasttime
, thistime
;
3005 ASSERT(curthread
->t_preempt
> 0 || getpil() >= DISP_LEVEL
);
3007 dtcr
= cpu
[aflt
->flt_inst
];
3010 * Short-circuit for the following cases:
3011 * . the dtcr is not flagged active
3012 * . there is just one cpu present
3013 * . the detector has disappeared
3014 * . we were given a bad flt_inst cpuid; this should not happen
3015 * (eg PCI code now fills flt_inst) but if it does it is no
3017 * . there is just one cpu left online in the cpu partition
3019 * If we return NULL after this point then we do not update the
3020 * chpr_ceptnr_seltime which will cause us to perform a full lookup
3021 * again next time; this is the case where the only other cpu online
3022 * in the detector's partition is on the same chip as the detector
3023 * and since CEEN re-enable is throttled even that case should not
3026 if (dtcr
== NULL
|| !cpu_flagged_active(dtcr
->cpu_flags
)) {
3029 if (ncpus
== 1 || dtcr
->cpu_part
->cp_ncpus
== 1) {
3030 if (flags
& PTNR_SELFOK
) {
3031 *typep
= CE_XDIAG_PTNR_SELF
;
3038 thistime
= gethrtime();
3039 lasttime
= CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_seltime
);
3042 * Select a starting point.
3046 * We've never selected a partner for this detector before.
3047 * Start the scan at the next online cpu in the same cpu
3050 sp
= dtcr
->cpu_next_part
;
3051 } else if (thistime
- lasttime
< cpu_ce_ptnr_cachetime_sec
* NANOSEC
) {
3053 * Our last selection has not aged yet. If this partner:
3054 * . is still a valid cpu,
3055 * . is still in the same partition as the detector
3056 * . is still marked active
3057 * . satisfies the 'flags' argument criteria
3058 * then select it again without updating the timestamp.
3060 sp
= cpu
[CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_id
)];
3061 if (sp
== NULL
|| sp
->cpu_part
!= dtcr
->cpu_part
||
3062 !cpu_flagged_active(sp
->cpu_flags
) ||
3063 (sp
== dtcr
&& !(flags
& PTNR_SELFOK
)) ||
3064 (pg_plat_cpus_share(sp
, dtcr
, PGHW_CHIP
) &&
3065 !(flags
& PTNR_SIBLINGOK
))) {
3066 sp
= dtcr
->cpu_next_part
;
3068 if (sp
->cpu_lpl
->lpl_lgrp
!= dtcr
->cpu_lpl
->lpl_lgrp
) {
3069 *typep
= CE_XDIAG_PTNR_REMOTE
;
3070 } else if (sp
== dtcr
) {
3071 *typep
= CE_XDIAG_PTNR_SELF
;
3072 } else if (pg_plat_cpus_share(sp
, dtcr
, PGHW_CHIP
)) {
3073 *typep
= CE_XDIAG_PTNR_SIBLING
;
3075 *typep
= CE_XDIAG_PTNR_LOCAL
;
3081 * Our last selection has aged. If it is nonetheless still a
3082 * valid cpu then start the scan at the next cpu in the
3083 * partition after our last partner. If the last selection
3084 * is no longer a valid cpu then go with our default. In
3085 * this way we slowly cycle through possible partners to
3086 * obtain multiple viewpoints over time.
3088 sp
= cpu
[CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_id
)];
3090 sp
= dtcr
->cpu_next_part
;
3092 sp
= sp
->cpu_next_part
; /* may be dtcr */
3093 if (sp
->cpu_part
!= dtcr
->cpu_part
)
3099 * We have a proposed starting point for our search, but if this
3100 * cpu is offline then its cpu_next_part will point to itself
3101 * so we can't use that to iterate over cpus in this partition in
3102 * the loop below. We still want to avoid iterating over cpus not
3103 * in our partition, so in the case that our starting point is offline
3104 * we will repoint it to be the detector itself; and if the detector
3105 * happens to be offline we'll return NULL from the following loop.
3107 if (!cpu_flagged_active(sp
->cpu_flags
)) {
3115 if (ptnr
== dtcr
|| !cpu_flagged_active(ptnr
->cpu_flags
))
3117 if (ptnr
->cpu_lpl
->lpl_lgrp
!= dtcr
->cpu_lpl
->lpl_lgrp
) {
3118 CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_id
) = ptnr
->cpu_id
;
3119 CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_seltime
) = thistime
;
3120 *typep
= CE_XDIAG_PTNR_REMOTE
;
3123 if (pg_plat_cpus_share(ptnr
, dtcr
, PGHW_CHIP
)) {
3124 if (sibptnr
== NULL
)
3128 if (locptnr
== NULL
)
3130 } while ((ptnr
= ptnr
->cpu_next_part
) != sp
);
3133 * A foreign partner has already been returned if one was available.
3135 * If locptnr is not NULL it is a cpu in the same lgroup as the
3136 * detector, is active, and is not a sibling of the detector.
3138 * If sibptnr is not NULL it is a sibling of the detector, and is
3141 * If we have to resort to using the detector itself we have already
3142 * checked that it is active.
3145 CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_id
) = locptnr
->cpu_id
;
3146 CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_seltime
) = thistime
;
3147 *typep
= CE_XDIAG_PTNR_LOCAL
;
3149 } else if (sibptnr
&& flags
& PTNR_SIBLINGOK
) {
3150 CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_id
) = sibptnr
->cpu_id
;
3151 CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_seltime
) = thistime
;
3152 *typep
= CE_XDIAG_PTNR_SIBLING
;
3154 } else if (flags
& PTNR_SELFOK
) {
3155 CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_id
) = dtcr
->cpu_id
;
3156 CPU_PRIVATE_VAL(dtcr
, chpr_ceptnr_seltime
) = thistime
;
3157 *typep
= CE_XDIAG_PTNR_SELF
;
3165 * Cross call handler that is requested to run on the designated partner of
3166 * a cpu that experienced a possibly sticky or possibly persistent CE.
3169 ce_ptnrchk_xc(struct async_flt
*aflt
, uchar_t
*dispp
)
3171 *dispp
= cpu_ce_scrub_mem_err_common(aflt
, B_FALSE
);
3175 * The associated errorqs are never destroyed so we do not need to deal with
3176 * them disappearing before this timeout fires. If the affected memory
3177 * has been DR'd out since the original event the scrub algorithm will catch
3178 * any errors and return null disposition info. If the original detecting
3179 * cpu has been DR'd out then ereport detector info will not be able to
3180 * lookup CPU type; with a small timeout this is unlikely.
3183 ce_lkychk_cb(ce_lkychk_cb_t
*cbarg
)
3185 struct async_flt
*aflt
= cbarg
->lkycb_aflt
;
3191 if (cp
= ce_ptnr_select(aflt
, PTNR_SIBLINGOK
| PTNR_SELFOK
,
3193 xc_one(cp
->cpu_id
, (xcfunc_t
*)ce_ptnrchk_xc
, (uint64_t)aflt
,
3195 CE_XDIAG_SETLKYINFO(aflt
->flt_disp
, disp
);
3196 CE_XDIAG_SETPTNRID(aflt
->flt_disp
, cp
->cpu_id
);
3197 CE_XDIAG_SETPTNRTYPE(aflt
->flt_disp
, ptnrtype
);
3199 ce_xdiag_lkydrops
++;
3201 CE_XDIAG_SETSKIPCODE(aflt
->flt_disp
,
3202 CE_XDIAG_SKIP_NOPTNR
);
3206 errorq_commit(cbarg
->lkycb_eqp
, cbarg
->lkycb_eqep
, ERRORQ_ASYNC
);
3207 kmem_free(cbarg
, sizeof (ce_lkychk_cb_t
));
3211 * Called from errorq drain code when processing a CE error, both from
3212 * CPU and PCI drain functions. Decide what further classification actions,
3213 * if any, we will perform. Perform immediate actions now, and schedule
3214 * delayed actions as required. Note that we are no longer necessarily running
3215 * on the detecting cpu, and that the async_flt structure will not persist on
3216 * return from this function.
3218 * Calls to this function should aim to be self-throttling in some way. With
3219 * the delayed re-enable of CEEN the absolute rate of calls should not
3220 * be excessive. Callers should also avoid performing in-depth classification
3221 * for events in pages that are already known to be suspect.
3223 * We return nonzero to indicate that the event has been copied and
3224 * recirculated for further testing. The caller should not log the event
3225 * in this case - it will be logged when further test results are available.
3227 * Our possible contexts are that of errorq_drain: below lock level or from
3228 * panic context. We can assume that the cpu we are running on is online.
3233 static int ce_xdiag_forceaction
;
3237 ce_scrub_xdiag_recirc(struct async_flt
*aflt
, errorq_t
*eqp
,
3238 errorq_elem_t
*eqep
, size_t afltoffset
)
3240 ce_dispact_t dispact
, action
;
3242 uchar_t dtcrinfo
, disp
;
3245 if (!ce_disp_inited
|| panicstr
|| ce_xdiag_off
) {
3248 } else if (!aflt
->flt_in_memory
) {
3250 CE_XDIAG_SETSKIPCODE(aflt
->flt_disp
, CE_XDIAG_SKIP_NOTMEM
);
3254 dtcrinfo
= CE_XDIAG_DTCRINFO(aflt
->flt_disp
);
3257 * Some correctable events are not scrubbed/classified, such as those
3258 * noticed at the tail of cpu_deferred_error. So if there is no
3259 * initial detector classification go no further.
3261 if (!CE_XDIAG_EXT_ALG_APPLIED(dtcrinfo
)) {
3263 CE_XDIAG_SETSKIPCODE(aflt
->flt_disp
, CE_XDIAG_SKIP_NOSCRUB
);
3267 dispact
= CE_DISPACT(ce_disp_table
,
3268 CE_XDIAG_AFARMATCHED(dtcrinfo
),
3269 CE_XDIAG_STATE(dtcrinfo
),
3270 CE_XDIAG_CE1SEEN(dtcrinfo
),
3271 CE_XDIAG_CE2SEEN(dtcrinfo
));
3274 action
= CE_ACT(dispact
); /* bad lookup caught below */
3276 if (ce_xdiag_forceaction
!= 0)
3277 action
= ce_xdiag_forceaction
;
3281 case CE_ACT_LKYCHK
: {
3283 errorq_elem_t
*neqep
;
3284 struct async_flt
*ecc
;
3285 ce_lkychk_cb_t
*cbargp
;
3287 if ((ndata
= errorq_elem_dup(eqp
, eqep
, &neqep
)) == NULL
) {
3288 ce_xdiag_lkydrops
++;
3289 CE_XDIAG_SETSKIPCODE(aflt
->flt_disp
,
3290 CE_XDIAG_SKIP_DUPFAIL
);
3293 ecc
= (struct async_flt
*)(ndata
+ afltoffset
);
3295 ASSERT(ecc
->flt_class
== CPU_FAULT
||
3296 ecc
->flt_class
== BUS_FAULT
);
3297 ecc
->flt_class
= (ecc
->flt_class
== CPU_FAULT
) ?
3298 RECIRC_CPU_FAULT
: RECIRC_BUS_FAULT
;
3300 cbargp
= kmem_alloc(sizeof (ce_lkychk_cb_t
), KM_SLEEP
);
3301 cbargp
->lkycb_aflt
= ecc
;
3302 cbargp
->lkycb_eqp
= eqp
;
3303 cbargp
->lkycb_eqep
= neqep
;
3305 (void) timeout((void (*)(void *))ce_lkychk_cb
,
3306 (void *)cbargp
, drv_usectohz(cpu_ce_lkychk_timeout_usec
));
3310 case CE_ACT_PTNRCHK
:
3311 kpreempt_disable(); /* stop cpu list changing */
3312 if ((cp
= ce_ptnr_select(aflt
, 0, &ptnrtype
)) != NULL
) {
3313 xc_one(cp
->cpu_id
, (xcfunc_t
*)ce_ptnrchk_xc
,
3314 (uint64_t)aflt
, (uint64_t)&disp
);
3315 CE_XDIAG_SETPTNRINFO(aflt
->flt_disp
, disp
);
3316 CE_XDIAG_SETPTNRID(aflt
->flt_disp
, cp
->cpu_id
);
3317 CE_XDIAG_SETPTNRTYPE(aflt
->flt_disp
, ptnrtype
);
3318 } else if (ncpus
> 1) {
3319 ce_xdiag_ptnrdrops
++;
3320 CE_XDIAG_SETSKIPCODE(aflt
->flt_disp
,
3321 CE_XDIAG_SKIP_NOPTNR
);
3323 ce_xdiag_ptnrdrops
++;
3324 CE_XDIAG_SETSKIPCODE(aflt
->flt_disp
,
3325 CE_XDIAG_SKIP_UNIPROC
);
3333 case CE_ACT(CE_DISP_BAD
):
3336 cmn_err(CE_PANIC
, "ce_scrub_post: Bad action '%d'", action
);
3339 CE_XDIAG_SETSKIPCODE(aflt
->flt_disp
, CE_XDIAG_SKIP_ACTBAD
);
3347 * We route all errors through a single switch statement.
3350 cpu_ue_log_err(struct async_flt
*aflt
)
3352 switch (aflt
->flt_class
) {
3354 cpu_ereport_init(aflt
);
3355 if (cpu_async_log_err(aflt
, NULL
))
3356 cpu_ereport_post(aflt
);
3360 bus_async_log_err(aflt
);
3364 cmn_err(CE_WARN
, "discarding async error %p with invalid "
3365 "fault class (0x%x)", (void *)aflt
, aflt
->flt_class
);
3371 * Routine for panic hook callback from panic_idle().
3374 cpu_async_panic_callb(void)
3376 ch_async_flt_t ch_flt
;
3377 struct async_flt
*aflt
;
3378 ch_cpu_errors_t cpu_error_regs
;
3381 get_cpu_error_state(&cpu_error_regs
);
3383 afsr_errs
= (cpu_error_regs
.afsr
& C_AFSR_ALL_ERRS
) |
3384 (cpu_error_regs
.afsr_ext
& C_AFSR_EXT_ALL_ERRS
);
3388 bzero(&ch_flt
, sizeof (ch_async_flt_t
));
3389 aflt
= (struct async_flt
*)&ch_flt
;
3390 aflt
->flt_id
= gethrtime_waitfree();
3391 aflt
->flt_bus_id
= getprocessorid();
3392 aflt
->flt_inst
= CPU
->cpu_id
;
3393 aflt
->flt_stat
= cpu_error_regs
.afsr
;
3394 aflt
->flt_addr
= cpu_error_regs
.afar
;
3395 aflt
->flt_prot
= AFLT_PROT_NONE
;
3396 aflt
->flt_class
= CPU_FAULT
;
3397 aflt
->flt_priv
= ((cpu_error_regs
.afsr
& C_AFSR_PRIV
) != 0);
3398 aflt
->flt_panic
= 1;
3399 ch_flt
.afsr_ext
= cpu_error_regs
.afsr_ext
;
3400 ch_flt
.afsr_errs
= afsr_errs
;
3401 #if defined(SERRANO)
3402 ch_flt
.afar2
= cpu_error_regs
.afar2
;
3403 #endif /* SERRANO */
3404 (void) cpu_queue_events(&ch_flt
, NULL
, afsr_errs
, NULL
);
3409 * Routine to convert a syndrome into a syndrome code.
3412 synd_to_synd_code(int synd_status
, ushort_t synd
, uint64_t afsr_bit
)
3414 if (synd_status
== AFLT_STAT_INVALID
)
3418 * Use the syndrome to index the appropriate syndrome table,
3419 * to get the code indicating which bit(s) is(are) bad.
3422 (C_AFSR_MSYND_ERRS
| C_AFSR_ESYND_ERRS
| C_AFSR_EXT_ESYND_ERRS
)) {
3423 if (afsr_bit
& C_AFSR_MSYND_ERRS
) {
3424 #if defined(JALAPENO) || defined(SERRANO)
3425 if ((synd
== 0) || (synd
>= BSYND_TBL_SIZE
))
3428 return (BPAR0
+ synd
);
3429 #else /* JALAPENO || SERRANO */
3430 if ((synd
== 0) || (synd
>= MSYND_TBL_SIZE
))
3433 return (mtag_syndrome_tab
[synd
]);
3434 #endif /* JALAPENO || SERRANO */
3436 if ((synd
== 0) || (synd
>= ESYND_TBL_SIZE
))
3439 return (ecc_syndrome_tab
[synd
]);
3447 cpu_get_mem_sid(char *unum, char *buf, int buflen, int *lenp)
3449 if (&plat_get_mem_sid)
3450 return (plat_get_mem_sid(unum, buf, buflen, lenp));
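/*
 * Editor's note: the "if (&plat_get_mem_sid)" test above (and the similar
 * tests that follow) rely on the platform entry points being declared as
 * weak symbols, so their address is NULL when the platform module does not
 * provide them.  A minimal sketch of the idiom; example_plat_hook is a
 * hypothetical name, not a real platform interface.
 */
#if 0	/* illustrative sketch only */
#pragma weak example_plat_hook
extern int example_plat_hook(int arg);

static int
example_call_optional_hook(int arg)
{
	/* a weak extern that was never defined has a NULL address */
	if (&example_plat_hook)
		return (example_plat_hook(arg));

	return (ENOTSUP);	/* the platform did not supply the hook */
}
#endif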
3456 cpu_get_mem_offset(uint64_t flt_addr
, uint64_t *offp
)
3458 if (&plat_get_mem_offset
)
3459 return (plat_get_mem_offset(flt_addr
, offp
));
3465 cpu_get_mem_addr(char *unum
, char *sid
, uint64_t offset
, uint64_t *addrp
)
3467 if (&plat_get_mem_addr
)
3468 return (plat_get_mem_addr(unum
, sid
, offset
, addrp
));
3474 * Routine to return a string identifying the physical name
3475 * associated with a memory/cache error.
3478 cpu_get_mem_unum(int synd_status
, ushort_t flt_synd
, uint64_t flt_stat
,
3479 uint64_t flt_addr
, int flt_bus_id
, int flt_in_memory
,
3480 ushort_t flt_status
, char *buf
, int buflen
, int *lenp
)
3486 * An AFSR of -1 defaults to a memory syndrome.
3488 if (flt_stat
== (uint64_t)-1)
3489 flt_stat
= C_AFSR_CE
;
3491 synd_code
= synd_to_synd_code(synd_status
, flt_synd
, flt_stat
);
3494 * Syndrome code must be either a single-bit error code
3495 * (0...143) or -1 for unum lookup.
3497 if (synd_code
< 0 || synd_code
>= M2
)
3499 if (&plat_get_mem_unum
) {
3500 if ((ret
= plat_get_mem_unum(synd_code
, flt_addr
, flt_bus_id
,
3501 flt_in_memory
, flt_status
, buf
, buflen
, lenp
)) != 0) {
3513 * Wrapper for cpu_get_mem_unum() routine that takes an
3514 * async_flt struct rather than explicit arguments.
3517 cpu_get_mem_unum_aflt(int synd_status
, struct async_flt
*aflt
,
3518 char *buf
, int buflen
, int *lenp
)
3521 * If we come thru here for an IO bus error aflt->flt_stat will
3522 * not be the CPU AFSR, and we pass in a -1 to cpu_get_mem_unum()
3523 * so it will interpret this as a memory error.
3525 return (cpu_get_mem_unum(synd_status
, aflt
->flt_synd
,
3526 (aflt
->flt_class
== BUS_FAULT
) ?
3527 (uint64_t)-1 : ((ch_async_flt_t
*)aflt
)->flt_bit
,
3528 aflt
->flt_addr
, aflt
->flt_bus_id
, aflt
->flt_in_memory
,
3529 aflt
->flt_status
, buf
, buflen
, lenp
));
3533 * Return unum string given synd_code and async_flt into
3534 * the buf with size UNUM_NAMLEN
3537 cpu_get_mem_unum_synd(int synd_code
, struct async_flt
*aflt
, char *buf
)
3542 * Syndrome code must be either a single-bit error code
3543 * (0...143) or -1 for unum lookup.
3545 if (synd_code
< 0 || synd_code
>= M2
)
3547 if (&plat_get_mem_unum
) {
3548 if ((ret
= plat_get_mem_unum(synd_code
, aflt
->flt_addr
,
3549 aflt
->flt_bus_id
, aflt
->flt_in_memory
,
3550 aflt
->flt_status
, buf
, UNUM_NAMLEN
, &len
)) != 0) {
3561 * This routine is a more generic interface to cpu_get_mem_unum()
3562 * that may be used by other modules (e.g. the 'mm' driver, through
3563 * the 'MEM_NAME' ioctl, which is used by fmd to resolve unum's
3564 * for Jalapeno/Serrano FRC/RCE or FRU/RUE paired events).
3567 cpu_get_mem_name(uint64_t synd
, uint64_t *afsr
, uint64_t afar
,
3568 char *buf
, int buflen
, int *lenp
)
3570 int synd_status
, flt_in_memory
, ret
;
3571 ushort_t flt_status
= 0;
3572 char unum
[UNUM_NAMLEN
];
3573 uint64_t t_afsr_errs
;
3576 * Check for an invalid address.
3578 if (afar
== (uint64_t)-1)
3581 if (synd
== (uint64_t)-1)
3582 synd_status
= AFLT_STAT_INVALID
;
3584 synd_status
= AFLT_STAT_VALID
;
3586 flt_in_memory
= (*afsr
& C_AFSR_MEMORY
) &&
3587 pf_is_memory(afar
>> MMU_PAGESHIFT
);
3590 * Get aggregate AFSR for call to cpu_error_is_ecache_data.
3592 if (*afsr
== (uint64_t)-1)
3593 t_afsr_errs
= C_AFSR_CE
;
3595 t_afsr_errs
= (*afsr
& C_AFSR_ALL_ERRS
);
3596 #if defined(CHEETAH_PLUS)
3597 if (IS_PANTHER(cpunodes
[CPU
->cpu_id
].implementation
))
3598 t_afsr_errs
|= (*(afsr
+ 1) & C_AFSR_EXT_ALL_ERRS
);
3599 #endif /* CHEETAH_PLUS */
3603 * Turn on ECC_ECACHE if error type is E$ Data.
3605 if (cpu_error_is_ecache_data(CPU
->cpu_id
, t_afsr_errs
))
3606 flt_status
|= ECC_ECACHE
;
3608 ret
= cpu_get_mem_unum(synd_status
, (ushort_t
)synd
, t_afsr_errs
, afar
,
3609 CPU
->cpu_id
, flt_in_memory
, flt_status
, unum
, UNUM_NAMLEN
, lenp
);
3613 if (*lenp
>= buflen
)
3614 return (ENAMETOOLONG
);
3616 (void) strncpy(buf
, unum
, buflen
);
3622 * Routine to return memory information associated
3623 * with a physical address and syndrome.
3626 cpu_get_mem_info(uint64_t synd
, uint64_t afar
,
3627 uint64_t *mem_sizep
, uint64_t *seg_sizep
, uint64_t *bank_sizep
,
3628 int *segsp
, int *banksp
, int *mcidp
)
3630 int synd_status
, synd_code
;
3632 if (afar
== (uint64_t)-1)
3635 if (synd
== (uint64_t)-1)
3636 synd_status
= AFLT_STAT_INVALID
;
3638 synd_status
= AFLT_STAT_VALID
;
3640 synd_code
= synd_to_synd_code(synd_status
, synd
, C_AFSR_CE
);
3642 if (p2get_mem_info
!= NULL
)
3643 return ((p2get_mem_info
)(synd_code
, afar
,
3644 mem_sizep
, seg_sizep
, bank_sizep
,
3645 segsp
, banksp
, mcidp
));
3651 * Routine to return a string identifying the physical
3652 * name associated with a cpuid.
3655 cpu_get_cpu_unum(int cpuid
, char *buf
, int buflen
, int *lenp
)
3658 char unum
[UNUM_NAMLEN
];
3660 if (&plat_get_cpu_unum
) {
3661 if ((ret
= plat_get_cpu_unum(cpuid
, unum
, UNUM_NAMLEN
, lenp
))
3668 if (*lenp
>= buflen
)
3669 return (ENAMETOOLONG
);
3671 (void) strncpy(buf
, unum
, buflen
);
3677 * This routine exports the name buffer size.
3680 cpu_get_name_bufsize()
3682 return (UNUM_NAMLEN
);
3686 * Historical function, apparently not used.
3690 cpu_read_paddr(struct async_flt
*ecc
, short verbose
, short ce_err
)
3694 * Historical function only called for SBus errors in debugging.
3698 read_ecc_data(struct async_flt
*aflt
, short verbose
, short ce_err
)
3702 * Clear the AFSR sticky bits. The routine returns a non-zero value if
3703 * any of the AFSR's sticky errors are detected. If a non-null pointer to
3704 * an async fault structure argument is passed in, the captured error state
3705 * (AFSR, AFAR) info will be returned in the structure.
3708 clear_errors(ch_async_flt_t
*ch_flt
)
3710 struct async_flt
*aflt
= (struct async_flt
*)ch_flt
;
3711 ch_cpu_errors_t cpu_error_regs
;
3713 get_cpu_error_state(&cpu_error_regs
);
3715 if (ch_flt != NULL) {
3716 aflt->flt_stat = cpu_error_regs.afsr & C_AFSR_MASK;
3717 aflt->flt_addr = cpu_error_regs.afar;
3718 ch_flt->afsr_ext = cpu_error_regs.afsr_ext;
3719 ch_flt->afsr_errs = (cpu_error_regs.afsr & C_AFSR_ALL_ERRS) |
3720 (cpu_error_regs.afsr_ext & C_AFSR_EXT_ALL_ERRS);
3721 #if defined(SERRANO)
3722 ch_flt->afar2 = cpu_error_regs.afar2;
3723 #endif /* SERRANO */
3726 set_cpu_error_state(&cpu_error_regs);
3728 return (((cpu_error_regs.afsr & C_AFSR_ALL_ERRS) |
3729 (cpu_error_regs.afsr_ext & C_AFSR_EXT_ALL_ERRS)) != 0);
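/*
 * Editor's note: an illustrative sketch (not part of the original driver)
 * of the snapshot-and-clear sequence used by clear_errors() above and
 * clear_ecc() below: the error state is read into a ch_cpu_errors_t and
 * the same values are written back, which clears the captured sticky AFSR
 * bits (they are assumed here to be write-one-to-clear).
 */
#if 0	/* illustrative sketch only */
static int
example_snapshot_and_clear(ch_cpu_errors_t *regs)
{
	get_cpu_error_state(regs);	/* capture AFSR/AFAR and extensions */
	set_cpu_error_state(regs);	/* write back: clears what was read */

	/* report whether any primary or extended error bit was captured */
	return (((regs->afsr & C_AFSR_ALL_ERRS) |
	    (regs->afsr_ext & C_AFSR_EXT_ALL_ERRS)) != 0);
}
#endif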
3733 * Clear any AFSR error bits, and check for persistence.
3735 * It would be desirable to also insist that syndrome match. PCI handling
3736 * has already filled flt_synd. For errors trapped by CPU we only fill
3737 * flt_synd when we queue the event, so we do not have a valid flt_synd
3738 * during initial classification (it is valid if we're called as part of
3739 * subsequent low-pil additional classification attempts). We could try
3740 * to determine which syndrome to use: we know we're only called for
3741 * CE/RCE (Jalapeno & Serrano) and CE/EMC (others) so the syndrome to use
3742 * would be esynd/none and esynd/msynd, respectively. If that is
3743 * implemented then what do we do in the case that we do experience an
3744 * error on the same afar but with different syndrome? At the very least
3745 * we should count such occurrences. Anyway, for now, we'll leave it as
3746 * it has been for ages.
3749 clear_ecc(struct async_flt
*aflt
)
3751 ch_cpu_errors_t cpu_error_regs
;
3754 * Snapshot the AFSR and AFAR and clear any errors
3756 get_cpu_error_state(&cpu_error_regs);
3757 set_cpu_error_state(&cpu_error_regs);
3760 * If any of the same memory access error bits are still on and
3761 * the AFAR matches, return that the error is persistent.
3763 return ((cpu_error_regs.afsr & (C_AFSR_MEMORY & aflt->flt_stat)) != 0 &&
3764 cpu_error_regs.afar == aflt->flt_addr);
3768 * Turn off all cpu error detection, normally only used for panics.
3771 cpu_disable_errors(void)
3773 xt_all(set_error_enable_tl1
, EN_REG_DISABLE
, EER_SET_ABSOLUTE
);
3776 * With error detection now turned off, check the other cpus
3777 * logout areas for any unlogged errors.
3779 if (enable_check_other_cpus_logout
) {
3780 cpu_check_other_cpus_logout();
3782 * Make a second pass over the logout areas, in case
3783 * there is a failing CPU in an error-trap loop which
3784 * will write to the logout area once it is emptied.
3786 cpu_check_other_cpus_logout();
3794 cpu_enable_errors(void)
3796 xt_all(set_error_enable_tl1
, EN_REG_ENABLE
, EER_SET_ABSOLUTE
);
3800 * Flush the entire ecache using displacement flush by reading through a
3801 * physical address range twice as large as the Ecache.
3804 cpu_flush_ecache(void)
3806 flush_ecache(ecache_flushaddr, cpunodes[CPU->cpu_id].ecache_size,
3807 cpunodes[CPU->cpu_id].ecache_linesize);
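/*
 * Editor's note: an illustrative sketch (not part of the original driver)
 * of the displacement flush described above: reading a physical range
 * twice the size of the E$ forces every line of every way to be replaced
 * (and written back if dirty).  lddphys() is used here purely for
 * illustration; the real flush_ecache() primitive is implementation
 * specific.
 */
#if 0	/* illustrative sketch only */
static void
example_displacement_flush(uint64_t flushbase, uint64_t ecache_size,
    uint64_t linesize)
{
	uint64_t pa;

	for (pa = flushbase; pa < flushbase + 2 * ecache_size; pa += linesize)
		(void) lddphys(pa);	/* cacheable read displaces a line */
}
#endif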
3811 * Return CPU E$ set size - E$ size divided by the associativity.
3812 * We use this function in places where the CPU_PRIVATE ptr may not be
3813 * initialized yet. Note that for send_mondo and in the Ecache scrubber,
3814 * we're guaranteed that CPU_PRIVATE is initialized. Also, cpunodes is set
3815 * up before the kernel switches from OBP's to the kernel's trap table, so
3816 * we don't have to worry about cpunodes being uninitialized.
3819 cpu_ecache_set_size(struct cpu
*cp
)
3821 if (CPU_PRIVATE(cp
))
3822 return (CPU_PRIVATE_VAL(cp
, chpr_ec_set_size
));
3824 return (cpunodes
[cp
->cpu_id
].ecache_size
/ cpu_ecache_nway());
3828 * Flush Ecache line.
3829 * Uses ASI_EC_DIAG for Cheetah+ and Jalapeno.
3830 * Uses normal displacement flush for Cheetah.
3833 cpu_flush_ecache_line(ch_async_flt_t
*ch_flt
)
3835 struct async_flt
*aflt
= (struct async_flt
*)ch_flt
;
3836 int ec_set_size
= cpu_ecache_set_size(CPU
);
3838 ecache_flush_line(aflt
->flt_addr
, ec_set_size
);
3842 * Scrub physical address.
3843 * Scrub code is different depending upon whether this a Cheetah+ with 2-way
3844 * Ecache or direct-mapped Ecache.
3847 cpu_scrubphys(struct async_flt
*aflt
)
3849 int ec_set_size
= cpu_ecache_set_size(CPU
);
3851 scrubphys(aflt
->flt_addr
, ec_set_size
);
3855 * Clear physical address.
3856 * Scrub code is different depending upon whether this a Cheetah+ with 2-way
3857 * Ecache or direct-mapped Ecache.
3860 cpu_clearphys(struct async_flt
*aflt
)
3862 int lsize
= cpunodes
[CPU
->cpu_id
].ecache_linesize
;
3863 int ec_set_size
= cpu_ecache_set_size(CPU
);
3866 clearphys(aflt
->flt_addr
, ec_set_size
, lsize
);
3869 #if defined(CPU_IMP_ECACHE_ASSOC)
3871 * Check for a matching valid line in all the sets.
3872 * If found, return set# + 1. Otherwise return 0.
3875 cpu_ecache_line_valid(ch_async_flt_t
*ch_flt
)
3877 struct async_flt
*aflt
= (struct async_flt
*)ch_flt
;
3878 int totalsize
= cpunodes
[CPU
->cpu_id
].ecache_size
;
3879 int ec_set_size
= cpu_ecache_set_size(CPU
);
3880 ch_ec_data_t
*ecp
= &ch_flt
->flt_diag_data
.chd_ec_data
[0];
3881 int nway
= cpu_ecache_nway();
3884 for (i
= 0; i
< nway
; i
++, ecp
++) {
3885 if (!cpu_ectag_line_invalid(totalsize
, ecp
->ec_tag
) &&
3886 (aflt
->flt_addr
& P2ALIGN(C_AFAR_PA
, ec_set_size
)) ==
3887 cpu_ectag_to_pa(ec_set_size
, ecp
->ec_tag
))
3892 #endif /* CPU_IMP_ECACHE_ASSOC */
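/*
 * Editor's note: an illustrative sketch (not part of the original driver)
 * of the tag-match test used by cpu_ecache_line_valid() above and
 * cpu_matching_ecache_line() below: the fault address is stripped of the
 * bits below the set size (they index the set and are not held in the
 * tag) and compared with the physical address reconstructed from the tag.
 * The reconstructed PA passed in stands in for cpu_ectag_to_pa() or
 * PN_L2TAG_TO_PA()/PN_L3TAG_TO_PA().
 */
#if 0	/* illustrative sketch only; assumes ec_set_size is a power of two */
static int
example_ecache_tag_matches(uint64_t faddr, uint64_t tag_pa,
    uint64_t ec_set_size)
{
	uint64_t addr = faddr & ~(ec_set_size - 1);

	return (addr == tag_pa);
}
#endif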
3895 * Check whether a line in the given logout info matches the specified
3896 * fault address. If reqval is set then the line must not be Invalid.
3897 * Returns 0 on failure; on success (way + 1) is returned and *level is
3898 * set to 2 for l2$ or 3 for l3$.
3901 cpu_matching_ecache_line(uint64_t faddr
, void *data
, int reqval
, int *level
)
3903 ch_diag_data_t
*cdp
= data
;
3905 int totalsize
, ec_set_size
;
3909 uint64_t addr
, tagpa
;
3910 int ispanther
= IS_PANTHER(cpunodes
[CPU
->cpu_id
].implementation
);
3913 * Check the l2$ logout data
3916 ecp
= &cdp
->chd_l2_data
[0];
3917 ec_set_size
= PN_L2_SET_SIZE
;
3920 ecp
= &cdp
->chd_ec_data
[0];
3921 ec_set_size
= cpu_ecache_set_size(CPU
);
3922 ways
= cpu_ecache_nway();
3923 totalsize
= cpunodes
[CPU
->cpu_id
].ecache_size
;
3925 /* remove low order PA bits from fault address not used in PA tag */
3926 addr
= faddr
& P2ALIGN(C_AFAR_PA
, ec_set_size
);
3927 for (i
= 0; i
< ways
; i
++, ecp
++) {
3929 tagpa
= PN_L2TAG_TO_PA(ecp
->ec_tag
);
3930 tagvalid
= !PN_L2_LINE_INVALID(ecp
->ec_tag
);
3932 tagpa
= cpu_ectag_to_pa(ec_set_size
, ecp
->ec_tag
);
3933 tagvalid
= !cpu_ectag_line_invalid(totalsize
,
3936 if (tagpa
== addr
&& (!reqval
|| tagvalid
)) {
3943 if (match
|| !ispanther
)
3946 /* For Panther we also check the l3$ */
3947 ecp
= &cdp
->chd_ec_data
[0];
3948 ec_set_size
= PN_L3_SET_SIZE
;
3950 addr
= faddr
& P2ALIGN(C_AFAR_PA
, ec_set_size
);
3952 for (i
= 0; i
< ways
; i
++, ecp
++) {
3953 if (PN_L3TAG_TO_PA(ecp
->ec_tag
) == addr
&& (!reqval
||
3954 !PN_L3_LINE_INVALID(ecp
->ec_tag
))) {
3964 #if defined(CPU_IMP_L1_CACHE_PARITY)
3966 * Record information related to the source of a Dcache Parity Error.
3969 cpu_dcache_parity_info(ch_async_flt_t
*ch_flt
)
3971 int dc_set_size
= dcache_size
/ CH_DCACHE_NWAY
;
3975 * Since instruction decode cannot be done at high PIL
3976 * just examine the entire Dcache to locate the error.
3978 if (ch_flt
->parity_data
.dpe
.cpl_lcnt
== 0) {
3979 ch_flt
->parity_data
.dpe
.cpl_way
= -1;
3980 ch_flt
->parity_data
.dpe
.cpl_off
= -1;
3982 for (index
= 0; index
< dc_set_size
; index
+= dcache_linesize
)
3983 cpu_dcache_parity_check(ch_flt
, index
);
3987 * Check all ways of the Dcache at a specified index for good parity.
3990 cpu_dcache_parity_check(ch_async_flt_t
*ch_flt
, int index
)
3992 int dc_set_size
= dcache_size
/ CH_DCACHE_NWAY
;
3993 uint64_t parity_bits
, pbits
, data_word
;
3994 static int parity_bits_popc
[] = { 0, 1, 1, 0 };
3995 int way
, word
, data_byte
;
3996 ch_dc_data_t
*dcp
= &ch_flt
->parity_data
.dpe
.cpl_dc
[0];
3997 ch_dc_data_t tmp_dcp
;
3999 for (way
= 0; way
< CH_DCACHE_NWAY
; way
++, dcp
++) {
4001 * Perform diagnostic read.
4003 get_dcache_dtag(index
+ way
* dc_set_size
,
4004 (uint64_t *)&tmp_dcp
);
4007 * Check tag for even parity.
4008 * Sum of 1 bits (including parity bit) should be even.
4010 if (popc64(tmp_dcp
.dc_tag
& CHP_DCTAG_PARMASK
) & 1) {
4012 * If this is the first error log detailed information
4013 * about it and check the snoop tag. Otherwise just
4014 * record the fact that we found another error.
4016 if (ch_flt
->parity_data
.dpe
.cpl_lcnt
== 0) {
4017 ch_flt
->parity_data
.dpe
.cpl_way
= way
;
4018 ch_flt
->parity_data
.dpe
.cpl_cache
=
4020 ch_flt
->parity_data
.dpe
.cpl_tag
|= CHP_DC_TAG
;
4022 if (popc64(tmp_dcp
.dc_sntag
&
4023 CHP_DCSNTAG_PARMASK
) & 1) {
4024 ch_flt
->parity_data
.dpe
.cpl_tag
|=
4026 ch_flt
->parity_data
.dpe
.cpl_lcnt
++;
4029 bcopy(&tmp_dcp
, dcp
, sizeof (ch_dc_data_t
));
4032 ch_flt
->parity_data
.dpe
.cpl_lcnt
++;
4035 if (IS_PANTHER(cpunodes
[CPU
->cpu_id
].implementation
)) {
4037 * Panther has more parity bits than the other
4038 * processors for covering dcache data and so each
4039 * byte of data in each word has its own parity bit.
4041 parity_bits
= tmp_dcp
.dc_pn_data_parity
;
4042 for (word
= 0; word
< 4; word
++) {
4043 data_word
= tmp_dcp
.dc_data
[word
];
4044 pbits
= parity_bits
& PN_DC_DATA_PARITY_MASK
;
4045 for (data_byte
= 0; data_byte
< 8;
4047 if (((popc64(data_word
&
4048 PN_DC_DATA_PARITY_MASK
)) & 1) ^
4050 cpu_record_dc_data_parity(
4051 ch_flt
, dcp
, &tmp_dcp
, way
,
4061 * Check data array for even parity.
4062 * The 8 parity bits are grouped into 4 pairs each
4063 * of which covers a 64-bit word. The endianness is
4064 * reversed -- the low-order parity bits cover the
4065 * high-order data words.
4067 parity_bits
= tmp_dcp
.dc_utag
>> 8;
4068 for (word
= 0; word
< 4; word
++) {
4069 pbits
= (parity_bits
>> (6 - word
* 2)) & 3;
4070 if ((popc64(tmp_dcp
.dc_data
[word
]) +
4071 parity_bits_popc
[pbits
]) & 1) {
4072 cpu_record_dc_data_parity(ch_flt
, dcp
,
4073 &tmp_dcp
, way
, word
);
4081 cpu_record_dc_data_parity(ch_async_flt_t
*ch_flt
,
4082 ch_dc_data_t
*dest_dcp
, ch_dc_data_t
*src_dcp
, int way
, int word
)
4085 * If this is the first error log detailed information about it.
4086 * Otherwise just record the fact that we found another error.
4088 if (ch_flt->parity_data.dpe.cpl_lcnt == 0) {
4089 ch_flt->parity_data.dpe.cpl_way = way;
4090 ch_flt->parity_data.dpe.cpl_cache = CPU_DC_PARITY;
4091 ch_flt->parity_data.dpe.cpl_off = word * 8;
4092 bcopy(src_dcp, dest_dcp, sizeof (ch_dc_data_t));
4094 ch_flt->parity_data.dpe.cpl_lcnt++;
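/*
 * Editor's note: an illustrative sketch (not part of the original driver)
 * of the Panther D$ data parity scheme noted in cpu_dcache_parity_check()
 * above: each byte of a 64-bit data word has its own parity bit.  The
 * layout of the dc_pn_data_parity register is simplified here to one
 * parity bit per byte position; only the per-byte even-parity idea is the
 * point.
 */
#if 0	/* illustrative sketch only; parity-bit layout simplified */
static int
example_pn_dc_byte_parity_bad(uint64_t data_word, uint8_t parity_byte)
{
	int data_byte;

	for (data_byte = 0; data_byte < 8; data_byte++) {
		uint64_t byte = (data_word >> (data_byte * 8)) & 0xff;
		uint64_t pbit = (parity_byte >> data_byte) & 1;

		/* each byte plus its own parity bit must have even parity */
		if ((popc64(byte) ^ pbit) & 1)
			return (1);
	}
	return (0);
}
#endif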
/*
 * Record information related to the source of an Icache Parity Error.
 *
 * Called with the Icache disabled so any diagnostic accesses are safe.
 */
static void
cpu_icache_parity_info(ch_async_flt_t *ch_flt)
{
	int	ic_set_size;
	int	ic_linesize;
	int	index;

	if (CPU_PRIVATE(CPU)) {
		ic_set_size = CPU_PRIVATE_VAL(CPU, chpr_icache_size) /
		    CH_ICACHE_NWAY;
		ic_linesize = CPU_PRIVATE_VAL(CPU, chpr_icache_linesize);
	} else {
		ic_set_size = icache_size / CH_ICACHE_NWAY;
		ic_linesize = icache_linesize;
	}

	ch_flt->parity_data.ipe.cpl_way = -1;
	ch_flt->parity_data.ipe.cpl_off = -1;

	for (index = 0; index < ic_set_size; index += ic_linesize)
		cpu_icache_parity_check(ch_flt, index);
}

/*
 * Check all ways of the Icache at a specified index for good parity.
 */
static void
cpu_icache_parity_check(ch_async_flt_t *ch_flt, int index)
{
	uint64_t parmask, pn_inst_parity;
	int ic_set_size;
	int ic_linesize;
	int flt_index, way, instr, num_instr;
	struct async_flt *aflt = (struct async_flt *)ch_flt;
	ch_ic_data_t *icp = &ch_flt->parity_data.ipe.cpl_ic[0];
	ch_ic_data_t tmp_icp;

	if (CPU_PRIVATE(CPU)) {
		ic_set_size = CPU_PRIVATE_VAL(CPU, chpr_icache_size) /
		    CH_ICACHE_NWAY;
		ic_linesize = CPU_PRIVATE_VAL(CPU, chpr_icache_linesize);
	} else {
		ic_set_size = icache_size / CH_ICACHE_NWAY;
		ic_linesize = icache_linesize;
	}

	/*
	 * Panther has twice as many instructions per icache line and the
	 * instruction parity bit is in a different location.
	 */
	if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) {
		num_instr = PN_IC_DATA_REG_SIZE / sizeof (uint64_t);
		pn_inst_parity = PN_ICDATA_PARITY_BIT_MASK;
	} else {
		num_instr = CH_IC_DATA_REG_SIZE / sizeof (uint64_t);
		pn_inst_parity = 0;
	}

	/*
	 * Index at which we expect to find the parity error.
	 */
	flt_index = P2ALIGN(aflt->flt_addr % ic_set_size, ic_linesize);

	for (way = 0; way < CH_ICACHE_NWAY; way++, icp++) {
		/*
		 * Diagnostic reads expect address argument in ASI format.
		 */
		get_icache_dtag(2 * (index + way * ic_set_size),
		    (uint64_t *)&tmp_icp);

		/*
		 * If this is the index in which we expect to find the
		 * error log detailed information about each of the ways.
		 * This information will be displayed later if we can't
		 * determine the exact way in which the error is located.
		 */
		if (flt_index == index)
			bcopy(&tmp_icp, icp, sizeof (ch_ic_data_t));

		/*
		 * Check tag for even parity.
		 * Sum of 1 bits (including parity bit) should be even.
		 */
		if (popc64(tmp_icp.ic_patag & CHP_ICPATAG_PARMASK) & 1) {
			/*
			 * If this way is the one in which we expected
			 * to find the error record the way and check the
			 * snoop tag. Otherwise just record the fact we
			 * found another error.
			 */
			if (flt_index == index) {
				ch_flt->parity_data.ipe.cpl_way = way;
				ch_flt->parity_data.ipe.cpl_tag |= CHP_IC_TAG;

				if (popc64(tmp_icp.ic_sntag &
				    CHP_ICSNTAG_PARMASK) & 1) {
					ch_flt->parity_data.ipe.cpl_tag |=
					    CHP_IC_SNTAG;
					ch_flt->parity_data.ipe.cpl_lcnt++;
				}
			}
			ch_flt->parity_data.ipe.cpl_lcnt++;
		}

		/*
		 * Check instruction data for even parity.
		 * Bits participating in parity differ for PC-relative
		 * versus non-PC-relative instructions.
		 */
		for (instr = 0; instr < num_instr; instr++) {
			parmask = (tmp_icp.ic_data[instr] &
			    CH_ICDATA_PRED_ISPCREL) ?
			    (CHP_ICDATA_PCREL_PARMASK | pn_inst_parity) :
			    (CHP_ICDATA_NPCREL_PARMASK | pn_inst_parity);
			if (popc64(tmp_icp.ic_data[instr] & parmask) & 1) {
				/*
				 * If this way is the one in which we expected
				 * to find the error record the way and offset.
				 * Otherwise just log the fact we found another
				 * error.
				 */
				if (flt_index == index) {
					ch_flt->parity_data.ipe.cpl_way = way;
					ch_flt->parity_data.ipe.cpl_off =
					    instr * 4;
				}
				ch_flt->parity_data.ipe.cpl_lcnt++;
			}
		}
	}
}
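
/*
 * Example of the expected-index calculation used above (assumed cache
 * geometry, for illustration only): with a 32KB I$, CH_ICACHE_NWAY = 4 and
 * 32-byte lines, ic_set_size = 32768 / 4 = 8192, so for aflt->flt_addr =
 * 0x12345, flt_index = P2ALIGN(0x12345 % 8192, 32) = P2ALIGN(0x345, 32) =
 * 0x340.  Only the ways read at that index have their diagnostic data logged
 * in detail; parity errors found elsewhere just bump cpl_lcnt.
 */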
/*
 * Record information related to the source of a Pcache Parity Error.
 */
static void
cpu_pcache_parity_info(ch_async_flt_t *ch_flt)
{
	int pc_set_size = CH_PCACHE_SIZE / CH_PCACHE_NWAY;
	int index;

	/*
	 * Since instruction decode cannot be done at high PIL just
	 * examine the entire Pcache to check for any parity errors.
	 */
	if (ch_flt->parity_data.dpe.cpl_lcnt == 0) {
		ch_flt->parity_data.dpe.cpl_way = -1;
		ch_flt->parity_data.dpe.cpl_off = -1;
	}
	for (index = 0; index < pc_set_size; index += CH_PCACHE_LSIZE)
		cpu_pcache_parity_check(ch_flt, index);
}

/*
 * Check all ways of the Pcache at a specified index for good parity.
 */
static void
cpu_pcache_parity_check(ch_async_flt_t *ch_flt, int index)
{
	int pc_set_size = CH_PCACHE_SIZE / CH_PCACHE_NWAY;
	int pc_data_words = CH_PC_DATA_REG_SIZE / sizeof (uint64_t);
	int way, word, pbit, parity_bits;
	ch_pc_data_t *pcp = &ch_flt->parity_data.dpe.cpl_pc[0];
	ch_pc_data_t tmp_pcp;

	for (way = 0; way < CH_PCACHE_NWAY; way++, pcp++) {
		/*
		 * Perform diagnostic read.
		 */
		get_pcache_dtag(index + way * pc_set_size,
		    (uint64_t *)&tmp_pcp);
		/*
		 * Check data array for odd parity. There are 8 parity
		 * bits (bits 57:50 of ASI_PCACHE_STATUS_DATA) and each
		 * of those bits covers exactly 8 bytes of the data
		 * array:
		 *
		 *	parity bit	P$ data bytes covered
		 *	----------	---------------------
		 *	50		63:56
		 *	51		55:48
		 *	52		47:40
		 *	53		39:32
		 *	54		31:24
		 *	55		23:16
		 *	56		15:8
		 *	57		7:0
		 */
		parity_bits = PN_PC_PARITY_BITS(tmp_pcp.pc_status);
		for (word = 0; word < pc_data_words; word++) {
			pbit = (parity_bits >> (pc_data_words - word - 1)) & 1;
			if ((popc64(tmp_pcp.pc_data[word]) & 1) ^ pbit) {
				/*
				 * If this is the first error log detailed
				 * information about it. Otherwise just record
				 * the fact that we found another error.
				 */
				if (ch_flt->parity_data.dpe.cpl_lcnt == 0) {
					ch_flt->parity_data.dpe.cpl_way = way;
					ch_flt->parity_data.dpe.cpl_cache =
					    CPU_PC_PARITY;
					ch_flt->parity_data.dpe.cpl_off =
					    word * sizeof (uint64_t);
					bcopy(&tmp_pcp, pcp,
					    sizeof (ch_pc_data_t));
				}
				ch_flt->parity_data.dpe.cpl_lcnt++;
			}
		}
	}
}
/*
 * Add L1 Data cache data to the ereport payload.
 */
static void
cpu_payload_add_dcache(struct async_flt *aflt, nvlist_t *nvl)
{
	ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt;
	ch_dc_data_t *dcp;
	ch_dc_data_t dcdata[CH_DCACHE_NWAY];
	uint_t nelem;
	int i, ways_to_check, ways_logged = 0;

	/*
	 * If this is a D$ fault then there may be multiple
	 * ways captured in the ch_parity_log_t structure.
	 * Otherwise, there will be at most one way captured
	 * in the ch_diag_data_t struct.
	 * Check each way to see if it should be encoded.
	 */
	if (ch_flt->flt_type == CPU_DC_PARITY)
		ways_to_check = CH_DCACHE_NWAY;
	else
		ways_to_check = 1;
	for (i = 0; i < ways_to_check; i++) {
		if (ch_flt->flt_type == CPU_DC_PARITY)
			dcp = &ch_flt->parity_data.dpe.cpl_dc[i];
		else
			dcp = &ch_flt->flt_diag_data.chd_dc_data;
		if (dcp->dc_logflag == DC_LOGFLAG_MAGIC) {
			bcopy(dcp, &dcdata[ways_logged],
			    sizeof (ch_dc_data_t));
			ways_logged++;
		}
	}

	/*
	 * Add the dcache data to the payload.
	 */
	fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1D_WAYS,
	    DATA_TYPE_UINT8, (uint8_t)ways_logged, NULL);
	if (ways_logged != 0) {
		nelem = sizeof (ch_dc_data_t) / sizeof (uint64_t) * ways_logged;
		fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1D_DATA,
		    DATA_TYPE_UINT64_ARRAY, nelem, (uint64_t *)dcdata, NULL);
	}
}

/*
 * Add L1 Instruction cache data to the ereport payload.
 */
static void
cpu_payload_add_icache(struct async_flt *aflt, nvlist_t *nvl)
{
	ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt;
	ch_ic_data_t *icp;
	ch_ic_data_t icdata[CH_ICACHE_NWAY];
	uint_t nelem;
	int i, ways_to_check, ways_logged = 0;

	/*
	 * If this is an I$ fault then there may be multiple
	 * ways captured in the ch_parity_log_t structure.
	 * Otherwise, there will be at most one way captured
	 * in the ch_diag_data_t struct.
	 * Check each way to see if it should be encoded.
	 */
	if (ch_flt->flt_type == CPU_IC_PARITY)
		ways_to_check = CH_ICACHE_NWAY;
	else
		ways_to_check = 1;
	for (i = 0; i < ways_to_check; i++) {
		if (ch_flt->flt_type == CPU_IC_PARITY)
			icp = &ch_flt->parity_data.ipe.cpl_ic[i];
		else
			icp = &ch_flt->flt_diag_data.chd_ic_data;
		if (icp->ic_logflag == IC_LOGFLAG_MAGIC) {
			bcopy(icp, &icdata[ways_logged],
			    sizeof (ch_ic_data_t));
			ways_logged++;
		}
	}

	/*
	 * Add the icache data to the payload.
	 */
	fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1I_WAYS,
	    DATA_TYPE_UINT8, (uint8_t)ways_logged, NULL);
	if (ways_logged != 0) {
		nelem = sizeof (ch_ic_data_t) / sizeof (uint64_t) * ways_logged;
		fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L1I_DATA,
		    DATA_TYPE_UINT64_ARRAY, nelem, (uint64_t *)icdata, NULL);
	}
}

#endif	/* CPU_IMP_L1_CACHE_PARITY */
/*
 * Add ecache data to payload.
 */
static void
cpu_payload_add_ecache(struct async_flt *aflt, nvlist_t *nvl)
{
	ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt;
	ch_ec_data_t *ecp;
	ch_ec_data_t ecdata[CHD_EC_DATA_SETS];
	uint_t nelem;
	int i, ways_logged = 0;

	/*
	 * Check each way to see if it should be encoded
	 * and concatenate it into a temporary buffer.
	 */
	for (i = 0; i < CHD_EC_DATA_SETS; i++) {
		ecp = &ch_flt->flt_diag_data.chd_ec_data[i];
		if (ecp->ec_logflag == EC_LOGFLAG_MAGIC) {
			bcopy(ecp, &ecdata[ways_logged],
			    sizeof (ch_ec_data_t));
			ways_logged++;
		}
	}

	/*
	 * Panther CPUs have an additional level of cache and so
	 * what we just collected was the L3 (ecache) and not the
	 * L2 cache.
	 */
	if (IS_PANTHER(cpunodes[aflt->flt_inst].implementation)) {
		/*
		 * Add the L3 (ecache) data to the payload.
		 */
		fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L3_WAYS,
		    DATA_TYPE_UINT8, (uint8_t)ways_logged, NULL);
		if (ways_logged != 0) {
			nelem = sizeof (ch_ec_data_t) /
			    sizeof (uint64_t) * ways_logged;
			fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L3_DATA,
			    DATA_TYPE_UINT64_ARRAY, nelem,
			    (uint64_t *)ecdata, NULL);
		}

		/*
		 * Now collect the L2 cache.
		 */
		ways_logged = 0;
		for (i = 0; i < PN_L2_NWAYS; i++) {
			ecp = &ch_flt->flt_diag_data.chd_l2_data[i];
			if (ecp->ec_logflag == EC_LOGFLAG_MAGIC) {
				bcopy(ecp, &ecdata[ways_logged],
				    sizeof (ch_ec_data_t));
				ways_logged++;
			}
		}
	}

	/*
	 * Add the L2 cache data to the payload.
	 */
	fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L2_WAYS,
	    DATA_TYPE_UINT8, (uint8_t)ways_logged, NULL);
	if (ways_logged != 0) {
		nelem = sizeof (ch_ec_data_t) /
		    sizeof (uint64_t) * ways_logged;
		fm_payload_set(nvl, FM_EREPORT_PAYLOAD_NAME_L2_DATA,
		    DATA_TYPE_UINT64_ARRAY, nelem, (uint64_t *)ecdata, NULL);
	}
}
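
/*
 * Sizing note for the payload arrays built above: nelem is the number of
 * uint64_t words exported, i.e. (sizeof (ch_ec_data_t) / sizeof (uint64_t))
 * per logged way.  For example, if sizeof (ch_ec_data_t) were 80 bytes
 * (an illustrative value only) and two ways were logged, nelem would be
 * (80 / 8) * 2 = 20 64-bit words handed to fm_payload_set().
 */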
/*
 * Initialize cpu scheme for specified cpu.
 */
static void
cpu_fmri_cpu_set(nvlist_t *cpu_fmri, int cpuid)
{
	char sbuf[21]; /* sizeof (UINT64_MAX) + '\0' */
	uint8_t mask;

	mask = cpunodes[cpuid].version;
	(void) snprintf(sbuf, sizeof (sbuf), "%llX",
	    (u_longlong_t)cpunodes[cpuid].device_id);
	(void) fm_fmri_cpu_set(cpu_fmri, FM_CPU_SCHEME_VERSION, NULL,
	    cpuid, &mask, (const char *)sbuf);
}
/*
 * Returns ereport resource type.
 */
static int
cpu_error_to_resource_type(struct async_flt *aflt)
{
	ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt;

	switch (ch_flt->flt_type) {

	case CPU_CE_ECACHE:
	case CPU_UE_ECACHE:
	case CPU_UE_ECACHE_RETIRE:
	case CPU_ORPH:
		/*
		 * If AFSR error bit indicates L2$ Data for Cheetah,
		 * Cheetah+ or Jaguar, or L3$ Data for Panther, return
		 * E$ Data type, otherwise, return CPU type.
		 */
		if (cpu_error_is_ecache_data(aflt->flt_inst,
		    ch_flt->flt_bit))
			return (ERRTYPE_ECACHE_DATA);
		return (ERRTYPE_CPU);

	case CPU_CE:
	case CPU_UE:
	case CPU_EMC:
	case CPU_DUE:
	case CPU_RCE:
	case CPU_RUE:
	case CPU_FRC:
	case CPU_FRU:
		return (ERRTYPE_MEMORY);

	case CPU_IC_PARITY:
	case CPU_DC_PARITY:
	case CPU_FPUERR:
	case CPU_PC_PARITY:
	case CPU_ITLB_PARITY:
	case CPU_DTLB_PARITY:
		return (ERRTYPE_CPU);
	}
	return (ERRTYPE_UNKNOWN);
}
/*
 * Encode the data saved in the ch_async_flt_t struct into
 * the FM ereport payload.
 */
static void
cpu_payload_add_aflt(struct async_flt *aflt, nvlist_t *payload,
	nvlist_t *resource, int *afar_status, int *synd_status)
{
	ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt;
	*synd_status = AFLT_STAT_INVALID;
	*afar_status = AFLT_STAT_INVALID;

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_AFSR) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_AFSR,
		    DATA_TYPE_UINT64, aflt->flt_stat, NULL);
	}

	if ((aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_AFSR_EXT) &&
	    IS_PANTHER(cpunodes[aflt->flt_inst].implementation)) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_AFSR_EXT,
		    DATA_TYPE_UINT64, ch_flt->afsr_ext, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_AFAR_STATUS) {
		*afar_status = afsr_to_afar_status(ch_flt->afsr_errs,
		    ch_flt->flt_bit);
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_AFAR_STATUS,
		    DATA_TYPE_UINT8, (uint8_t)*afar_status, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_AFAR) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_AFAR,
		    DATA_TYPE_UINT64, aflt->flt_addr, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_PC) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_PC,
		    DATA_TYPE_UINT64, (uint64_t)aflt->flt_pc, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_TL) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_TL,
		    DATA_TYPE_UINT8, (uint8_t)aflt->flt_tl, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_TT) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_TT,
		    DATA_TYPE_UINT8, flt_to_trap_type(aflt), NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_PRIV,
		    DATA_TYPE_BOOLEAN_VALUE,
		    (aflt->flt_priv ? B_TRUE : B_FALSE), NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_ME) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_ME,
		    DATA_TYPE_BOOLEAN_VALUE,
		    (aflt->flt_stat & C_AFSR_ME) ? B_TRUE : B_FALSE, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_SYND_STATUS) {
		*synd_status = afsr_to_synd_status(aflt->flt_inst,
		    ch_flt->afsr_errs, ch_flt->flt_bit);
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_SYND_STATUS,
		    DATA_TYPE_UINT8, (uint8_t)*synd_status, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_SYND) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_SYND,
		    DATA_TYPE_UINT16, (uint16_t)aflt->flt_synd, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_ERR_TYPE) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_ERR_TYPE,
		    DATA_TYPE_STRING, flt_to_error_type(aflt), NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_ERR_DISP) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_ERR_DISP,
		    DATA_TYPE_UINT64, aflt->flt_disp, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_L2)
		cpu_payload_add_ecache(aflt, payload);

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_COPYFUNCTION) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_COPYFUNCTION,
		    DATA_TYPE_UINT8, (uint8_t)aflt->flt_status & 0xff, NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_HOWDETECTED) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_HOWDETECTED,
		    DATA_TYPE_UINT8, (uint8_t)(aflt->flt_status >> 8), NULL);
	}

	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_INSTRBLOCK) {
		fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_INSTRBLOCK,
		    DATA_TYPE_UINT32_ARRAY, 16,
		    (uint32_t *)&ch_flt->flt_fpdata, NULL);
	}

#if defined(CPU_IMP_L1_CACHE_PARITY)
	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_L1D)
		cpu_payload_add_dcache(aflt, payload);
	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_L1I)
		cpu_payload_add_icache(aflt, payload);
#endif	/* CPU_IMP_L1_CACHE_PARITY */

#if defined(CHEETAH_PLUS)
	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_L1P)
		cpu_payload_add_pcache(aflt, payload);
	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAGS_TLB)
		cpu_payload_add_tlb(aflt, payload);
#endif	/* CHEETAH_PLUS */

	/*
	 * Create the FMRI that goes into the payload
	 * and contains the unum info if necessary.
	 */
	if (aflt->flt_payload & FM_EREPORT_PAYLOAD_FLAG_RESOURCE) {
		char unum[UNUM_NAMLEN] = "";
		char sid[DIMM_SERIAL_ID_LEN] = "";
		int len, ret, rtype, synd_code;
		uint64_t offset = (uint64_t)-1;

		rtype = cpu_error_to_resource_type(aflt);
		switch (rtype) {

		case ERRTYPE_MEMORY:
		case ERRTYPE_ECACHE_DATA:

			/*
			 * Memory errors, do unum lookup
			 */
			if (*afar_status == AFLT_STAT_INVALID)
				break;

			if (rtype == ERRTYPE_ECACHE_DATA)
				aflt->flt_status |= ECC_ECACHE;
			else
				aflt->flt_status &= ~ECC_ECACHE;

			synd_code = synd_to_synd_code(*synd_status,
			    aflt->flt_synd, ch_flt->flt_bit);

			if (cpu_get_mem_unum_synd(synd_code, aflt, unum) != 0)
				break;

			ret = cpu_get_mem_sid(unum, sid, DIMM_SERIAL_ID_LEN,
			    &len);

			if (ret == 0) {
				(void) cpu_get_mem_offset(aflt->flt_addr,
				    &offset);
			}

			fm_fmri_mem_set(resource, FM_MEM_SCHEME_VERSION,
			    NULL, unum, (ret == 0) ? sid : NULL, offset);
			fm_payload_set(payload,
			    FM_EREPORT_PAYLOAD_NAME_RESOURCE,
			    DATA_TYPE_NVLIST, resource, NULL);
			break;

		case ERRTYPE_CPU:
			/*
			 * On-board processor array error, add cpu resource.
			 */
			cpu_fmri_cpu_set(resource, aflt->flt_inst);
			fm_payload_set(payload,
			    FM_EREPORT_PAYLOAD_NAME_RESOURCE,
			    DATA_TYPE_NVLIST, resource, NULL);
			break;
		}
	}
}
/*
 * Initialize the way info if necessary.
 */
void
cpu_ereport_init(struct async_flt *aflt)
{
	ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt;
	ch_ec_data_t *ecp = &ch_flt->flt_diag_data.chd_ec_data[0];
	ch_ec_data_t *l2p = &ch_flt->flt_diag_data.chd_l2_data[0];
	int i;

	/*
	 * Initialize the info in the CPU logout structure.
	 * The I$/D$ way information is not initialized here
	 * since it is captured in the logout assembly code.
	 */
	for (i = 0; i < CHD_EC_DATA_SETS; i++)
		(ecp + i)->ec_way = i;

	for (i = 0; i < PN_L2_NWAYS; i++)
		(l2p + i)->ec_way = i;
}

/*
 * Returns whether fault address is valid for this error bit and
 * whether the address is "in memory" (i.e. pf_is_memory returns 1).
 */
int
cpu_flt_in_memory(ch_async_flt_t *ch_flt, uint64_t t_afsr_bit)
{
	struct async_flt *aflt = (struct async_flt *)ch_flt;

	return ((t_afsr_bit & C_AFSR_MEMORY) &&
	    afsr_to_afar_status(ch_flt->afsr_errs, t_afsr_bit) ==
	    AFLT_STAT_VALID &&
	    pf_is_memory(aflt->flt_addr >> MMU_PAGESHIFT));
}

/*
 * Returns whether fault address is valid based on the error bit for the
 * one event being queued and whether the address is "in memory".
 */
static int
cpu_flt_in_memory_one_event(ch_async_flt_t *ch_flt, uint64_t t_afsr_bit)
{
	struct async_flt *aflt = (struct async_flt *)ch_flt;
	int afar_status;
	uint64_t afsr_errs, afsr_ow, *ow_bits;

	if (!(t_afsr_bit & C_AFSR_MEMORY) ||
	    !pf_is_memory(aflt->flt_addr >> MMU_PAGESHIFT))
		return (0);

	afsr_errs = ch_flt->afsr_errs;
	afar_status = afsr_to_afar_status(afsr_errs, t_afsr_bit);

	switch (afar_status) {
	case AFLT_STAT_VALID:
		return (1);

	case AFLT_STAT_AMBIGUOUS:
		/*
		 * Status is ambiguous since another error bit (or bits)
		 * of equal priority to the specified bit on in the afsr,
		 * so check those bits. Return 1 only if the bits on in the
		 * same class as the t_afsr_bit are also C_AFSR_MEMORY bits.
		 * Otherwise not all the equal priority bits are for memory
		 * errors, so return 0.
		 */
		ow_bits = afar_overwrite;
		while ((afsr_ow = *ow_bits++) != 0) {
			/*
			 * Get other bits that are on in t_afsr_bit's priority
			 * class to check for Memory Error bits only.
			 */
			if (afsr_ow & t_afsr_bit) {
				if ((afsr_errs & afsr_ow) & ~C_AFSR_MEMORY)
					return (0);
				else
					return (1);
			}
		}
		/*FALLTHRU*/

	default:
		return (0);
	}
}
static void
cpu_log_diag_info(ch_async_flt_t *ch_flt)
{
	struct async_flt *aflt = (struct async_flt *)ch_flt;
	ch_dc_data_t *dcp = &ch_flt->flt_diag_data.chd_dc_data;
	ch_ic_data_t *icp = &ch_flt->flt_diag_data.chd_ic_data;
	ch_ec_data_t *ecp = &ch_flt->flt_diag_data.chd_ec_data[0];
#if defined(CPU_IMP_ECACHE_ASSOC)
	int i, nway;
#endif /* CPU_IMP_ECACHE_ASSOC */

	/*
	 * Check if the CPU log out captured was valid.
	 */
	if (ch_flt->flt_diag_data.chd_afar == LOGOUT_INVALID ||
	    ch_flt->flt_data_incomplete)
		return;

#if defined(CPU_IMP_ECACHE_ASSOC)
	nway = cpu_ecache_nway();
	i = cpu_ecache_line_valid(ch_flt);
	if (i == 0 || i > nway) {
		for (i = 0; i < nway; i++)
			ecp[i].ec_logflag = EC_LOGFLAG_MAGIC;
	} else
		ecp[i - 1].ec_logflag = EC_LOGFLAG_MAGIC;
#else /* CPU_IMP_ECACHE_ASSOC */
	ecp->ec_logflag = EC_LOGFLAG_MAGIC;
#endif /* CPU_IMP_ECACHE_ASSOC */

#if defined(CHEETAH_PLUS)
	pn_cpu_log_diag_l2_info(ch_flt);
#endif /* CHEETAH_PLUS */

	if (CH_DCTAG_MATCH(dcp->dc_tag, aflt->flt_addr)) {
		dcp->dc_way = CH_DCIDX_TO_WAY(dcp->dc_idx);
		dcp->dc_logflag = DC_LOGFLAG_MAGIC;
	}

	if (CH_ICTAG_MATCH(icp, aflt->flt_addr)) {
		if (IS_PANTHER(cpunodes[aflt->flt_inst].implementation))
			icp->ic_way = PN_ICIDX_TO_WAY(icp->ic_idx);
		else
			icp->ic_way = CH_ICIDX_TO_WAY(icp->ic_idx);
		icp->ic_logflag = IC_LOGFLAG_MAGIC;
	}
}
/*
 * Cheetah ECC calculation.
 *
 * We only need to do the calculation on the data bits and can ignore check
 * bit and Mtag bit terms in the calculation.
 */
static uint64_t ch_ecc_table[9][2] = {
	/*
	 * low order 64-bits   high-order 64-bits
	 */
	{ 0x46bffffeccd1177f, 0x488800022100014c },
	{ 0x42fccc81331ff77f, 0x14424f1010249184 },
	{ 0x8898827c222f1ffe, 0x22c1222808184aaf },
	{ 0xf7632203e131ccf1, 0xe1241121848292b8 },
	{ 0x7f5511421b113809, 0x901c88d84288aafe },
	{ 0x1d49412184882487, 0x8f338c87c044c6ef },
	{ 0xf552181014448344, 0x7ff8f4443e411911 },
	{ 0x2189240808f24228, 0xfeeff8cc81333f42 },
	{ 0x3280008440001112, 0xfee88b337ffffd62 },
};

/*
 * 64-bit population count, use well-known popcnt trick.
 * We could use the UltraSPARC V9 POPC instruction, but some
 * CPUs including Cheetahplus and Jaguar do not support that
 * instruction.
 */
int
popc64(uint64_t val)
{
	int cnt;

	for (cnt = 0; val != 0; val &= val - 1)
		cnt++;
	return (cnt);
}

/*
 * Generate the 9 ECC bits for the 128-bit chunk based on the table above.
 * Note that xor'ing an odd number of 1 bits == 1 and xor'ing an even number
 * of 1 bits == 0, so we can just use the least significant bit of the popcnt
 * instead of doing all the xor's.
 */
uint32_t
us3_gen_ecc(uint64_t data_low, uint64_t data_high)
{
	int bitno, s;
	int synd = 0;

	for (bitno = 0; bitno < 9; bitno++) {
		s = (popc64(data_low & ch_ecc_table[bitno][0]) +
		    popc64(data_high & ch_ecc_table[bitno][1])) & 1;
		synd |= (s << bitno);
	}
	return (synd);
}
/*
 * Queue one event based on ecc_type_to_info entry.  If the event has an AFT1
 * tag associated with it or is a fatal event (aflt_panic set), it is sent to
 * the UE event queue.  Otherwise it is dispatched to the CE event queue.
 */
static void
cpu_queue_one_event(ch_async_flt_t *ch_flt, char *reason,
    ecc_type_to_info_t *eccp, ch_diag_data_t *cdp)
{
	struct async_flt *aflt = (struct async_flt *)ch_flt;

	if (reason &&
	    strlen(reason) + strlen(eccp->ec_reason) < MAX_REASON_STRING) {
		(void) strcat(reason, eccp->ec_reason);
	}

	ch_flt->flt_bit = eccp->ec_afsr_bit;
	ch_flt->flt_type = eccp->ec_flt_type;
	if (cdp != NULL && cdp->chd_afar != LOGOUT_INVALID)
		ch_flt->flt_diag_data = *cdp;
	else
		ch_flt->flt_diag_data.chd_afar = LOGOUT_INVALID;
	aflt->flt_in_memory =
	    cpu_flt_in_memory_one_event(ch_flt, ch_flt->flt_bit);

	if (ch_flt->flt_bit & C_AFSR_MSYND_ERRS)
		aflt->flt_synd = GET_M_SYND(aflt->flt_stat);
	else if (ch_flt->flt_bit & (C_AFSR_ESYND_ERRS | C_AFSR_EXT_ESYND_ERRS))
		aflt->flt_synd = GET_E_SYND(aflt->flt_stat);
	else
		aflt->flt_synd = 0;

	aflt->flt_payload = eccp->ec_err_payload;

	if (aflt->flt_panic || (eccp->ec_afsr_bit &
	    (C_AFSR_LEVEL1 | C_AFSR_EXT_LEVEL1)))
		cpu_errorq_dispatch(eccp->ec_err_class,
		    (void *)ch_flt, sizeof (ch_async_flt_t), ue_queue,
		    aflt->flt_panic);
	else
		cpu_errorq_dispatch(eccp->ec_err_class,
		    (void *)ch_flt, sizeof (ch_async_flt_t), ce_queue,
		    aflt->flt_panic);
}
/*
 * Queue events on async event queue one event per error bit.  First we
 * queue the events that we "expect" for the given trap, then we queue events
 * that we may not expect.  Return number of events queued.
 */
int
cpu_queue_events(ch_async_flt_t *ch_flt, char *reason, uint64_t t_afsr_errs,
    ch_cpu_logout_t *clop)
{
	struct async_flt *aflt = (struct async_flt *)ch_flt;
	ecc_type_to_info_t *eccp;
	int nevents = 0;
	uint64_t primary_afar = aflt->flt_addr, primary_afsr = aflt->flt_stat;
#if defined(CHEETAH_PLUS)
	uint64_t orig_t_afsr_errs;
#endif
	uint64_t primary_afsr_ext = ch_flt->afsr_ext;
	uint64_t primary_afsr_errs = ch_flt->afsr_errs;
	ch_diag_data_t *cdp = NULL;

	t_afsr_errs &= ((C_AFSR_ALL_ERRS & ~C_AFSR_ME) | C_AFSR_EXT_ALL_ERRS);

#if defined(CHEETAH_PLUS)
	orig_t_afsr_errs = t_afsr_errs;

	/*
	 * For Cheetah+, log the shadow AFSR/AFAR bits first.
	 */
	if (clop != NULL) {
		/*
		 * Set the AFSR and AFAR fields to the shadow registers.  The
		 * flt_addr and flt_stat fields will be reset to the primaries
		 * below, but the sdw_addr and sdw_stat will stay as the
		 * shadow values.
		 */
		cdp = &clop->clo_sdw_data;
		aflt->flt_addr = ch_flt->flt_sdw_afar = cdp->chd_afar;
		aflt->flt_stat = ch_flt->flt_sdw_afsr = cdp->chd_afsr;
		ch_flt->afsr_ext = ch_flt->flt_sdw_afsr_ext = cdp->chd_afsr_ext;
		ch_flt->afsr_errs = (cdp->chd_afsr_ext & C_AFSR_EXT_ALL_ERRS) |
		    (cdp->chd_afsr & C_AFSR_ALL_ERRS);

		/*
		 * If the primary and shadow AFSR differ, tag the shadow as
		 * the first fault.
		 */
		if ((primary_afar != cdp->chd_afar) ||
		    (primary_afsr_errs != ch_flt->afsr_errs)) {
			aflt->flt_stat |= (1ull << C_AFSR_FIRSTFLT_SHIFT);
		}

		/*
		 * Check AFSR bits as well as AFSR_EXT bits in order of
		 * the AFAR overwrite priority. Our stored AFSR_EXT value
		 * is expected to be zero for those CPUs which do not have
		 * an AFSR_EXT register.
		 */
		for (eccp = ecc_type_to_info; eccp->ec_desc != NULL; eccp++) {
			if ((eccp->ec_afsr_bit &
			    (ch_flt->afsr_errs & t_afsr_errs)) &&
			    ((eccp->ec_flags & aflt->flt_status) != 0)) {
				cpu_queue_one_event(ch_flt, reason, eccp, cdp);
				nevents++;
				t_afsr_errs &= ~eccp->ec_afsr_bit;
			}
		}

		/*
		 * If the ME bit is on in the primary AFSR turn all the
		 * error bits on again that may set the ME bit to make
		 * sure we see the ME AFSR error logs.
		 */
		if ((primary_afsr & C_AFSR_ME) != 0)
			t_afsr_errs = (orig_t_afsr_errs & C_AFSR_ALL_ME_ERRS);
	}
#endif	/* CHEETAH_PLUS */

	if (clop != NULL)
		cdp = &clop->clo_data;

	/*
	 * Queue expected errors, error bit and fault type must match
	 * in the ecc_type_to_info table.
	 */
	for (eccp = ecc_type_to_info; t_afsr_errs != 0 && eccp->ec_desc != NULL;
	    eccp++) {
		if ((eccp->ec_afsr_bit & t_afsr_errs) != 0 &&
		    (eccp->ec_flags & aflt->flt_status) != 0) {
#if defined(SERRANO)
			/*
			 * For FRC/FRU errors on Serrano the afar2 captures
			 * the address and the associated data is
			 * in the shadow logout area.
			 */
			if (eccp->ec_afsr_bit & (C_AFSR_FRC | C_AFSR_FRU)) {
				if (clop != NULL)
					cdp = &clop->clo_sdw_data;
				aflt->flt_addr = ch_flt->afar2;
			} else {
				if (clop != NULL)
					cdp = &clop->clo_data;
				aflt->flt_addr = primary_afar;
			}
#else	/* SERRANO */
			aflt->flt_addr = primary_afar;
#endif	/* SERRANO */
			aflt->flt_stat = primary_afsr;
			ch_flt->afsr_ext = primary_afsr_ext;
			ch_flt->afsr_errs = primary_afsr_errs;
			cpu_queue_one_event(ch_flt, reason, eccp, cdp);
			nevents++;
			t_afsr_errs &= ~eccp->ec_afsr_bit;
		}
	}

	/*
	 * Queue unexpected errors, error bit only match.
	 */
	for (eccp = ecc_type_to_info; t_afsr_errs != 0 && eccp->ec_desc != NULL;
	    eccp++) {
		if (eccp->ec_afsr_bit & t_afsr_errs) {
#if defined(SERRANO)
			/*
			 * For FRC/FRU errors on Serrano the afar2 captures
			 * the address and the associated data is
			 * in the shadow logout area.
			 */
			if (eccp->ec_afsr_bit & (C_AFSR_FRC | C_AFSR_FRU)) {
				if (clop != NULL)
					cdp = &clop->clo_sdw_data;
				aflt->flt_addr = ch_flt->afar2;
			} else {
				if (clop != NULL)
					cdp = &clop->clo_data;
				aflt->flt_addr = primary_afar;
			}
#else	/* SERRANO */
			aflt->flt_addr = primary_afar;
#endif	/* SERRANO */
			aflt->flt_stat = primary_afsr;
			ch_flt->afsr_ext = primary_afsr_ext;
			ch_flt->afsr_errs = primary_afsr_errs;
			cpu_queue_one_event(ch_flt, reason, eccp, cdp);
			nevents++;
			t_afsr_errs &= ~eccp->ec_afsr_bit;
		}
	}

	return (nevents);
}
/*
 * Return trap type number.
 */
uint8_t
flt_to_trap_type(struct async_flt *aflt)
{
	if (aflt->flt_status & ECC_I_TRAP)
		return (TRAP_TYPE_ECC_I);
	if (aflt->flt_status & ECC_D_TRAP)
		return (TRAP_TYPE_ECC_D);
	if (aflt->flt_status & ECC_F_TRAP)
		return (TRAP_TYPE_ECC_F);
	if (aflt->flt_status & ECC_C_TRAP)
		return (TRAP_TYPE_ECC_C);
	if (aflt->flt_status & ECC_DP_TRAP)
		return (TRAP_TYPE_ECC_DP);
	if (aflt->flt_status & ECC_IP_TRAP)
		return (TRAP_TYPE_ECC_IP);
	if (aflt->flt_status & ECC_ITLB_TRAP)
		return (TRAP_TYPE_ECC_ITLB);
	if (aflt->flt_status & ECC_DTLB_TRAP)
		return (TRAP_TYPE_ECC_DTLB);
	return (TRAP_TYPE_UNKNOWN);
}
/*
 * Decide an error type based on detector and leaky/partner tests.
 * The following array is used for quick translation - it must
 * stay in sync with ce_dispact_t.
 */
static char *cetypes[] = {
	CE_DISP_DESC_U,
	CE_DISP_DESC_I,
	CE_DISP_DESC_PP,
	CE_DISP_DESC_P,
	CE_DISP_DESC_L,
	CE_DISP_DESC_PS,
	CE_DISP_DESC_S
};

char *
flt_to_error_type(struct async_flt *aflt)
{
	ce_dispact_t dispact, disp;
	uchar_t dtcrinfo, ptnrinfo, lkyinfo;

	/*
	 * The memory payload bundle is shared by some events that do
	 * not perform any classification.  For those flt_disp will be
	 * 0 and we will return "unknown".
	 */
	if (!ce_disp_inited || !aflt->flt_in_memory || aflt->flt_disp == 0)
		return (cetypes[CE_DISP_UNKNOWN]);

	dtcrinfo = CE_XDIAG_DTCRINFO(aflt->flt_disp);

	/*
	 * It is also possible that no scrub/classification was performed
	 * by the detector, for instance where a disrupting error logged
	 * in the AFSR while CEEN was off in cpu_deferred_error.
	 */
	if (!CE_XDIAG_EXT_ALG_APPLIED(dtcrinfo))
		return (cetypes[CE_DISP_UNKNOWN]);

	/*
	 * Lookup type in initial classification/action table
	 */
	dispact = CE_DISPACT(ce_disp_table,
	    CE_XDIAG_AFARMATCHED(dtcrinfo),
	    CE_XDIAG_STATE(dtcrinfo),
	    CE_XDIAG_CE1SEEN(dtcrinfo),
	    CE_XDIAG_CE2SEEN(dtcrinfo));

	/*
	 * A bad lookup is not something to panic production systems for.
	 */
	ASSERT(dispact != CE_DISP_BAD);
	if (dispact == CE_DISP_BAD)
		return (cetypes[CE_DISP_UNKNOWN]);

	disp = CE_DISP(dispact);

	switch (disp) {
	case CE_DISP_UNKNOWN:
	case CE_DISP_INTERMITTENT:
		break;

	case CE_DISP_POSS_PERS:
		/*
		 * "Possible persistent" errors to which we have applied a
		 * valid leaky test can be separated into "persistent" or
		 * "leaky".
		 */
		lkyinfo = CE_XDIAG_LKYINFO(aflt->flt_disp);
		if (CE_XDIAG_TESTVALID(lkyinfo)) {
			if (CE_XDIAG_CE1SEEN(lkyinfo) ||
			    CE_XDIAG_CE2SEEN(lkyinfo))
				disp = CE_DISP_LEAKY;
			else
				disp = CE_DISP_PERS;
		}
		break;

	case CE_DISP_POSS_STICKY:
		/*
		 * Promote "possible sticky" results that have been
		 * confirmed by a partner test to "sticky".  Unconfirmed
		 * "possible sticky" events are left at that status - we do not
		 * guess at any bad reader/writer etc status here.
		 */
		ptnrinfo = CE_XDIAG_PTNRINFO(aflt->flt_disp);
		if (CE_XDIAG_TESTVALID(ptnrinfo) &&
		    CE_XDIAG_CE1SEEN(ptnrinfo) && CE_XDIAG_CE2SEEN(ptnrinfo))
			disp = CE_DISP_STICKY;

		/*
		 * Promote "possible sticky" results on a uniprocessor
		 * to "sticky".
		 */
		if (disp == CE_DISP_POSS_STICKY &&
		    CE_XDIAG_SKIPCODE(disp) == CE_XDIAG_SKIP_UNIPROC)
			disp = CE_DISP_STICKY;
		break;

	default:
		disp = CE_DISP_UNKNOWN;
		break;
	}

	return (cetypes[disp]);
}
/*
 * Given the entire afsr, the specific bit to check and a prioritized list of
 * error bits, determine the validity of the various overwrite priority
 * features of the AFSR/AFAR: AFAR, ESYND and MSYND, each of which have
 * different overwrite priorities.
 *
 * Given a specific afsr error bit and the entire afsr, there are three cases:
 *   INVALID:	The specified bit is lower overwrite priority than some other
 *		error bit which is on in the afsr (or IVU/IVC).
 *   VALID:	The specified bit is higher priority than all other error bits
 *		which are on in the afsr.
 *   AMBIGUOUS: Another error bit (or bits) of equal priority to the specified
 *		bit is on in the afsr.
 */
int
afsr_to_overw_status(uint64_t afsr, uint64_t afsr_bit, uint64_t *ow_bits)
{
	uint64_t afsr_ow;

	while ((afsr_ow = *ow_bits++) != 0) {
		/*
		 * If bit is in the priority class, check to see if another
		 * bit in the same class is on => ambiguous.  Otherwise,
		 * the value is valid.  If the bit is not on at this priority
		 * class, but a higher priority bit is on, then the value is
		 * invalid.
		 */
		if (afsr_ow & afsr_bit) {
			/*
			 * If equal pri bit is on, ambiguous.
			 */
			if (afsr & (afsr_ow & ~afsr_bit))
				return (AFLT_STAT_AMBIGUOUS);
			return (AFLT_STAT_VALID);
		} else if (afsr & afsr_ow)
			break;
	}

	/*
	 * We didn't find a match or a higher priority bit was on.  Not
	 * finding a match handles the case of invalid AFAR for IVC, IVU.
	 */
	return (AFLT_STAT_INVALID);
}

static int
afsr_to_afar_status(uint64_t afsr, uint64_t afsr_bit)
{
#if defined(SERRANO)
	if (afsr_bit & (C_AFSR_FRC | C_AFSR_FRU))
		return (afsr_to_overw_status(afsr, afsr_bit, afar2_overwrite));
	else
#endif	/* SERRANO */
		return (afsr_to_overw_status(afsr, afsr_bit, afar_overwrite));
}

static int
afsr_to_esynd_status(uint64_t afsr, uint64_t afsr_bit)
{
	return (afsr_to_overw_status(afsr, afsr_bit, esynd_overwrite));
}

static int
afsr_to_msynd_status(uint64_t afsr, uint64_t afsr_bit)
{
	return (afsr_to_overw_status(afsr, afsr_bit, msynd_overwrite));
}

int
afsr_to_synd_status(uint_t cpuid, uint64_t afsr, uint64_t afsr_bit)
{
#if defined(CHEETAH_PLUS)
	/*
	 * The M_SYND overwrite policy is combined with the E_SYND overwrite
	 * policy for Cheetah+ and separate for Panther CPUs.
	 */
	if (afsr_bit & C_AFSR_MSYND_ERRS) {
		if (IS_PANTHER(cpunodes[cpuid].implementation))
			return (afsr_to_msynd_status(afsr, afsr_bit));
		else
			return (afsr_to_esynd_status(afsr, afsr_bit));
	} else if (afsr_bit & (C_AFSR_ESYND_ERRS | C_AFSR_EXT_ESYND_ERRS)) {
		if (IS_PANTHER(cpunodes[cpuid].implementation))
			return (afsr_to_pn_esynd_status(afsr, afsr_bit));
		else
			return (afsr_to_esynd_status(afsr, afsr_bit));
	}
#else /* CHEETAH_PLUS */
	if (afsr_bit & C_AFSR_MSYND_ERRS) {
		return (afsr_to_msynd_status(afsr, afsr_bit));
	} else if (afsr_bit & (C_AFSR_ESYND_ERRS | C_AFSR_EXT_ESYND_ERRS)) {
		return (afsr_to_esynd_status(afsr, afsr_bit));
	}
#endif /* CHEETAH_PLUS */
	return (AFLT_STAT_INVALID);
}
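
/*
 * Example of the overwrite-status logic above (the priority classes shown
 * here are illustrative, not the actual afar_overwrite contents): if one
 * overwrite class contains bits {UE, EDU} and the AFSR has both UE and EDU
 * set, a query for UE finds another bit of its own class set and returns
 * AFLT_STAT_AMBIGUOUS.  If only UE were set, UE would be AFLT_STAT_VALID,
 * while a query for a bit in a lower-priority class would stop at this
 * class, see UE already set, and return AFLT_STAT_INVALID.
 */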
/*
 * Slave CPU stick synchronization.
 */
void
sticksync_slave(void)
{
	int		i;
	int		tries = 0;
	int64_t		tskew;
	int64_t		av_tskew;

	/* wait for the master side */
	while (stick_sync_cmd != SLAVE_START)
		;

	/*
	 * Synchronization should only take a few tries at most. But in the
	 * odd case where the cpu isn't cooperating we'll keep trying. A cpu
	 * without it's stick synchronized wouldn't be a good citizen.
	 */
	while (slave_done == 0) {
		/*
		 * Time skew calculation.
		 */
		av_tskew = tskew = 0;

		for (i = 0; i < stick_iter; i++) {
			/* make location hot */
			timestamp[EV_A_START] = 0;
			stick_timestamp(&timestamp[EV_A_START]);

			/* tell the master we're ready */
			stick_sync_cmd = MASTER_START;

			/* and wait */
			while (stick_sync_cmd != SLAVE_CONT)
				;
			/* Event B end */
			stick_timestamp(&timestamp[EV_B_END]);

			/* calculate time skew */
			tskew = ((timestamp[EV_B_END] - timestamp[EV_B_START])
			    - (timestamp[EV_A_END] - timestamp[EV_A_START]))
			    / 2;

			/* keep running count */
			av_tskew += tskew;
		} /* for */

		/*
		 * Adjust stick for time skew if not within the max allowed;
		 * otherwise we're all done.
		 */
		if (stick_iter != 0)
			av_tskew = av_tskew/stick_iter;
		if (ABS(av_tskew) > stick_tsk) {
			/*
			 * If the skew is 1 (the slave's STICK register
			 * is 1 STICK ahead of the master's), stick_adj
			 * could fail to adjust the slave's STICK register
			 * if the STICK read on the slave happens to
			 * align with the increment of the STICK.
			 * Therefore, we increment the skew to 2.
			 */
			if (av_tskew == 1)
				av_tskew++;
			stick_adj(-av_tskew);
		} else
			slave_done = 1;

		if (tries < DSYNC_ATTEMPTS)
			stick_sync_stats[CPU->cpu_id].skew_val[tries] =
			    av_tskew;
		++tries;
	} /* while */

	/* allow the master to finish */
	stick_sync_cmd = EVENT_NULL;
}

/*
 * Master CPU side of stick synchronization.
 *  - timestamp end of Event A
 *  - timestamp beginning of Event B
 */
void
sticksync_master(void)
{
	int		i;

	/* tell the slave we've started */
	slave_done = 0;
	stick_sync_cmd = SLAVE_START;

	while (slave_done == 0) {
		for (i = 0; i < stick_iter; i++) {
			/* wait for the slave */
			while (stick_sync_cmd != MASTER_START)
				;
			/* Event A end */
			stick_timestamp(&timestamp[EV_A_END]);

			/* make location hot */
			timestamp[EV_B_START] = 0;
			stick_timestamp(&timestamp[EV_B_START]);

			/* tell the slave to continue */
			stick_sync_cmd = SLAVE_CONT;
		} /* for */

		/* wait while slave calculates time skew */
		while (stick_sync_cmd == SLAVE_CONT)
			;
	} /* while */
}
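
/*
 * Skew arithmetic used by the slave above, with made-up sample values: if
 * (EV_B_END - EV_B_START) = 110 ticks and (EV_A_END - EV_A_START) = 100
 * ticks for one iteration, tskew = (110 - 100) / 2 = 5, i.e. the slave's
 * STICK is estimated to run about 5 ticks ahead of the master's (per the
 * comment in sticksync_slave); av_tskew averages this over stick_iter
 * iterations and stick_adj(-av_tskew) backs the difference out whenever
 * the average exceeds stick_tsk.
 */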
/*
 * Cheetah/Cheetah+ have disrupting error for copyback's, so we don't need to
 * do Spitfire hack of xcall'ing all the cpus to ask to check for them.  Also,
 * in cpu_async_panic_callb, each cpu checks for CPU events on its way to
 * panic, so all cpus are checked anyway.
 */
/*ARGSUSED*/
void
cpu_check_allcpus(struct async_flt *aflt)
{
}

struct kmem_cache *ch_private_cache;

/*
 * Cpu private unitialization.  Uninitialize the Ecache scrubber and
 * deallocate the scrubber data structures and cpu_private data structure.
 */
void
cpu_uninit_private(struct cpu *cp)
{
	cheetah_private_t *chprp = CPU_PRIVATE(cp);

	ASSERT(chprp);
	cpu_uninit_ecache_scrub_dr(cp);
	CPU_PRIVATE(cp) = NULL;
	ch_err_tl1_paddrs[cp->cpu_id] = 0;
	kmem_cache_free(ch_private_cache, chprp);
	cmp_delete_cpu(cp->cpu_id);
}
/*
 * Cheetah Cache Scrubbing
 *
 * The primary purpose of Cheetah cache scrubbing is to reduce the exposure
 * of E$ tags, D$ data, and I$ data to cosmic ray events since they are not
 * protected by either parity or ECC.
 *
 * We currently default the E$ and D$ scan rate to 100 (scan 10% of the
 * cache per second). Due to the specifics of how the I$ control
 * logic works with respect to the ASI used to scrub I$ lines, the entire
 * I$ is scanned at once.
 */

/*
 * Tuneables to enable and disable the scrubbing of the caches, and to tune
 * scrubbing behavior.  These may be changed via /etc/system or using mdb
 * on a running system.
 */
int dcache_scrub_enable = 1;		/* D$ scrubbing is on by default */

/*
 * The following are the PIL levels that the softints/cross traps will fire at.
 */
uint_t ecache_scrub_pil = PIL_9;	/* E$ scrub PIL for cross traps */
uint_t dcache_scrub_pil = PIL_9;	/* D$ scrub PIL for cross traps */
uint_t icache_scrub_pil = PIL_9;	/* I$ scrub PIL for cross traps */

#if defined(JALAPENO)

/*
 * Due to several errata (82, 85, 86), we don't enable the L2$ scrubber
 * on Jalapeno.
 */
int ecache_scrub_enable = 0;

#else	/* JALAPENO */

/*
 * With all other cpu types, E$ scrubbing is on by default
 */
int ecache_scrub_enable = 1;

#endif	/* JALAPENO */


#if defined(CHEETAH_PLUS) || defined(JALAPENO) || defined(SERRANO)

/*
 * The I$ scrubber tends to cause latency problems for real-time SW, so it
 * is disabled by default on non-Cheetah systems
 */
int icache_scrub_enable = 0;

/*
 * Tuneables specifying the scrub calls per second and the scan rate
 * for each cache.
 *
 * The cyclic times are set during boot based on the following values.
 * Changing these values in mdb after this time will have no effect.  If
 * a different value is desired, it must be set in /etc/system before a
 * reboot.
 */
int ecache_calls_a_sec = 1;
int dcache_calls_a_sec = 2;
int icache_calls_a_sec = 2;

int ecache_scan_rate_idle = 1;
int ecache_scan_rate_busy = 1;
int dcache_scan_rate_idle = 1;
int dcache_scan_rate_busy = 1;
int icache_scan_rate_idle = 1;
int icache_scan_rate_busy = 1;

#else	/* CHEETAH_PLUS || JALAPENO || SERRANO */

int icache_scrub_enable = 1;		/* I$ scrubbing is on by default */

int ecache_calls_a_sec = 100;		/* E$ scrub calls per seconds */
int dcache_calls_a_sec = 100;		/* D$ scrub calls per seconds */
int icache_calls_a_sec = 100;		/* I$ scrub calls per seconds */

int ecache_scan_rate_idle = 100;	/* E$ scan rate (in tenths of a %) */
int ecache_scan_rate_busy = 100;	/* E$ scan rate (in tenths of a %) */
int dcache_scan_rate_idle = 100;	/* D$ scan rate (in tenths of a %) */
int dcache_scan_rate_busy = 100;	/* D$ scan rate (in tenths of a %) */
int icache_scan_rate_idle = 100;	/* I$ scan rate (in tenths of a %) */
int icache_scan_rate_busy = 100;	/* I$ scan rate (in tenths of a %) */

#endif	/* CHEETAH_PLUS || JALAPENO || SERRANO */

/*
 * In order to scrub on offline cpus, a cross trap is sent.  The handler will
 * increment the outstanding request counter and schedule a softint to run
 * the scrubber.
 */
extern xcfunc_t cache_scrubreq_tl1;

/*
 * These are the softint functions for each cache scrubber
 */
static uint_t scrub_ecache_line_intr(caddr_t arg1, caddr_t arg2);
static uint_t scrub_dcache_line_intr(caddr_t arg1, caddr_t arg2);
static uint_t scrub_icache_line_intr(caddr_t arg1, caddr_t arg2);

/*
 * The cache scrub info table contains cache specific information
 * and allows for some of the scrub code to be table driven, reducing
 * duplication of cache similar code.
 *
 * This table keeps a copy of the value in the calls per second variable
 * (?cache_calls_a_sec).  This makes it much more difficult for someone
 * to cause us problems (for example, by setting ecache_calls_a_sec to 0 in
 * mdb in a misguided attempt to disable the scrubber).
 */
struct scrub_info {
	int		*csi_enable;	/* scrubber enable flag */
	int		csi_freq;	/* scrubber calls per second */
	int		csi_index;	/* index to chsm_outstanding[] */
	uint64_t	csi_inum;	/* scrubber interrupt number */
	cyclic_id_t	csi_omni_cyc_id;	/* omni cyclic ID */
	cyclic_id_t	csi_offline_cyc_id;	/* offline cyclic ID */
	char		csi_name[3];	/* cache name for this scrub entry */
} cache_scrub_info[] = {
	{ &ecache_scrub_enable, 0, CACHE_SCRUBBER_INFO_E, 0, 0, 0, "E$"},
	{ &dcache_scrub_enable, 0, CACHE_SCRUBBER_INFO_D, 0, 0, 0, "D$"},
	{ &icache_scrub_enable, 0, CACHE_SCRUBBER_INFO_I, 0, 0, 0, "I$"}
};
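
/*
 * How the tuneables combine (illustrative): the scan rates are in tenths of
 * a percent of the cache scanned per second.  With dcache_scan_rate_idle =
 * 100 and dcache_calls_a_sec = 100 (the non-CHEETAH_PLUS defaults above),
 * 10% of the D$ is scanned each second, spread over 100 softint calls, so
 * each call scrubs roughly (dcache_nlines * 100) / (1000 * 100) =
 * dcache_nlines / 1000 lines; see the scan-rate computation in the
 * scrub_*_line_intr() handlers below.
 */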
/*
 * If scrubbing is enabled, increment the outstanding request counter.  If it
 * is 1 (meaning there were no previous requests outstanding), call
 * setsoftint_tl1 through xt_one_unchecked, which eventually ends up doing
 * a self trap.
 */
static void
do_scrub(struct scrub_info *csi)
{
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc);
	int index = csi->csi_index;
	uint32_t *outstanding = &csmp->chsm_outstanding[index];

	if (*(csi->csi_enable) && (csmp->chsm_enable[index])) {
		if (atomic_inc_32_nv(outstanding) == 1) {
			xt_one_unchecked(CPU->cpu_id, setsoftint_tl1,
			    csi->csi_inum, 0);
		}
	}
}

/*
 * Omni cyclics don't fire on offline cpus, so we use another cyclic to
 * cross-trap the offline cpus.
 */
static void
do_scrub_offline(struct scrub_info *csi)
{
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc);

	if (CPUSET_ISNULL(cpu_offline_set)) {
		/*
		 * No offline cpus - nothing to do
		 */
		return;
	}

	if (*(csi->csi_enable) && (csmp->chsm_enable[csi->csi_index])) {
		xt_some(cpu_offline_set, cache_scrubreq_tl1, csi->csi_inum,
		    csi->csi_index);
	}
}

/*
 * This is the initial setup for the scrubber cyclics - it sets the
 * interrupt level, frequency, and function to call.
 */
/*ARGSUSED*/
static void
cpu_scrub_cyclic_setup(void *arg, cpu_t *cpu, cyc_handler_t *hdlr,
    cyc_time_t *when)
{
	struct scrub_info *csi = (struct scrub_info *)arg;

	ASSERT(csi != NULL);
	hdlr->cyh_func = (cyc_func_t)do_scrub;
	hdlr->cyh_level = CY_LOW_LEVEL;
	hdlr->cyh_arg = arg;

	when->cyt_when = 0;	/* Start immediately */
	when->cyt_interval = NANOSEC / csi->csi_freq;
}
/*
 * Initialization for cache scrubbing.
 * This routine is called AFTER all cpus have had cpu_init_private called
 * to initialize their private data areas.
 */
void
cpu_init_cache_scrub(void)
{
	int i;
	struct scrub_info *csi;
	cyc_omni_handler_t omni_hdlr;
	cyc_handler_t offline_hdlr;
	cyc_time_t when;

	/*
	 * save away the maximum number of lines for the D$
	 */
	dcache_nlines = dcache_size / dcache_linesize;

	/*
	 * register the softints for the cache scrubbing
	 */
	cache_scrub_info[CACHE_SCRUBBER_INFO_E].csi_inum =
	    add_softintr(ecache_scrub_pil, scrub_ecache_line_intr,
	    (caddr_t)&cache_scrub_info[CACHE_SCRUBBER_INFO_E], SOFTINT_MT);
	cache_scrub_info[CACHE_SCRUBBER_INFO_E].csi_freq = ecache_calls_a_sec;

	cache_scrub_info[CACHE_SCRUBBER_INFO_D].csi_inum =
	    add_softintr(dcache_scrub_pil, scrub_dcache_line_intr,
	    (caddr_t)&cache_scrub_info[CACHE_SCRUBBER_INFO_D], SOFTINT_MT);
	cache_scrub_info[CACHE_SCRUBBER_INFO_D].csi_freq = dcache_calls_a_sec;

	cache_scrub_info[CACHE_SCRUBBER_INFO_I].csi_inum =
	    add_softintr(icache_scrub_pil, scrub_icache_line_intr,
	    (caddr_t)&cache_scrub_info[CACHE_SCRUBBER_INFO_I], SOFTINT_MT);
	cache_scrub_info[CACHE_SCRUBBER_INFO_I].csi_freq = icache_calls_a_sec;

	/*
	 * start the scrubbing for all the caches
	 */
	mutex_enter(&cpu_lock);
	for (i = 0; i < CACHE_SCRUBBER_COUNT; i++) {

		csi = &cache_scrub_info[i];

		if (!(*csi->csi_enable))
			continue;

		/*
		 * force the following to be true:
		 *	1 <= calls_a_sec <= hz
		 */
		if (csi->csi_freq > hz) {
			cmn_err(CE_NOTE, "%s scrub calls_a_sec set too high "
			    "(%d); resetting to hz (%d)", csi->csi_name,
			    csi->csi_freq, hz);
			csi->csi_freq = hz;
		} else if (csi->csi_freq < 1) {
			cmn_err(CE_NOTE, "%s scrub calls_a_sec set too low "
			    "(%d); resetting to 1", csi->csi_name,
			    csi->csi_freq);
			csi->csi_freq = 1;
		}

		omni_hdlr.cyo_online = cpu_scrub_cyclic_setup;
		omni_hdlr.cyo_offline = NULL;
		omni_hdlr.cyo_arg = (void *)csi;

		offline_hdlr.cyh_func = (cyc_func_t)do_scrub_offline;
		offline_hdlr.cyh_arg = (void *)csi;
		offline_hdlr.cyh_level = CY_LOW_LEVEL;

		when.cyt_when = 0;	/* Start immediately */
		when.cyt_interval = NANOSEC / csi->csi_freq;

		csi->csi_omni_cyc_id = cyclic_add_omni(&omni_hdlr);
		csi->csi_offline_cyc_id = cyclic_add(&offline_hdlr, &when);
	}
	register_cpu_setup_func(cpu_scrub_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}
/*
 * Indicate that the specified cpu is idle.
 */
void
cpu_idle_ecache_scrub(struct cpu *cp)
{
	if (CPU_PRIVATE(cp) != NULL) {
		ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(cp, chpr_scrub_misc);
		csmp->chsm_ecache_busy = ECACHE_CPU_IDLE;
	}
}

/*
 * Indicate that the specified cpu is busy.
 */
void
cpu_busy_ecache_scrub(struct cpu *cp)
{
	if (CPU_PRIVATE(cp) != NULL) {
		ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(cp, chpr_scrub_misc);
		csmp->chsm_ecache_busy = ECACHE_CPU_BUSY;
	}
}

/*
 * Initialization for cache scrubbing for the specified cpu.
 */
void
cpu_init_ecache_scrub_dr(struct cpu *cp)
{
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(cp, chpr_scrub_misc);
	int cpuid = cp->cpu_id;

	/* initialize the number of lines in the caches */
	csmp->chsm_ecache_nlines = cpunodes[cpuid].ecache_size /
	    cpunodes[cpuid].ecache_linesize;
	csmp->chsm_icache_nlines = CPU_PRIVATE_VAL(cp, chpr_icache_size) /
	    CPU_PRIVATE_VAL(cp, chpr_icache_linesize);

	/*
	 * do_scrub() and do_scrub_offline() check both the global
	 * ?cache_scrub_enable and this per-cpu enable variable.  All scrubbers
	 * check this value before scrubbing.  Currently, we use it to
	 * disable the E$ scrubber on multi-core cpus or while running at
	 * slowed speed.  For now, just turn everything on and allow
	 * cpu_init_private() to change it if necessary.
	 */
	csmp->chsm_enable[CACHE_SCRUBBER_INFO_E] = 1;
	csmp->chsm_enable[CACHE_SCRUBBER_INFO_D] = 1;
	csmp->chsm_enable[CACHE_SCRUBBER_INFO_I] = 1;

	cpu_busy_ecache_scrub(cp);
}

/*
 * Un-initialization for cache scrubbing for the specified cpu.
 */
static void
cpu_uninit_ecache_scrub_dr(struct cpu *cp)
{
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(cp, chpr_scrub_misc);

	/*
	 * un-initialize bookkeeping for cache scrubbing
	 */
	bzero(csmp, sizeof (ch_scrub_misc_t));

	cpu_idle_ecache_scrub(cp);
}
/*
 * Called periodically on each CPU to scrub the D$.
 */
static void
scrub_dcache(int how_many)
{
	int i;
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc);
	int index = csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_D];

	/*
	 * scrub the desired number of lines
	 */
	for (i = 0; i < how_many; i++) {
		/*
		 * scrub a D$ line
		 */
		dcache_inval_line(index);

		/*
		 * calculate the next D$ line to scrub, assumes
		 * that dcache_nlines is a power of 2
		 */
		index = (index + 1) & (dcache_nlines - 1);
	}

	/*
	 * set the scrub index for the next visit
	 */
	csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_D] = index;
}

/*
 * Handler for D$ scrub inum softint. Call scrub_dcache until
 * we decrement the outstanding request count to zero.
 */
/*ARGSUSED*/
static uint_t
scrub_dcache_line_intr(caddr_t arg1, caddr_t arg2)
{
	int i;
	int how_many;
	int outstanding;
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc);
	uint32_t *countp = &csmp->chsm_outstanding[CACHE_SCRUBBER_INFO_D];
	struct scrub_info *csi = (struct scrub_info *)arg1;
	int scan_rate = (csmp->chsm_ecache_busy == ECACHE_CPU_IDLE) ?
	    dcache_scan_rate_idle : dcache_scan_rate_busy;

	/*
	 * The scan rates are expressed in units of tenths of a
	 * percent.  A scan rate of 1000 (100%) means the whole
	 * cache is scanned every second.
	 */
	how_many = (dcache_nlines * scan_rate) / (1000 * csi->csi_freq);

	do {
		outstanding = *countp;
		for (i = 0; i < outstanding; i++) {
			scrub_dcache(how_many);
		}
	} while (atomic_add_32_nv(countp, -outstanding));

	return (DDI_INTR_CLAIMED);
}
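
/*
 * Worked example for how_many above (assumed cache geometry): with
 * dcache_nlines = 2048, scan_rate = 100 (tenths of a percent) and
 * csi->csi_freq = 100 calls per second, how_many = (2048 * 100) /
 * (1000 * 100) = 2 lines per call, or about 200 lines (roughly 10% of the
 * cache) scrubbed per second.
 */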
/*
 * Called periodically on each CPU to scrub the I$. The I$ is scrubbed
 * by invalidating lines. Due to the characteristics of the ASI which
 * is used to invalidate an I$ line, the entire I$ must be invalidated
 * vs. an individual I$ line.
 */
static void
scrub_icache(int how_many)
{
	int i;
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc);
	int index = csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_I];
	int icache_nlines = csmp->chsm_icache_nlines;

	/*
	 * scrub the desired number of lines
	 */
	for (i = 0; i < how_many; i++) {
		/*
		 * since the entire I$ must be scrubbed at once,
		 * wait until the index wraps to zero to invalidate
		 * the entire I$
		 */
		if (index == 0) {
			icache_inval_all();
		}

		/*
		 * calculate the next I$ line to scrub, assumes
		 * that chsm_icache_nlines is a power of 2
		 */
		index = (index + 1) & (icache_nlines - 1);
	}

	/*
	 * set the scrub index for the next visit
	 */
	csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_I] = index;
}

/*
 * Handler for I$ scrub inum softint. Call scrub_icache until
 * we decrement the outstanding request count to zero.
 */
/*ARGSUSED*/
static uint_t
scrub_icache_line_intr(caddr_t arg1, caddr_t arg2)
{
	int i;
	int how_many;
	int outstanding;
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc);
	uint32_t *countp = &csmp->chsm_outstanding[CACHE_SCRUBBER_INFO_I];
	struct scrub_info *csi = (struct scrub_info *)arg1;
	int scan_rate = (csmp->chsm_ecache_busy == ECACHE_CPU_IDLE) ?
	    icache_scan_rate_idle : icache_scan_rate_busy;
	int icache_nlines = csmp->chsm_icache_nlines;

	/*
	 * The scan rates are expressed in units of tenths of a
	 * percent.  A scan rate of 1000 (100%) means the whole
	 * cache is scanned every second.
	 */
	how_many = (icache_nlines * scan_rate) / (1000 * csi->csi_freq);

	do {
		outstanding = *countp;
		for (i = 0; i < outstanding; i++) {
			scrub_icache(how_many);
		}
	} while (atomic_add_32_nv(countp, -outstanding));

	return (DDI_INTR_CLAIMED);
}
/*
 * Called periodically on each CPU to scrub the E$.
 */
static void
scrub_ecache(int how_many)
{
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc);
	int i;
	int cpuid = CPU->cpu_id;
	int index = csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_E];
	int nlines = csmp->chsm_ecache_nlines;
	int linesize = cpunodes[cpuid].ecache_linesize;
	int ec_set_size = cpu_ecache_set_size(CPU);

	/*
	 * scrub the desired number of lines
	 */
	for (i = 0; i < how_many; i++) {
		/*
		 * scrub the E$ line
		 */
		ecache_flush_line(ecache_flushaddr + (index * linesize),
		    ec_set_size);

		/*
		 * calculate the next E$ line to scrub based on twice
		 * the number of E$ lines (to displace lines containing
		 * flush area data), assumes that the number of lines
		 * is a power of 2
		 */
		index = (index + 1) & ((nlines << 1) - 1);
	}

	/*
	 * set the ecache scrub index for the next visit
	 */
	csmp->chsm_flush_index[CACHE_SCRUBBER_INFO_E] = index;
}

/*
 * Handler for E$ scrub inum softint.  Call the E$ scrubber until
 * we decrement the outstanding request count to zero.
 *
 * Due to interactions with cpu_scrub_cpu_setup(), the outstanding count may
 * become negative after the atomic_add_32_nv().  This is not a problem, as
 * the next trip around the loop won't scrub anything, and the next add will
 * reset the count back to zero.
 */
/*ARGSUSED*/
static uint_t
scrub_ecache_line_intr(caddr_t arg1, caddr_t arg2)
{
	int i;
	int how_many;
	int outstanding;
	ch_scrub_misc_t *csmp = CPU_PRIVATE_PTR(CPU, chpr_scrub_misc);
	uint32_t *countp = &csmp->chsm_outstanding[CACHE_SCRUBBER_INFO_E];
	struct scrub_info *csi = (struct scrub_info *)arg1;
	int scan_rate = (csmp->chsm_ecache_busy == ECACHE_CPU_IDLE) ?
	    ecache_scan_rate_idle : ecache_scan_rate_busy;
	int ecache_nlines = csmp->chsm_ecache_nlines;

	/*
	 * The scan rates are expressed in units of tenths of a
	 * percent.  A scan rate of 1000 (100%) means the whole
	 * cache is scanned every second.
	 */
	how_many = (ecache_nlines * scan_rate) / (1000 * csi->csi_freq);

	do {
		outstanding = *countp;
		for (i = 0; i < outstanding; i++) {
			scrub_ecache(how_many);
		}
	} while (atomic_add_32_nv(countp, -outstanding));

	return (DDI_INTR_CLAIMED);
}
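
/*
 * Note on the E$ index wrap in scrub_ecache() above: index counts from 0 to
 * (2 * nlines - 1) before wrapping, so the flush address sweeps a range
 * twice the size of the E$ (e.g. a 16MB sweep for an assumed 8MB E$ with
 * 512-byte lines, nlines = 16384).  Per the comment in scrub_ecache(), the
 * doubling is what displaces lines that currently hold flush-area data.
 */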
/*
 * Timeout function to reenable CE
 */
static void
cpu_delayed_check_ce_errors(void *arg)
{
	if (!taskq_dispatch(ch_check_ce_tq, cpu_check_ce_errors, arg,
	    TQ_NOSLEEP)) {
		(void) timeout(cpu_delayed_check_ce_errors, arg,
		    drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC));
	}
}

/*
 * CE Deferred Re-enable after trap.
 *
 * When the CPU gets a disrupting trap for any of the errors
 * controlled by the CEEN bit, CEEN is disabled in the trap handler
 * immediately. To eliminate the possibility of multiple CEs causing
 * recursive stack overflow in the trap handler, we cannot
 * reenable CEEN while still running in the trap handler. Instead,
 * after a CE is logged on a CPU, we schedule a timeout function,
 * cpu_check_ce_errors(), to trigger after cpu_ceen_delay_secs
 * seconds. This function will check whether any further CEs
 * have occurred on that CPU, and if none have, will reenable CEEN.
 *
 * If further CEs have occurred while CEEN is disabled, another
 * timeout will be scheduled. This is to ensure that the CPU can
 * make progress in the face of CE 'storms', and that it does not
 * spend all its time logging CE errors.
 */
static void
cpu_check_ce_errors(void *arg)
{
	int	cpuid = (int)(uintptr_t)arg;
	cpu_t	*cp;

	/*
	 * We acquire cpu_lock.
	 */
	ASSERT(curthread->t_pil == 0);

	/*
	 * verify that the cpu is still around, DR
	 * could have got there first ...
	 */
	mutex_enter(&cpu_lock);
	cp = cpu_get(cpuid);
	if (cp == NULL) {
		mutex_exit(&cpu_lock);
		return;
	}
	/*
	 * make sure we don't migrate across CPUs
	 * while checking our CE status.
	 */
	kpreempt_disable();

	/*
	 * If we are running on the CPU that got the
	 * CE, we can do the checks directly.
	 */
	if (cp->cpu_id == CPU->cpu_id) {
		mutex_exit(&cpu_lock);
		cpu_check_ce(TIMEOUT_CEEN_CHECK, 0, 0, 0);
		kpreempt_enable();
		return;
	}
	kpreempt_enable();

	/*
	 * send an x-call to get the CPU that originally
	 * got the CE to do the necessary checks. If we can't
	 * send the x-call, reschedule the timeout, otherwise we
	 * lose CEEN forever on that CPU.
	 */
	if (CPU_XCALL_READY(cp->cpu_id) && (!(cp->cpu_flags & CPU_QUIESCED))) {
		xc_one(cp->cpu_id, (xcfunc_t *)cpu_check_ce,
		    TIMEOUT_CEEN_CHECK, 0);
		mutex_exit(&cpu_lock);
	} else {
		/*
		 * When the CPU is not accepting xcalls, or
		 * the processor is offlined, we don't want to
		 * incur the extra overhead of trying to schedule the
		 * CE timeout indefinitely. However, we don't want to lose
		 * CE checking forever.
		 *
		 * Keep rescheduling the timeout, accepting the additional
		 * overhead as the cost of correctness in the case where we
		 * get a CE, disable CEEN, offline the CPU during the
		 * timeout interval, and then online it at some
		 * point in the future. This is unlikely given the short
		 * cpu_ceen_delay_secs.
		 */
		mutex_exit(&cpu_lock);
		(void) timeout(cpu_delayed_check_ce_errors,
		    (void *)(uintptr_t)cp->cpu_id,
		    drv_usectohz((clock_t)cpu_ceen_delay_secs * MICROSEC));
	}
}
/*
 * This routine will check whether CEs have occurred while
 * CEEN is disabled. Any CEs detected will be logged and, if
 * possible, scrubbed.
 *
 * The memscrubber will also use this routine to clear any errors
 * caused by its scrubbing with CEEN disabled.
 *
 * flag == SCRUBBER_CEEN_CHECK
 *		called from memscrubber, just check/scrub, no reset
 *		paddr	physical addr. for start of scrub pages
 *		vaddr	virtual addr. for scrub area
 *		psz	page size of area to be scrubbed
 *
 * flag == TIMEOUT_CEEN_CHECK
 *		timeout function has triggered, reset timeout or CEEN
 *
 * Note: We must not migrate cpus during this function.  This can be
 * achieved by one of:
 *    - invoking as target of an x-call in which case we're at XCALL_PIL
 *	The flag value must be first xcall argument.
 *    - disabling kernel preemption.  This should be done for very short
 *	periods so is not suitable for SCRUBBER_CEEN_CHECK where we might
 *	scrub an extended area with cpu_check_block.  The call for
 *	TIMEOUT_CEEN_CHECK uses this so cpu_check_ce must be kept
 *	brief for this case.
 *    - binding to a cpu, eg with thread_affinity_set().  This is used
 *	in the SCRUBBER_CEEN_CHECK case, but is not practical for
 *	the TIMEOUT_CEEN_CHECK because both need cpu_lock.
 */
void
cpu_check_ce(int flag, uint64_t pa, caddr_t va, uint_t psz)
{
	ch_cpu_errors_t	cpu_error_regs;
	uint64_t	ec_err_enable;
	uint64_t	page_offset;

	/* Read AFSR */
	get_cpu_error_state(&cpu_error_regs);

	/*
	 * If no CEEN errors have occurred during the timeout
	 * interval, it is safe to re-enable CEEN and exit.
	 */
	if (((cpu_error_regs.afsr & C_AFSR_CECC_ERRS) |
	    (cpu_error_regs.afsr_ext & C_AFSR_EXT_CECC_ERRS)) == 0) {
		if (flag == TIMEOUT_CEEN_CHECK &&
		    !((ec_err_enable = get_error_enable()) & EN_REG_CEEN))
			set_error_enable(ec_err_enable | EN_REG_CEEN);
		return;
	}

	/*
	 * Ensure that CEEN was not reenabled (maybe by DR) before
	 * we log/clear the error.
	 */
	if ((ec_err_enable = get_error_enable()) & EN_REG_CEEN)
		set_error_enable(ec_err_enable & ~EN_REG_CEEN);

	/*
	 * log/clear the CE. If CE_CEEN_DEFER is passed, the
	 * timeout will be rescheduled when the error is logged.
	 */
	if (!((cpu_error_regs.afsr & cpu_ce_not_deferred) |
	    (cpu_error_regs.afsr_ext & cpu_ce_not_deferred_ext)))
		cpu_ce_detected(&cpu_error_regs,
		    CE_CEEN_DEFER | CE_CEEN_TIMEOUT);
	else
		cpu_ce_detected(&cpu_error_regs, CE_CEEN_TIMEOUT);

	/*
	 * If the memory scrubber runs while CEEN is
	 * disabled, (or if CEEN is disabled during the
	 * scrub as a result of a CE being triggered by
	 * it), the range being scrubbed will not be
	 * completely cleaned. If there are multiple CEs
	 * in the range at most two of these will be dealt
	 * with, (one by the trap handler and one by the
	 * timeout). It is also possible that none are dealt
	 * with, (CEEN disabled and another CE occurs before
	 * the timeout triggers). So to ensure that the
	 * memory is actually scrubbed, we have to access each
	 * memory location in the range and then check whether
	 * that access causes a CE.
	 */
	if (flag == SCRUBBER_CEEN_CHECK && va) {
		if ((cpu_error_regs.afar >= pa) &&
		    (cpu_error_regs.afar < (pa + psz))) {
			/*
			 * Force a load from physical memory for each
			 * 64-byte block, then check AFSR to determine
			 * whether this access caused an error.
			 *
			 * This is a slow way to do a scrub, but as it will
			 * only be invoked when the memory scrubber actually
			 * triggered a CE, it should not happen too
			 * frequently.
			 *
			 * cut down what we need to check as the scrubber
			 * has verified up to AFAR, so get its offset
			 * into the page and start there.
			 */
			page_offset = (uint64_t)(cpu_error_regs.afar &
			    (psz - 1));
			va = (caddr_t)(va + (P2ALIGN(page_offset, 64)));
			psz -= (uint_t)(P2ALIGN(page_offset, 64));
			cpu_check_block((caddr_t)(P2ALIGN((uint64_t)va, 64)),
			    psz);
		}
	}

	/*
	 * Reset error enable if this CE is not masked.
	 */
	if ((flag == TIMEOUT_CEEN_CHECK) &&
	    (cpu_error_regs.afsr & cpu_ce_not_deferred))
		set_error_enable(ec_err_enable | EN_REG_CEEN);
}
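/*
 * Usage note (added for clarity, not part of the original source): the two
 * flag values reach cpu_check_ce() by different routes, as described in the
 * block comment above.  Sketches of the two call styles:
 *
 *	TIMEOUT_CEEN_CHECK - sent as the first x-call argument:
 *		xc_one(cpuid, (xcfunc_t *)cpu_check_ce,
 *		    TIMEOUT_CEEN_CHECK, 0);
 *
 *	SCRUBBER_CEEN_CHECK - called directly by the memscrubber while it is
 *	bound to the cpu, passing the scrub window:
 *		cpu_check_ce(SCRUBBER_CEEN_CHECK, pa, va, psz);
 */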
/*
 * Attempt a cpu logout for an error that we did not trap for, such
 * as a CE noticed with CEEN off.  It is assumed that we are still running
 * on the cpu that took the error and that we cannot migrate.  Returns
 * 0 on success, otherwise nonzero.
 */
static int
cpu_ce_delayed_ec_logout(uint64_t afar)
{
	ch_cpu_logout_t *clop;

	if (CPU_PRIVATE(CPU) == NULL)
		return (0);

	clop = CPU_PRIVATE_PTR(CPU, chpr_cecc_logout);
	if (atomic_cas_64(&clop->clo_data.chd_afar, LOGOUT_INVALID, afar) !=
	    LOGOUT_INVALID)
		return (0);

	cpu_delayed_logout(afar, clop);
	return (1);
}
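/*
 * Note (added): the atomic_cas_64() above is what makes the delayed logout
 * safe without locks - the logout area is claimed by swapping chd_afar from
 * LOGOUT_INVALID to the AFAR of this error, so only one claimant can fill
 * it; if the area already holds an unconsumed logout (chd_afar is not
 * LOGOUT_INVALID) the function backs off rather than overwrite it.
 */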
/*
 * We got an error while CEEN was disabled. We
 * need to clean up after it and log whatever
 * information we have on the CE.
 */
void
cpu_ce_detected(ch_cpu_errors_t *cpu_error_regs, int flag)
{
	ch_async_flt_t	ch_flt;
	struct async_flt *aflt;
	char		pr_reason[MAX_REASON_STRING];

	bzero(&ch_flt, sizeof (ch_async_flt_t));
	ch_flt.flt_trapped_ce = flag;
	aflt = (struct async_flt *)&ch_flt;
	aflt->flt_stat = cpu_error_regs->afsr & C_AFSR_MASK;
	ch_flt.afsr_ext = cpu_error_regs->afsr_ext;
	ch_flt.afsr_errs = (cpu_error_regs->afsr_ext & C_AFSR_EXT_ALL_ERRS) |
	    (cpu_error_regs->afsr & C_AFSR_ALL_ERRS);
	aflt->flt_addr = cpu_error_regs->afar;
#if defined(SERRANO)
	ch_flt.afar2 = cpu_error_regs->afar2;
#endif	/* SERRANO */
	aflt->flt_pc = NULL;
	aflt->flt_priv = ((cpu_error_regs->afsr & C_AFSR_PRIV) != 0);
	aflt->flt_panic = 0;
	cpu_log_and_clear_ce(&ch_flt);

	/*
	 * check if we caused any errors during cleanup
	 */
	if (clear_errors(&ch_flt)) {
		pr_reason[0] = '\0';
		(void) cpu_queue_events(&ch_flt, pr_reason, ch_flt.afsr_errs,
		    NULL);
	}
}
/*
 * Log/clear CEEN-controlled disrupting errors
 */
static void
cpu_log_and_clear_ce(ch_async_flt_t *ch_flt)
{
	struct async_flt *aflt;
	uint64_t afsr, afsr_errs;
	ch_cpu_logout_t *clop;
	char pr_reason[MAX_REASON_STRING];
	on_trap_data_t *otp = curthread->t_ontrap;

	aflt = (struct async_flt *)ch_flt;
	afsr = aflt->flt_stat;
	afsr_errs = ch_flt->afsr_errs;
	aflt->flt_id = gethrtime_waitfree();
	aflt->flt_bus_id = getprocessorid();
	aflt->flt_inst = CPU->cpu_id;
	aflt->flt_prot = AFLT_PROT_NONE;
	aflt->flt_class = CPU_FAULT;
	aflt->flt_status = ECC_C_TRAP;

	pr_reason[0] = '\0';
	/*
	 * Get the CPU log out info for Disrupting Trap.
	 */
	if (CPU_PRIVATE(CPU) == NULL) {
		clop = NULL;
		ch_flt->flt_diag_data.chd_afar = LOGOUT_INVALID;
	} else {
		clop = CPU_PRIVATE_PTR(CPU, chpr_cecc_logout);
	}

	if (clop && ch_flt->flt_trapped_ce & CE_CEEN_TIMEOUT) {
		ch_cpu_errors_t cpu_error_regs;

		get_cpu_error_state(&cpu_error_regs);
		(void) cpu_ce_delayed_ec_logout(cpu_error_regs.afar);
		clop->clo_data.chd_afsr = cpu_error_regs.afsr;
		clop->clo_data.chd_afar = cpu_error_regs.afar;
		clop->clo_data.chd_afsr_ext = cpu_error_regs.afsr_ext;
		clop->clo_sdw_data.chd_afsr = cpu_error_regs.shadow_afsr;
		clop->clo_sdw_data.chd_afar = cpu_error_regs.shadow_afar;
		clop->clo_sdw_data.chd_afsr_ext =
		    cpu_error_regs.shadow_afsr_ext;
#if defined(SERRANO)
		clop->clo_data.chd_afar2 = cpu_error_regs.afar2;
#endif	/* SERRANO */
		ch_flt->flt_data_incomplete = 1;

		/*
		 * The logging/clear code expects AFSR/AFAR to be cleared.
		 * The trap handler does it for CEEN enabled errors
		 * so we need to do it here.
		 */
		set_cpu_error_state(&cpu_error_regs);
	}

#if defined(JALAPENO) || defined(SERRANO)
	/*
	 * FRC: Can't scrub memory as we don't have AFAR for Jalapeno.
	 * For Serrano, even though we do have the AFAR, we still do the
	 * scrub on the RCE side since that's where the error type can
	 * be properly classified as intermittent, persistent, etc.
	 *
	 * CE/RCE:  If error is in memory and AFAR is valid, scrub the memory.
	 * Must scrub memory before cpu_queue_events, as scrubbing memory sets
	 * the flt_status bits.
	 */
	if ((afsr & (C_AFSR_CE|C_AFSR_RCE)) &&
	    (cpu_flt_in_memory(ch_flt, (afsr & C_AFSR_CE)) ||
	    cpu_flt_in_memory(ch_flt, (afsr & C_AFSR_RCE)))) {
		cpu_ce_scrub_mem_err(aflt, B_TRUE);
	}
#else	/* JALAPENO || SERRANO */
	/*
	 * CE/EMC:  If error is in memory and AFAR is valid, scrub the memory.
	 * Must scrub memory before cpu_queue_events, as scrubbing memory sets
	 * the flt_status bits.
	 */
	if (afsr & (C_AFSR_CE|C_AFSR_EMC)) {
		if (cpu_flt_in_memory(ch_flt, (afsr & C_AFSR_CE)) ||
		    cpu_flt_in_memory(ch_flt, (afsr & C_AFSR_EMC))) {
			cpu_ce_scrub_mem_err(aflt, B_TRUE);
		}
	}
#endif /* JALAPENO || SERRANO */

	/*
	 * Update flt_prot if this error occurred under on_trap protection.
	 */
	if (otp != NULL && (otp->ot_prot & OT_DATA_EC))
		aflt->flt_prot = AFLT_PROT_EC;

	/*
	 * Queue events on the async event queue, one event per error bit.
	 */
	if (cpu_queue_events(ch_flt, pr_reason, afsr_errs, clop) == 0 ||
	    (afsr_errs & (C_AFSR_CECC_ERRS | C_AFSR_EXT_CECC_ERRS)) == 0) {
		ch_flt->flt_type = CPU_INV_AFSR;
		cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_INVALID_AFSR,
		    (void *)ch_flt, sizeof (ch_async_flt_t), ue_queue,
		    aflt->flt_panic);
	}

	/*
	 * Zero out + invalidate CPU logout.
	 */
	if (clop) {
		bzero(clop, sizeof (ch_cpu_logout_t));
		clop->clo_data.chd_afar = LOGOUT_INVALID;
	}

	/*
	 * If either a CPC, WDC or EDC error has occurred while CEEN
	 * was disabled, we need to flush either the entire
	 * E$ or an E$ line.
	 */
#if defined(JALAPENO) || defined(SERRANO)
	if (afsr & (C_AFSR_EDC | C_AFSR_CPC | C_AFSR_CPU | C_AFSR_WDC))
#else	/* JALAPENO || SERRANO */
	if (afsr_errs & (C_AFSR_EDC | C_AFSR_CPC | C_AFSR_WDC | C_AFSR_L3_EDC |
	    C_AFSR_L3_CPC | C_AFSR_L3_WDC))
#endif /* JALAPENO || SERRANO */
		cpu_error_ecache_flush(ch_flt);
}
/*
 * depending on the error type, we determine whether we
 * need to flush the entire ecache or just a line.
 */
static int
cpu_error_ecache_flush_required(ch_async_flt_t *ch_flt)
{
	struct async_flt	*aflt;
	uint64_t	afsr;
	uint64_t	afsr_errs = ch_flt->afsr_errs;

	aflt = (struct async_flt *)ch_flt;
	afsr = aflt->flt_stat;

	/*
	 * If we got multiple errors, no point in trying
	 * the individual cases, just flush the whole cache
	 */
	if (afsr & C_AFSR_ME) {
		return (ECACHE_FLUSH_ALL);
	}

	/*
	 * If either a CPC, WDC or EDC error has occurred while CEEN
	 * was disabled, we need to flush entire E$. We can't just
	 * flush the cache line affected as the ME bit
	 * is not set when multiple correctable errors of the same
	 * type occur, so we might have multiple CPC or EDC errors,
	 * with only the first recorded.
	 */
#if defined(JALAPENO) || defined(SERRANO)
	if (afsr & (C_AFSR_CPC | C_AFSR_CPU | C_AFSR_EDC | C_AFSR_WDC)) {
#else	/* JALAPENO || SERRANO */
	if (afsr_errs & (C_AFSR_CPC | C_AFSR_EDC | C_AFSR_WDC | C_AFSR_L3_CPC |
	    C_AFSR_L3_EDC | C_AFSR_L3_WDC)) {
#endif /* JALAPENO || SERRANO */
		return (ECACHE_FLUSH_ALL);
	}

#if defined(JALAPENO) || defined(SERRANO)
	/*
	 * If only UE or RUE is set, flush the Ecache line, otherwise
	 * flush the entire Ecache.
	 */
	if (afsr & (C_AFSR_UE|C_AFSR_RUE)) {
		if ((afsr & C_AFSR_ALL_ERRS) == C_AFSR_UE ||
		    (afsr & C_AFSR_ALL_ERRS) == C_AFSR_RUE) {
			return (ECACHE_FLUSH_LINE);
		} else {
			return (ECACHE_FLUSH_ALL);
		}
	}
#else /* JALAPENO || SERRANO */
	/*
	 * If UE only is set, flush the Ecache line, otherwise
	 * flush the entire Ecache.
	 */
	if (afsr_errs & C_AFSR_UE) {
		if ((afsr_errs & (C_AFSR_ALL_ERRS | C_AFSR_EXT_ALL_ERRS)) ==
		    C_AFSR_UE) {
			return (ECACHE_FLUSH_LINE);
		} else {
			return (ECACHE_FLUSH_ALL);
		}
	}
#endif /* JALAPENO || SERRANO */

	/*
	 * EDU: If EDU only is set, flush the ecache line, otherwise
	 * flush the entire Ecache.
	 */
	if (afsr_errs & (C_AFSR_EDU | C_AFSR_L3_EDU)) {
		if (((afsr_errs & ~C_AFSR_EDU) == 0) ||
		    ((afsr_errs & ~C_AFSR_L3_EDU) == 0)) {
			return (ECACHE_FLUSH_LINE);
		} else {
			return (ECACHE_FLUSH_ALL);
		}
	}

	/*
	 * BERR: If BERR only is set, flush the Ecache line, otherwise
	 * flush the entire Ecache.
	 */
	if (afsr_errs & C_AFSR_BERR) {
		if ((afsr_errs & ~C_AFSR_BERR) == 0) {
			return (ECACHE_FLUSH_LINE);
		} else {
			return (ECACHE_FLUSH_ALL);
		}
	}

	return (0);
}
static void
cpu_error_ecache_flush(ch_async_flt_t *ch_flt)
{
	int	ecache_flush_flag =
	    cpu_error_ecache_flush_required(ch_flt);

	/*
	 * Flush Ecache line or entire Ecache based on above checks.
	 */
	if (ecache_flush_flag == ECACHE_FLUSH_ALL)
		cpu_flush_ecache();
	else if (ecache_flush_flag == ECACHE_FLUSH_LINE) {
		cpu_flush_ecache_line(ch_flt);
	}
}
/*
 * Extract the PA portion from the E$ tag.
 */
uint64_t
cpu_ectag_to_pa(int setsize, uint64_t tag)
{
	if (IS_JAGUAR(cpunodes[CPU->cpu_id].implementation))
		return (JG_ECTAG_TO_PA(setsize, tag));
	else if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation))
		return (PN_L3TAG_TO_PA(tag));
	else
		return (CH_ECTAG_TO_PA(setsize, tag));
}
/*
 * Convert the E$ tag PA into an E$ subblock index.
 */
int
cpu_ectag_pa_to_subblk(int cachesize, uint64_t subaddr)
{
	if (IS_JAGUAR(cpunodes[CPU->cpu_id].implementation))
		return (JG_ECTAG_PA_TO_SUBBLK(cachesize, subaddr));
	else if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation))
		/* Panther has only one subblock per line */
		return (0);
	else
		return (CH_ECTAG_PA_TO_SUBBLK(cachesize, subaddr));
}
/*
 * All subblocks in an E$ line must be invalid for
 * the line to be invalid.
 */
int
cpu_ectag_line_invalid(int cachesize, uint64_t tag)
{
	if (IS_JAGUAR(cpunodes[CPU->cpu_id].implementation))
		return (JG_ECTAG_LINE_INVALID(cachesize, tag));
	else if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation))
		return (PN_L3_LINE_INVALID(tag));
	else
		return (CH_ECTAG_LINE_INVALID(cachesize, tag));
}
/*
 * Extract state bits for a subblock given the tag.  Note that for Panther
 * this works on both l2 and l3 tags.
 */
int
cpu_ectag_pa_to_subblk_state(int cachesize, uint64_t subaddr, uint64_t tag)
{
	if (IS_JAGUAR(cpunodes[CPU->cpu_id].implementation))
		return (JG_ECTAG_PA_TO_SUBBLK_STATE(cachesize, subaddr, tag));
	else if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation))
		return (tag & CH_ECSTATE_MASK);
	else
		return (CH_ECTAG_PA_TO_SUBBLK_STATE(cachesize, subaddr, tag));
}
/*
 * Cpu specific initialization.
 */
void
cpu_mp_init(void)
{
#ifdef	CHEETAHPLUS_ERRATUM_25
	if (cheetah_sendmondo_recover) {
		cheetah_nudge_init();
	}
#endif
}
void
cpu_ereport_post(struct async_flt *aflt)
{
	char *cpu_type, buf[FM_MAX_CLASS];
	nv_alloc_t *nva = NULL;
	nvlist_t *ereport, *detector, *resource;
	errorq_elem_t *eqep;
	ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt;
	char unum[UNUM_NAMLEN];
	int synd_code;
	uint8_t msg_type;
	plat_ecc_ch_async_flt_t	plat_ecc_ch_flt;

	if (aflt->flt_panic || panicstr) {
		eqep = errorq_reserve(ereport_errorq);
		if (eqep == NULL)
			return;
		ereport = errorq_elem_nvl(ereport_errorq, eqep);
		nva = errorq_elem_nva(ereport_errorq, eqep);
	} else {
		ereport = fm_nvlist_create(nva);
	}

	/*
	 * Create the scheme "cpu" FMRI.
	 */
	detector = fm_nvlist_create(nva);
	resource = fm_nvlist_create(nva);
	switch (cpunodes[aflt->flt_inst].implementation) {
	case CHEETAH_IMPL:
		cpu_type = FM_EREPORT_CPU_USIII;
		break;
	case CHEETAH_PLUS_IMPL:
		cpu_type = FM_EREPORT_CPU_USIIIplus;
		break;
	case JALAPENO_IMPL:
		cpu_type = FM_EREPORT_CPU_USIIIi;
		break;
	case SERRANO_IMPL:
		cpu_type = FM_EREPORT_CPU_USIIIiplus;
		break;
	case JAGUAR_IMPL:
		cpu_type = FM_EREPORT_CPU_USIV;
		break;
	case PANTHER_IMPL:
		cpu_type = FM_EREPORT_CPU_USIVplus;
		break;
	default:
		cpu_type = FM_EREPORT_CPU_UNSUPPORTED;
		break;
	}

	cpu_fmri_cpu_set(detector, aflt->flt_inst);

	/*
	 * Encode all the common data into the ereport.
	 */
	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s.%s",
	    FM_ERROR_CPU, cpu_type, aflt->flt_erpt_class);

	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
	    fm_ena_generate_cpu(aflt->flt_id, aflt->flt_inst, FM_ENA_FMT1),
	    detector, NULL);

	/*
	 * Encode the error specific data that was saved in
	 * the async_flt structure into the ereport.
	 */
	cpu_payload_add_aflt(aflt, ereport, resource,
	    &plat_ecc_ch_flt.ecaf_afar_status,
	    &plat_ecc_ch_flt.ecaf_synd_status);

	if (aflt->flt_panic || panicstr) {
		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
	} else {
		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
		fm_nvlist_destroy(ereport, FM_NVA_FREE);
		fm_nvlist_destroy(detector, FM_NVA_FREE);
		fm_nvlist_destroy(resource, FM_NVA_FREE);
	}

	/*
	 * Send the enhanced error information (plat_ecc_error2_data_t)
	 * to the SC only if it can process it.
	 */
	if (&plat_ecc_capability_sc_get &&
	    plat_ecc_capability_sc_get(PLAT_ECC_ERROR2_MESSAGE)) {
		msg_type = cpu_flt_bit_to_plat_error(aflt);
		if (msg_type != PLAT_ECC_ERROR2_NONE) {
			/*
			 * If afar status is not invalid do a unum lookup.
			 */
			if (plat_ecc_ch_flt.ecaf_afar_status !=
			    AFLT_STAT_INVALID) {
				synd_code = synd_to_synd_code(
				    plat_ecc_ch_flt.ecaf_synd_status,
				    aflt->flt_synd, ch_flt->flt_bit);
				(void) cpu_get_mem_unum_synd(synd_code,
				    aflt, unum);
			} else {
				unum[0] = '\0';
			}
			plat_ecc_ch_flt.ecaf_sdw_afar = ch_flt->flt_sdw_afar;
			plat_ecc_ch_flt.ecaf_sdw_afsr = ch_flt->flt_sdw_afsr;
			plat_ecc_ch_flt.ecaf_afsr_ext = ch_flt->afsr_ext;
			plat_ecc_ch_flt.ecaf_sdw_afsr_ext =
			    ch_flt->flt_sdw_afsr_ext;

			if (&plat_log_fruid_error2)
				plat_log_fruid_error2(msg_type, unum, aflt,
				    &plat_ecc_ch_flt);
		}
	}
}
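/*
 * Note (added, not in the original source): the class string built above is
 * simply "<FM_ERROR_CPU>.<cpu_type>.<flt_erpt_class>", so with the usual
 * macro values a correctable error on a Cheetah cpu ends up classified as
 * something like "ereport.cpu.ultraSPARC-III.ce" once fm_ereport_set()
 * supplies the leading ereport prefix.
 */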
void
cpu_run_bus_error_handlers(struct async_flt *aflt, int expected)
{
	int status;
	ddi_fm_error_t de;

	bzero(&de, sizeof (ddi_fm_error_t));

	de.fme_version = DDI_FME_VERSION;
	de.fme_ena = fm_ena_generate_cpu(aflt->flt_id, aflt->flt_inst,
	    FM_ENA_FMT1);
	de.fme_flag = expected;
	de.fme_bus_specific = (void *)aflt->flt_addr;
	status = ndi_fm_handler_dispatch(ddi_root_node(), NULL, &de);
	if ((aflt->flt_prot == AFLT_PROT_NONE) && (status == DDI_FM_FATAL))
		aflt->flt_panic = 1;
}
void
cpu_errorq_dispatch(char *error_class, void *payload, size_t payload_sz,
    errorq_t *eqp, uint_t flag)
{
	struct async_flt *aflt = (struct async_flt *)payload;

	aflt->flt_erpt_class = error_class;
	errorq_dispatch(eqp, payload, payload_sz, flag);
}
/*
 * This routine may be called by the IO module, but does not do
 * anything in this cpu module.  The SERD algorithm is handled by
 * cpumem-diagnosis engine instead.
 */
void
cpu_ce_count_unum(struct async_flt *ecc, int len, char *unum)
{}
void
adjust_hw_copy_limits(int ecache_size)
{
	/*
	 * Set hw copy limits.
	 *
	 * /etc/system will be parsed later and can override one or more
	 * of these settings.
	 *
	 * At this time, ecache size seems only mildly relevant.
	 * We seem to run into issues with the d-cache and stalls
	 * from copy reads/writes instead.
	 *
	 * Cycle measurement indicates that 2 byte aligned copies fare
	 * little better than doing things with VIS at around 512 bytes.
	 * 4 byte aligned shows promise until around 1024 bytes. 8 Byte
	 * aligned is faster whenever the source and destination data
	 * in cache and the total size is less than 2 Kbytes.  The 2K
	 * limit seems to be driven by the 2K write cache.
	 * When more than 2K of copies are done in non-VIS mode, stores
	 * backup in the write cache.  In VIS mode, the write cache is
	 * bypassed, allowing faster cache-line writes aligned on cache
	 * boundaries.
	 *
	 * In addition, in non-VIS mode, there is no prefetching, so
	 * for larger copies, the advantage of prefetching to avoid even
	 * occasional cache misses is enough to justify using the VIS code.
	 *
	 * During testing, it was discovered that netbench ran 3% slower
	 * when hw_copy_limit_8 was 2K or larger.  Apparently for server
	 * applications, data is only used once (copied to the output
	 * buffer, then copied by the network device off the system).  Using
	 * the VIS copy saves more L2 cache state.  Network copies are
	 * around 1.3K to 1.5K in size for historical reasons.
	 *
	 * Therefore, a limit of 1K bytes will be used for the 8 byte
	 * aligned copy even for large caches and 8 MB ecache.  The
	 * infrastructure to allow different limits for different sized
	 * caches is kept to allow further tuning in later releases.
	 */

	if (min_ecache_size == 0 && use_hw_bcopy) {
		/*
		 * First time through - should be before /etc/system
		 * is read.
		 * Could skip the checks for zero but this lets us
		 * preserve any debugger rewrites.
		 */
		if (hw_copy_limit_1 == 0) {
			hw_copy_limit_1 = VIS_COPY_THRESHOLD;
			priv_hcl_1 = hw_copy_limit_1;
		}
		if (hw_copy_limit_2 == 0) {
			hw_copy_limit_2 = 2 * VIS_COPY_THRESHOLD;
			priv_hcl_2 = hw_copy_limit_2;
		}
		if (hw_copy_limit_4 == 0) {
			hw_copy_limit_4 = 4 * VIS_COPY_THRESHOLD;
			priv_hcl_4 = hw_copy_limit_4;
		}
		if (hw_copy_limit_8 == 0) {
			hw_copy_limit_8 = 4 * VIS_COPY_THRESHOLD;
			priv_hcl_8 = hw_copy_limit_8;
		}
		min_ecache_size = ecache_size;
	} else {
		/*
		 * MP initialization. Called *after* /etc/system has
		 * been parsed. One CPU has already been initialized.
		 * Need to cater for /etc/system having scragged one
		 * of our values.
		 */
		if (ecache_size == min_ecache_size) {
			/*
			 * Same size ecache. We do nothing unless we
			 * have a pessimistic ecache setting. In that
			 * case we become more optimistic (if the cache is
			 * large enough).
			 */
			if (hw_copy_limit_8 == 4 * VIS_COPY_THRESHOLD) {
				/*
				 * Need to adjust hw_copy_limit* from our
				 * pessimistic uniprocessor value to a more
				 * optimistic UP value *iff* it hasn't been
				 * reset.
				 */
				if ((ecache_size > 1048576) &&
				    (priv_hcl_8 == hw_copy_limit_8)) {
					if (ecache_size <= 2097152)
						hw_copy_limit_8 = 4 *
						    VIS_COPY_THRESHOLD;
					else if (ecache_size <= 4194304)
						hw_copy_limit_8 = 4 *
						    VIS_COPY_THRESHOLD;
					else
						hw_copy_limit_8 = 4 *
						    VIS_COPY_THRESHOLD;
					priv_hcl_8 = hw_copy_limit_8;
				}
			}
		} else if (ecache_size < min_ecache_size) {
			/*
			 * A different ecache size. Can this even happen?
			 */
			if (priv_hcl_8 == hw_copy_limit_8) {
				/*
				 * The previous value that we set
				 * is unchanged (i.e., it hasn't been
				 * scragged by /etc/system). Rewrite it.
				 */
				if (ecache_size <= 1048576)
					hw_copy_limit_8 = 8 *
					    VIS_COPY_THRESHOLD;
				else if (ecache_size <= 2097152)
					hw_copy_limit_8 = 8 *
					    VIS_COPY_THRESHOLD;
				else if (ecache_size <= 4194304)
					hw_copy_limit_8 = 8 *
					    VIS_COPY_THRESHOLD;
				else
					hw_copy_limit_8 = 10 *
					    VIS_COPY_THRESHOLD;
				priv_hcl_8 = hw_copy_limit_8;
				min_ecache_size = ecache_size;
			}
		}
	}
}
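/*
 * Tuning note (added, not in the original source): the limits adjusted
 * above are ordinary kernel variables, so they can be pinned from
 * /etc/system, e.g.:
 *
 *	set hw_copy_limit_8 = 1024
 *	set use_hw_bcopy = 0
 *
 * A value set that way no longer matches the remembered priv_hcl_* copy,
 * so the MP pass above treats it as "scragged" and leaves it alone.
 */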
/*
 * Called from illegal instruction trap handler to see if we can attribute
 * the trap to a fpras check.
 */
int
fpras_chktrap(struct regs *rp)
{
	int op;
	struct fpras_chkfngrp *cgp;
	uintptr_t tpc = (uintptr_t)rp->r_pc;

	if (fpras_chkfngrps == NULL)
		return (0);

	cgp = &fpras_chkfngrps[CPU->cpu_id];
	for (op = 0; op < FPRAS_NCOPYOPS; ++op) {
		if (tpc >= (uintptr_t)&cgp->fpras_fn[op].fpras_blk0 &&
		    tpc < (uintptr_t)&cgp->fpras_fn[op].fpras_chkresult)
			break;
	}
	if (op == FPRAS_NCOPYOPS)
		return (0);

	/*
	 * This is an fpRAS failure caught through an illegal
	 * instruction - trampoline.
	 */
	rp->r_pc = (uintptr_t)&cgp->fpras_fn[op].fpras_trampoline;
	rp->r_npc = rp->r_pc + 4;
	return (1);
}
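/*
 * Note (added): the range check above attributes the illegal-instruction
 * trap to a particular copy operation by testing whether the trapping PC
 * lies inside that operation's check-function block; a nonzero return
 * tells the trap handler that PC/nPC have been redirected to the
 * per-operation trampoline and execution can simply continue there.
 */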
/*
 * fpras_failure is called when a fpras check detects a bad calculation
 * result or an illegal instruction trap is attributed to an fpras
 * check.  In all cases we are still bound to CPU.
 */
int
fpras_failure(int op, int how)
{
	int use_hw_bcopy_orig, use_hw_bzero_orig;
	uint_t hcl1_orig, hcl2_orig, hcl4_orig, hcl8_orig;
	ch_async_flt_t ch_flt;
	struct async_flt *aflt = (struct async_flt *)&ch_flt;
	struct fpras_chkfn *sfp, *cfp;
	uint32_t *sip, *cip;
	int i;

	/*
	 * We're running on a sick CPU.  Avoid further FPU use at least for
	 * the time in which we dispatch an ereport and (if applicable) panic.
	 */
	use_hw_bcopy_orig = use_hw_bcopy;
	use_hw_bzero_orig = use_hw_bzero;
	hcl1_orig = hw_copy_limit_1;
	hcl2_orig = hw_copy_limit_2;
	hcl4_orig = hw_copy_limit_4;
	hcl8_orig = hw_copy_limit_8;
	use_hw_bcopy = use_hw_bzero = 0;
	hw_copy_limit_1 = hw_copy_limit_2 = hw_copy_limit_4 =
	    hw_copy_limit_8 = 0;

	bzero(&ch_flt, sizeof (ch_async_flt_t));
	aflt->flt_id = gethrtime_waitfree();
	aflt->flt_class = CPU_FAULT;
	aflt->flt_inst = CPU->cpu_id;
	aflt->flt_status = (how << 8) | op;
	aflt->flt_payload = FM_EREPORT_PAYLOAD_FPU_HWCOPY;
	ch_flt.flt_type = CPU_FPUERR;

	/*
	 * We must panic if the copy operation had no lofault protection -
	 * ie, don't panic for copyin, copyout, kcopy and bcopy called
	 * under on_fault and do panic for unprotected bcopy and hwblkpagecopy.
	 */
	aflt->flt_panic = (curthread->t_lofault == NULL);

	/*
	 * XOR the source instruction block with the copied instruction
	 * block - this will show us which bit(s) are corrupted.
	 */
	sfp = (struct fpras_chkfn *)fpras_chkfn_type1;
	cfp = &fpras_chkfngrps[CPU->cpu_id].fpras_fn[op];
	if (op == FPRAS_BCOPY || op == FPRAS_COPYOUT) {
		sip = &sfp->fpras_blk0[0];
		cip = &cfp->fpras_blk0[0];
	} else {
		sip = &sfp->fpras_blk1[0];
		cip = &cfp->fpras_blk1[0];
	}
	for (i = 0; i < 16; ++i, ++sip, ++cip)
		ch_flt.flt_fpdata[i] = *sip ^ *cip;

	cpu_errorq_dispatch(FM_EREPORT_CPU_USIII_FPU_HWCOPY, (void *)&ch_flt,
	    sizeof (ch_async_flt_t), ue_queue, aflt->flt_panic);

	if (aflt->flt_panic)
		fm_panic("FPU failure on CPU %d", CPU->cpu_id);

	/*
	 * We get here for copyin/copyout and kcopy or bcopy where the
	 * caller has used on_fault.  We will flag the error so that
	 * the process may be killed.  The trap_async_hwerr mechanism will
	 * take appropriate further action (such as a reboot, contract
	 * notification etc).  Since we may be continuing we will
	 * restore the global hardware copy acceleration switches.
	 *
	 * When we return from this function to the copy function we want to
	 * avoid potentially bad data being used, ie we want the affected
	 * copy function to return an error.  The caller should therefore
	 * invoke its lofault handler (which always exists for these functions)
	 * which will return the appropriate error.
	 */
	ttolwp(curthread)->lwp_pcb.pcb_flags |= ASYNC_HWERR;

	use_hw_bcopy = use_hw_bcopy_orig;
	use_hw_bzero = use_hw_bzero_orig;
	hw_copy_limit_1 = hcl1_orig;
	hw_copy_limit_2 = hcl2_orig;
	hw_copy_limit_4 = hcl4_orig;
	hw_copy_limit_8 = hcl8_orig;

	return (0);
}
#define	VIS_BLOCKSIZE		64

int
dtrace_blksuword32_err(uintptr_t addr, uint32_t *data)
{
	int ret, watched;

	watched = watch_disable_addr((void *)addr, VIS_BLOCKSIZE, S_WRITE);
	ret = dtrace_blksuword32(addr, data, 0);
	if (watched)
		watch_enable_addr((void *)addr, VIS_BLOCKSIZE, S_WRITE);

	return (ret);
}
/*
 * Called when a cpu enters the CPU_FAULTED state (by the cpu placing the
 * faulted cpu into that state).  Cross-trap to the faulted cpu to clear
 * CEEN from the EER to disable traps for further disrupting error types
 * on that cpu.  We could cross-call instead, but that has a larger
 * instruction and data footprint than cross-trapping, and the cpu is known
 * to be faulted.
 */
void
cpu_faulted_enter(struct cpu *cp)
{
	xt_one(cp->cpu_id, set_error_enable_tl1, EN_REG_CEEN, EER_SET_CLRBITS);
}
/*
 * Called when a cpu leaves the CPU_FAULTED state to return to one of
 * offline, spare, or online (by the cpu requesting this state change).
 * First we cross-call to clear the AFSR (and AFSR_EXT on Panther) of
 * disrupting error bits that have accumulated without trapping, then
 * we cross-trap to re-enable CEEN controlled traps.
 */
void
cpu_faulted_exit(struct cpu *cp)
{
	ch_cpu_errors_t cpu_error_regs;

	cpu_error_regs.afsr = C_AFSR_CECC_ERRS;
	if (IS_PANTHER(cpunodes[cp->cpu_id].implementation))
		cpu_error_regs.afsr_ext &= C_AFSR_EXT_CECC_ERRS;
	xc_one(cp->cpu_id, (xcfunc_t *)set_cpu_error_state,
	    (uint64_t)&cpu_error_regs, 0);

	xt_one(cp->cpu_id, set_error_enable_tl1, EN_REG_CEEN, EER_SET_SETBITS);
}
/*
 * Return 1 if the errors in ch_flt's AFSR are secondary errors caused by
 * the errors in the original AFSR, 0 otherwise.
 *
 * For all procs if the initial error was a BERR or TO, then it is possible
 * that we may have caused a secondary BERR or TO in the process of logging the
 * initial error via cpu_run_bus_error_handlers().  If this is the case then
 * if the request was protected then a panic is still not necessary, if not
 * protected then aft_panic is already set - so either way there's no need
 * to set aft_panic for the secondary error.
 *
 * For Cheetah and Jalapeno, if the original error was a UE which occurred on
 * a store merge, then the error handling code will call cpu_deferred_error().
 * When clear_errors() is called, it will determine that secondary errors have
 * occurred - in particular, the store merge also caused a EDU and WDU that
 * weren't discovered until this point.
 *
 * We do three checks to verify that we are in this case.  If we pass all three
 * checks, we return 1 to indicate that we should not panic.  If any unexpected
 * errors occur, we return 0.
 *
 * For Cheetah+ and derivative procs, the store merge causes a DUE, which is
 * handled in cpu_disrupting_errors().  Since this function is not even called
 * in the case we are interested in, we just return 0 for these processors.
 */
static int
cpu_check_secondary_errors(ch_async_flt_t *ch_flt, uint64_t t_afsr_errs,
    uint64_t t_afar)
{
#if defined(CHEETAH_PLUS)
#else	/* CHEETAH_PLUS */
	struct async_flt *aflt = (struct async_flt *)ch_flt;
#endif	/* CHEETAH_PLUS */

	/*
	 * Was the original error a BERR or TO and only a BERR or TO
	 * (multiple errors are also OK)
	 */
	if ((t_afsr_errs & ~(C_AFSR_BERR | C_AFSR_TO | C_AFSR_ME)) == 0) {
		/*
		 * Is the new error a BERR or TO and only a BERR or TO
		 * (multiple errors are also OK)
		 */
		if ((ch_flt->afsr_errs &
		    ~(C_AFSR_BERR | C_AFSR_TO | C_AFSR_ME)) == 0)
			return (1);
	}

#if defined(CHEETAH_PLUS)
	return (0);
#else	/* CHEETAH_PLUS */
	/*
	 * Now look for secondary effects of a UE on cheetah/jalapeno
	 *
	 * Check the original error was a UE, and only a UE.  Note that
	 * the ME bit will cause us to fail this check.
	 */
	if (t_afsr_errs != C_AFSR_UE)
		return (0);

	/*
	 * Check the secondary errors were exclusively an EDU and/or WDU.
	 */
	if ((ch_flt->afsr_errs & ~(C_AFSR_EDU|C_AFSR_WDU)) != 0)
		return (0);

	/*
	 * Check the AFAR of the original error and secondary errors
	 * match to the 64-byte boundary
	 */
	if (P2ALIGN(aflt->flt_addr, 64) != P2ALIGN(t_afar, 64))
		return (0);

	/*
	 * We've passed all the checks, so it's a secondary error!
	 */
	return (1);
#endif /* CHEETAH_PLUS */
}
/*
 * Translate the flt_bit or flt_type into an error type.  First, flt_bit
 * is checked for any valid errors.  If found, the error type is
 * returned.  If not found, the flt_type is checked for L1$ parity errors.
 */
static uint8_t
cpu_flt_bit_to_plat_error(struct async_flt *aflt)
{
#if defined(JALAPENO)
	/*
	 * Currently, logging errors to the SC is not supported on Jalapeno
	 */
	return (PLAT_ECC_ERROR2_NONE);
#else
	ch_async_flt_t *ch_flt = (ch_async_flt_t *)aflt;

	switch (ch_flt->flt_bit) {
	case C_AFSR_CE:
		return (PLAT_ECC_ERROR2_CE);
	case C_AFSR_UCC:
	case C_AFSR_EDC:
	case C_AFSR_WDC:
	case C_AFSR_CPC:
		return (PLAT_ECC_ERROR2_L2_CE);
	case C_AFSR_EMC:
		return (PLAT_ECC_ERROR2_EMC);
	case C_AFSR_IVC:
		return (PLAT_ECC_ERROR2_IVC);
	case C_AFSR_UE:
		return (PLAT_ECC_ERROR2_UE);
	case C_AFSR_UCU:
	case C_AFSR_EDU:
	case C_AFSR_WDU:
	case C_AFSR_CPU:
		return (PLAT_ECC_ERROR2_L2_UE);
	case C_AFSR_IVU:
		return (PLAT_ECC_ERROR2_IVU);
	case C_AFSR_TO:
		return (PLAT_ECC_ERROR2_TO);
	case C_AFSR_BERR:
		return (PLAT_ECC_ERROR2_BERR);
#if defined(CHEETAH_PLUS)
	case C_AFSR_L3_UCC:
	case C_AFSR_L3_EDC:
	case C_AFSR_L3_WDC:
	case C_AFSR_L3_CPC:
		return (PLAT_ECC_ERROR2_L3_CE);
	case C_AFSR_IMC:
		return (PLAT_ECC_ERROR2_IMC);
	case C_AFSR_TSCE:
		return (PLAT_ECC_ERROR2_L2_TSCE);
	case C_AFSR_THCE:
		return (PLAT_ECC_ERROR2_L2_THCE);
	case C_AFSR_L3_MECC:
		return (PLAT_ECC_ERROR2_L3_MECC);
	case C_AFSR_L3_THCE:
		return (PLAT_ECC_ERROR2_L3_THCE);
	case C_AFSR_L3_UCU:
	case C_AFSR_L3_EDU:
	case C_AFSR_L3_WDU:
	case C_AFSR_L3_CPU:
		return (PLAT_ECC_ERROR2_L3_UE);
	case C_AFSR_DUE:
		return (PLAT_ECC_ERROR2_DUE);
	case C_AFSR_DTO:
		return (PLAT_ECC_ERROR2_DTO);
	case C_AFSR_DBERR:
		return (PLAT_ECC_ERROR2_DBERR);
#endif /* CHEETAH_PLUS */
	}

	switch (ch_flt->flt_type) {
#if defined(CPU_IMP_L1_CACHE_PARITY)
	case CPU_IC_PARITY:
		return (PLAT_ECC_ERROR2_IPE);
	case CPU_DC_PARITY:
		if (IS_PANTHER(cpunodes[CPU->cpu_id].implementation)) {
			if (ch_flt->parity_data.dpe.cpl_cache ==
			    CPU_PC_PARITY) {
				return (PLAT_ECC_ERROR2_PCACHE);
			}
		}
		return (PLAT_ECC_ERROR2_DPE);
#endif /* CPU_IMP_L1_CACHE_PARITY */
	case CPU_ITLB_PARITY:
		return (PLAT_ECC_ERROR2_ITLB);
	case CPU_DTLB_PARITY:
		return (PLAT_ECC_ERROR2_DTLB);
	default:
		return (PLAT_ECC_ERROR2_NONE);
	}
#endif /* JALAPENO */
}