Merge illumos-gate
[unleashed/lotheac.git] / usr / src / uts / i86pc / cpu / generic_cpu / gcpu_mca.c
blobc0530a78325a911dfa4f041de5197e4a1c204d20
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright (c) 2010, Intel Corporation.
27 * All rights reserved.
30 #include <sys/mca_x86.h>
31 #include <sys/cpu_module_impl.h>
32 #include <sys/cpu_module_ms.h>
33 #include <sys/cmn_err.h>
34 #include <sys/cpuvar.h>
35 #include <sys/pghw.h>
36 #include <sys/x86_archext.h>
37 #include <sys/sysmacros.h>
38 #include <sys/regset.h>
39 #include <sys/privregs.h>
40 #include <sys/systm.h>
41 #include <sys/types.h>
42 #include <sys/log.h>
43 #include <sys/psw.h>
44 #include <sys/fm/protocol.h>
45 #include <sys/fm/util.h>
46 #include <sys/errorq.h>
47 #include <sys/mca_x86.h>
48 #include <sys/fm/cpu/GMCA.h>
49 #include <sys/fm/smb/fmsmb.h>
50 #include <sys/sysevent.h>
51 #include <sys/ontrap.h>
53 #include "gcpu.h"
55 extern int x86gentopo_legacy; /* x86 generic topology support */
57 static uint_t gcpu_force_addr_in_payload = 0;
60 * Clear to log telemetry found at initialization. While processor docs
61 * say you should process this telemetry on all but Intel family 0x6
62 * there are way too many exceptions and we want to avoid bogus
63 * diagnoses.
65 int gcpu_suppress_log_on_init = 1;
68 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
69 * error logout time. The stack will be included in the ereport if the
70 * error type selects stack inclusion, or in all cases if
71 * gcpu_mca_stack_ereport_include is nonzero.
73 int gcpu_mca_stack_flag = 0;
74 int gcpu_mca_stack_ereport_include = 0;
77 * The number of times to re-read MCA telemetry to try to obtain a
78 * consistent snapshot if we find it to be changing under our feet.
80 int gcpu_mca_telemetry_retries = 5;
82 int gcpu_mca_cmci_throttling_threshold = 10;
83 int gcpu_mca_cmci_reenable_threshold = 1000;
85 static gcpu_error_disp_t gcpu_errtypes[] = {
88 * Unclassified
91 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
92 NULL,
93 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
94 MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
95 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
99 * Microcode ROM Parity Error
102 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
103 NULL,
104 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
105 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
106 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
110 * External - BINIT# from another processor during power-on config
113 FM_EREPORT_CPU_GENERIC_EXTERNAL,
114 NULL,
115 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
116 MCAX86_SIMPLE_EXTERNAL_MASKON,
117 MCAX86_SIMPLE_EXTERNAL_MASKOFF
121 * Functional redundancy check master/slave error
124 FM_EREPORT_CPU_GENERIC_FRC,
125 NULL,
126 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
127 MCAX86_SIMPLE_FRC_MASKON,
128 MCAX86_SIMPLE_FRC_MASKOFF
132 * Internal parity error
135 FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
136 NULL,
137 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
138 MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
139 MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
144 * Internal timer error
147 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
148 NULL,
149 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
150 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
151 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
155 * Internal unclassified
158 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
159 NULL,
160 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
161 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
162 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
166 * Compound error codes - generic memory hierarchy
169 FM_EREPORT_CPU_GENERIC_GENMEMHIER,
170 NULL,
171 FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
172 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
173 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
177 * Compound error codes - TLB errors
180 FM_EREPORT_CPU_GENERIC_TLB,
181 "%1$s" "TLB" "%2$s" "_ERR",
182 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
183 MCAX86_COMPOUND_TLB_MASKON,
184 MCAX86_COMPOUND_TLB_MASKOFF
188 * Compound error codes - memory hierarchy
191 FM_EREPORT_CPU_GENERIC_MEMHIER,
192 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
193 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
194 MCAX86_COMPOUND_MEMHIER_MASKON,
195 MCAX86_COMPOUND_MEMHIER_MASKOFF
199 * Compound error codes - bus and interconnect errors
202 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
203 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
204 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
205 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
206 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
209 * Compound error codes - memory controller errors
212 FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
213 "MC" "_" "%8$s" "_" "%9$s" "_ERR",
214 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
215 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
216 MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
220 static gcpu_error_disp_t gcpu_unknown = {
221 FM_EREPORT_CPU_GENERIC_UNKNOWN,
222 "UNKNOWN",
223 FM_EREPORT_PAYLOAD_FLAGS_COMMON,
228 static errorq_t *gcpu_mca_queue;
229 static kmutex_t gcpu_mca_queue_lock;
231 static int isxpv = 0;
233 static const gcpu_error_disp_t *
234 gcpu_disp_match(uint16_t code)
236 const gcpu_error_disp_t *ged = gcpu_errtypes;
237 int i;
239 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
240 i++, ged++) {
241 uint16_t on = ged->ged_errcode_mask_on;
242 uint16_t off = ged->ged_errcode_mask_off;
244 if ((code & on) == on && (code & off) == 0)
245 return (ged);
248 return (NULL);
/*
 * Isolate the bits of code selected by mask and shift them right by
 * shift positions.
 */
static uint16_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	uint16_t field = code & mask;

	return (field >> shift);
}
/*
 * Extract the named sub-field of an MCA error code using the
 * MCAX86_ERRCODE_<name>_MASK / MCAX86_ERRCODE_<name>_SHIFT pair.
 */
#define	BIT_STRIP(code, name) \
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)

/* Placeholder mnemonics for values with no defined meaning. */
#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"
/*
 * One row of a mnemonic table.  The tables that follow map TT, LL, RRRR,
 * PP, II, T, CCCC and MMM field values to the text used in compound error
 * names and to the text used as ereport class name components.
 */
struct gcpu_mnexp {
	const char *mne_compound;	/* text for the compound error name */
	const char *mne_ereport;	/* text for the ereport class */
};
274 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
275 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */
276 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */
277 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */
278 { GCPU_MNEMONIC_UNDEF, "" }
281 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
282 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */
283 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */
284 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */
285 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */
288 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
289 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */
290 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */
291 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */
292 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */
293 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */
294 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */
295 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */
296 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */
297 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */
300 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
301 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */
302 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */
303 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */
304 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */
307 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
308 { "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */
309 { GCPU_MNEMONIC_RESVD, "" },
310 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */
311 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */
314 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */
315 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */
316 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */
319 static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
320 { "CH0", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH0 */
321 { "CH1", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH1 */
322 { "CH2", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH2 */
323 { "CH3", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH3 */
324 { "CH4", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH4 */
325 { "CH5", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH5 */
326 { "CH6", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH6 */
327 { "CH7", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH7 */
328 { "CH8", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH8 */
329 { "CH9", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH9 */
330 { "CH10", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH10 */
331 { "CH11", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH11 */
332 { "CH12", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH12 */
333 { "CH13", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH13 */
334 { "CH14", FM_EREPORT_CPU_GENERIC_CCCC }, /* CH14 */
335 { "CH", FM_EREPORT_CPU_GENERIC_CCCC } /* GEN */
338 static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
339 { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR }, /* GEN ERR */
340 { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD }, /* READ */
341 { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR }, /* WRITE */
342 { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD }, /* ADDR, CMD */
343 { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
344 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */
345 { GCPU_MNEMONIC_RESVD, ""}, /* RESERVED */
346 { GCPU_MNEMONIC_RESVD, ""} /* RESERVED */
/* Selects which column of a struct gcpu_mnexp table a lookup returns. */
enum gcpu_mn_namespace {
	GCPU_MN_NAMESPACE_COMPOUND,	/* mne_compound */
	GCPU_MN_NAMESPACE_EREPORT	/* mne_ereport */
};
354 static const char *
355 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val,
356 enum gcpu_mn_namespace nspace)
358 if (val >= tbl_sz || val > 0xff)
359 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */
361 switch (nspace) {
362 case GCPU_MN_NAMESPACE_COMPOUND:
363 return (tbl[val].mne_compound);
364 /*NOTREACHED*/
366 case GCPU_MN_NAMESPACE_EREPORT:
367 return (tbl[val].mne_ereport);
368 /*NOTREACHED*/
370 default:
371 return (GCPU_MNEMONIC_UNDEF);
372 /*NOTREACHED*/
/*
 * An ereport class leaf component is either a plain string or a string
 * with one or more embedded %n$s specifiers, i.e. positional selection of
 * string arguments.  The kernel snprintf does not support %n$ (and
 * teaching it to do so is too big a headache), so this restricted format
 * is expanded by gcpu_mn_fmt() instead.
 */

/* Number of positional string arguments available to a format. */
#define	GCPU_CLASS_VARCOMPS	9

/*
 * Look up the mnemonic for the named sub-field of code in the matching
 * gcpu_<name>_mnemonics table.
 */
#define	GCPU_MNEMONIC(code, name, nspace) \
	gcpu_mnemonic(gcpu_##name##_mnemonics, \
	sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
	BIT_STRIP(code, name), nspace)
391 static void
392 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
393 enum gcpu_mn_namespace nspace)
395 uint16_t code = MCAX86_ERRCODE(status);
396 const char *mn[GCPU_CLASS_VARCOMPS];
397 char *p = buf; /* current position in buf */
398 char *q = buf + buflen; /* pointer past last char in buf */
399 int which, expfmtchar, error;
400 char c;
402 mn[0] = GCPU_MNEMONIC(code, TT, nspace);
403 mn[1] = GCPU_MNEMONIC(code, LL, nspace);
404 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
405 mn[3] = GCPU_MNEMONIC(code, PP, nspace);
406 mn[4] = GCPU_MNEMONIC(code, II, nspace);
407 mn[5] = GCPU_MNEMONIC(code, T, nspace);
408 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
409 mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
410 mn[8] = GCPU_MNEMONIC(code, MMM, nspace);
412 while (p < q - 1 && (c = *fmt++) != '\0') {
413 if (c != '%') {
414 /* not the beginning of a format specifier - copy */
415 *p++ = c;
416 continue;
419 error = 0;
420 which = -1;
421 expfmtchar = -1;
423 nextfmt:
424 if ((c = *fmt++) == '\0')
425 break; /* early termination of fmt specifier */
427 switch (c) {
428 case '1':
429 case '2':
430 case '3':
431 case '4':
432 case '5':
433 case '6':
434 case '7':
435 case '8':
436 case '9':
437 if (which != -1) { /* allow only one positional digit */
438 error++;
439 break;
441 which = c - '1';
442 goto nextfmt;
443 /*NOTREACHED*/
445 case '$':
446 if (which == -1) { /* no position specified */
447 error++;
448 break;
450 expfmtchar = 's';
451 goto nextfmt;
452 /*NOTREACHED*/
454 case 's':
455 if (expfmtchar != 's') {
456 error++;
457 break;
459 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
460 mn[which]);
461 p += strlen(p);
462 break;
464 default:
465 error++;
466 break;
469 if (error)
470 break;
473 *p = '\0'; /* NUL termination */
476 static void
477 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
478 const char *cpuclass, const char *leafclass)
480 char *p = buf; /* current position in buf */
481 char *q = buf + buflen; /* pointer past last char in buf */
483 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
484 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
486 p += strlen(p);
487 if (p >= q)
488 return;
490 if (leafclass == NULL) {
491 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
492 GCPU_MN_NAMESPACE_EREPORT);
493 } else {
494 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
495 leafclass);
500 * Create an "hc" scheme FMRI identifying the given cpu with
501 * motherboard/chip/core/strand instance numbers.
503 static nvlist_t *
504 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
506 nvlist_t *nvl, *fmri;
508 if ((nvl = fm_nvlist_create(nva)) == NULL)
509 return (NULL);
511 if (!x86gentopo_legacy) {
512 fmri = cmi_hdl_smb_bboard(hdl);
513 if (fmri == NULL)
514 return (NULL);
516 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
517 NULL, NULL, fmri, 3,
518 "chip", cmi_hdl_smb_chipid(hdl),
519 "core", cmi_hdl_coreid(hdl),
520 "strand", cmi_hdl_strandid(hdl));
521 } else {
522 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
523 "motherboard", 0,
524 "chip", cmi_hdl_chipid(hdl),
525 "core", cmi_hdl_coreid(hdl),
526 "strand", cmi_hdl_strandid(hdl));
529 return (nvl);
532 int gcpu_bleat_count_thresh = 5;
533 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
536 * Called when we are unable to propogate a logout structure onto an
537 * errorq for subsequent ereport preparation and logging etc. The caller
538 * should usually only decide to call this for severe errors - those we
539 * suspect we may need to panic for.
541 static void
542 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
544 hrtime_t now = gethrtime_waitfree();
545 static hrtime_t gcpu_last_bleat;
546 gcpu_bank_logout_t *gbl;
547 static int bleatcount;
548 int i;
551 * Throttle spamming of the console. The first gcpu_bleat_count_thresh
552 * can come as fast as we like, but once we've spammed that many
553 * to the console we require a minimum interval to pass before
554 * any more complaints.
556 if (++bleatcount > gcpu_bleat_count_thresh) {
557 if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
558 return;
559 else
560 bleatcount = 0;
562 gcpu_last_bleat = now;
564 cmn_err(CE_WARN,
565 "Machine-Check Errors unlogged on chip %d core %d strand %d, "
566 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
567 cmi_hdl_strandid(hdl));
568 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
569 (u_longlong_t)gcl->gcl_mcg_status);
570 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
571 uint64_t status = gbl->gbl_status;
573 if (!(status & MSR_MC_STATUS_VAL))
574 continue;
576 /* Force ADDRV for AMD Family 0xf and above */
577 if (gcpu_force_addr_in_payload)
578 status = status | MSR_MC_STATUS_ADDRV;
580 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
581 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
582 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
583 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
584 i, IA32_MSR_MC(i, STATUS),
585 (u_longlong_t)gbl->gbl_status,
586 (u_longlong_t)gbl->gbl_addr,
587 (u_longlong_t)gbl->gbl_misc);
588 break;
590 case MSR_MC_STATUS_ADDRV:
591 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
592 "STAT 0x%016llx ADDR 0x%016llx",
593 i, IA32_MSR_MC(i, STATUS),
594 (u_longlong_t)gbl->gbl_status,
595 (u_longlong_t)gbl->gbl_addr);
596 break;
598 case MSR_MC_STATUS_MISCV:
599 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
600 "STAT 0x%016llx MISC 0x%016llx",
601 i, IA32_MSR_MC(i, STATUS),
602 (u_longlong_t)gbl->gbl_status,
603 (u_longlong_t)gbl->gbl_misc);
604 break;
606 default:
607 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
608 "STAT 0x%016llx",
609 i, IA32_MSR_MC(i, STATUS),
610 (u_longlong_t)gbl->gbl_status);
611 break;
/*
 * Expands to the (name, type, value) argument triple that reports the
 * MSR_MC_STATUS_<what> bit of status as a boolean payload member.
 */
#define	_GCPU_BSTATUS(status, what) \
	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
621 static void
622 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
623 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
625 uint64_t members = ged ? ged->ged_ereport_members :
626 FM_EREPORT_PAYLOAD_FLAGS_COMMON;
627 uint64_t mcg = gcl->gcl_mcg_status;
628 int mcip = mcg & MCG_STATUS_MCIP;
629 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
630 uint64_t bstat = gbl->gbl_status;
633 * Include the compound error name if requested and if this
634 * is a compound error type.
636 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
637 ged->ged_compound_fmt != NULL) {
638 char buf[FM_MAX_CLASS];
640 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
641 GCPU_MN_NAMESPACE_COMPOUND);
642 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
643 DATA_TYPE_STRING, buf, NULL);
647 * Include disposition information for this error
649 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
650 gbl->gbl_disp != 0) {
651 int i, empty = 1;
652 char buf[128];
653 char *p = buf, *q = buf + 128;
654 static struct _gcpu_disp_name {
655 uint64_t dv;
656 const char *dn;
657 } disp_names[] = {
658 { CMI_ERRDISP_CURCTXBAD,
659 "processor_context_corrupt" },
660 { CMI_ERRDISP_RIPV_INVALID,
661 "return_ip_invalid" },
662 { CMI_ERRDISP_UC_UNCONSTRAINED,
663 "unconstrained" },
664 { CMI_ERRDISP_FORCEFATAL,
665 "forcefatal" },
666 { CMI_ERRDISP_IGNORED,
667 "ignored" },
668 { CMI_ERRDISP_PCC_CLEARED,
669 "corrupt_context_cleared" },
670 { CMI_ERRDISP_UC_CLEARED,
671 "uncorrected_data_cleared" },
672 { CMI_ERRDISP_POISONED,
673 "poisoned" },
674 { CMI_ERRDISP_INCONSISTENT,
675 "telemetry_unstable" },
678 for (i = 0; i < sizeof (disp_names) /
679 sizeof (struct _gcpu_disp_name); i++) {
680 if ((gbl->gbl_disp & disp_names[i].dv) == 0)
681 continue;
683 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
684 "%s%s", empty ? "" : ",", disp_names[i].dn);
685 p += strlen(p);
686 empty = 0;
689 if (p != buf)
690 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
691 DATA_TYPE_STRING, buf, NULL);
695 * If MCG_STATUS is included add that and an indication of whether
696 * this ereport was the result of a machine check or poll.
698 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
699 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
700 DATA_TYPE_UINT64, mcg, NULL);
702 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
703 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
707 * If an instruction pointer is to be included add one provided
708 * MCG_STATUS indicated it is valid; meaningless for polled events.
710 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
711 mcg & MCG_STATUS_EIPV) {
712 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
713 DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
717 * Add an indication of whether the trap occured during privileged code.
719 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
720 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
721 DATA_TYPE_BOOLEAN_VALUE,
722 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
726 * If requested, add the index of the MCA bank. This indicates the
727 * n'th bank of 4 MCA registers, and does not necessarily correspond
728 * to MCi_* - use the bank offset to correlate
730 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
731 fm_payload_set(ereport,
732 /* Bank number */
733 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
734 /* Offset of MCi_CTL */
735 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
736 IA32_MSR_MC(bankno, CTL),
737 NULL);
741 * Add MCi_STATUS if requested, and decode it.
743 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
744 const char *tbes[] = {
745 "No tracking", /* 00 */
746 "Green - below threshold", /* 01 */
747 "Yellow - above threshold", /* 10 */
748 "Reserved" /* 11 */
751 fm_payload_set(ereport,
752 /* Bank MCi_STATUS */
753 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
754 /* Overflow? */
755 _GCPU_BSTATUS(bstat, OVER),
756 /* Uncorrected? */
757 _GCPU_BSTATUS(bstat, UC),
758 /* Enabled? */
759 _GCPU_BSTATUS(bstat, EN),
760 /* Processor context corrupt? */
761 _GCPU_BSTATUS(bstat, PCC),
762 /* Error code */
763 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
764 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
765 /* Model-specific error code */
766 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
767 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
768 NULL);
771 * If MCG_CAP.TES_P indicates that that thresholding info
772 * is present in the architural component of the bank status
773 * then include threshold information for this bank.
775 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
776 fm_payload_set(ereport,
777 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
778 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
779 NULL);
784 * Add MCi_ADDR info if requested and valid. We force addition of
785 * MCi_ADDR, even if its not valid on AMD family 0xf and above,
786 * to aid in analysis of ereports, for WatchDog errors.
788 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
789 ((bstat & MSR_MC_STATUS_ADDRV) ||
790 gcpu_force_addr_in_payload)) {
791 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
792 DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
796 * MCi_MISC if requested and MCi_STATUS.MISCV).
798 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
799 bstat & MSR_MC_STATUS_MISCV) {
800 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
801 DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
807 * Construct and post an ereport based on the logout information from a
808 * single MCA bank. We are not necessarily running on the cpu that
809 * detected the error.
811 static void
812 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
813 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
815 gcpu_data_t *gcpu = gcl->gcl_gcpu;
816 cmi_hdl_t hdl = gcpu->gcpu_hdl;
817 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
818 const char *cpuclass = NULL, *leafclass = NULL;
819 uint16_t code = MCAX86_ERRCODE(status);
820 errorq_elem_t *eqep, *scr_eqep;
821 nvlist_t *ereport, *detector;
822 char buf[FM_MAX_CLASS];
823 const char *classfmt;
824 nv_alloc_t *nva;
826 if (panicstr) {
827 if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
828 return;
829 ereport = errorq_elem_nvl(ereport_errorq, eqep);
832 * Allocate another element for scratch space, but fallback
833 * to the one we have if that fails. We'd like to use the
834 * additional scratch space for nvlist construction.
836 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
837 nva = errorq_elem_nva(ereport_errorq, scr_eqep);
838 else
839 nva = errorq_elem_nva(ereport_errorq, eqep);
840 } else {
841 ereport = fm_nvlist_create(NULL);
842 nva = NULL;
845 if (ereport == NULL)
846 return;
849 * Common payload data required by the protocol:
850 * - ereport class
851 * - detector
852 * - ENA
856 * Ereport class - call into model-specific support to allow it to
857 * provide a cpu class or leaf class, otherwise calculate our own.
859 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
860 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
861 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
862 leafclass);
865 * The detector FMRI.
867 if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
868 nva)) == NULL)
869 detector = gcpu_fmri_create(hdl, nva);
872 * Should we define a new ENA format 3?? for chip/core/strand?
873 * It will be better when virtualized.
875 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
876 fm_ena_generate_cpu(gcl->gcl_timestamp,
877 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
878 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
880 if (panicstr) {
881 fm_nvlist_destroy(detector, FM_NVA_RETAIN);
882 nv_alloc_reset(nva);
883 } else {
884 fm_nvlist_destroy(detector, FM_NVA_FREE);
888 * Add the architectural ereport class-specific payload data.
890 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
893 * Allow model-specific code to add ereport members.
895 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
896 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
899 * Include stack if options is turned on and either selected in
900 * the payload member bitmask or inclusion is forced.
902 if (gcpu_mca_stack_flag &&
903 (cms_ereport_includestack(hdl, mscookie) ==
904 B_TRUE || gcpu_mca_stack_ereport_include)) {
905 fm_payload_stack_add(ereport, gcl->gcl_stack,
906 gcl->gcl_stackdepth);
910 * If injection has taken place anytime in the past then note this
911 * on the ereport.
913 if (cmi_inj_tainted() == B_TRUE) {
914 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
915 B_TRUE, NULL);
919 * Post ereport.
921 if (panicstr) {
922 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
923 if (scr_eqep)
924 errorq_cancel(ereport_errorq, scr_eqep);
925 } else {
926 (void) fm_ereport_post(ereport, EVCH_TRYHARD);
927 fm_nvlist_destroy(ereport, FM_NVA_FREE);
932 /*ARGSUSED*/
933 void
934 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
936 const gcpu_logout_t *gcl = data;
937 const gcpu_bank_logout_t *gbl;
938 int ismc;
939 int i;
941 ismc = gcl->ismc;
942 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
943 const gcpu_error_disp_t *gened;
944 cms_cookie_t mscookie;
946 if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
947 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
948 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
951 * Perform a match based on IA32 MCA architectural
952 * components alone.
954 gened = gcpu_disp_match(code); /* may be NULL */
957 * Now see if an model-specific match can be made.
959 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc,
960 i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
961 gcl->gcl_ms_logout);
964 * Prepare and dispatch an ereport for logging and
965 * diagnosis.
967 gcpu_ereport_post(gcl, i, gened, mscookie,
968 gbl->gbl_status);
969 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
970 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
972 * Telemetry kept changing as we tried to read
973 * it. Force an unknown ereport leafclass but
974 * keep the telemetry unchanged for logging.
976 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
977 gbl->gbl_status);
982 static size_t gcpu_mca_queue_datasz = 0;
985 * The following code is ready to make a weak attempt at growing the
986 * errorq structure size. Since it is not foolproof (we don't know
987 * who may already be producing to the outgoing errorq) our caller
988 * instead assures that we'll always be called with no greater data
989 * size than on our first call.
991 static void
992 gcpu_errorq_init(size_t datasz)
994 int slots;
996 mutex_enter(&gcpu_mca_queue_lock);
998 if (gcpu_mca_queue_datasz >= datasz) {
999 mutex_exit(&gcpu_mca_queue_lock);
1000 return;
1003 membar_producer();
1004 if (gcpu_mca_queue) {
1005 gcpu_mca_queue_datasz = 0;
1006 errorq_destroy(gcpu_mca_queue);
1009 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
1010 slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
1012 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
1013 NULL, slots, datasz, 1, ERRORQ_VITAL);
1015 if (gcpu_mca_queue != NULL)
1016 gcpu_mca_queue_datasz = datasz;
1018 mutex_exit(&gcpu_mca_queue_lock);
1022 * Perform MCA initialization as described in section 14.6 of Intel 64
1023 * and IA-32 Architectures Software Developer's Manual Volume 3A.
1026 static uint_t global_nbanks;
1028 void
1029 gcpu_mca_init(cmi_hdl_t hdl)
1031 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1032 uint64_t cap;
1033 uint_t vendor = cmi_hdl_vendor(hdl);
1034 uint_t family = cmi_hdl_family(hdl);
1035 uint_t rev = cmi_hdl_chiprev(hdl);
1036 gcpu_mca_t *mca = &gcpu->gcpu_mca;
1037 int mcg_ctl_present;
1038 uint_t nbanks;
1039 uint32_t ctl_skip_mask = 0;
1040 uint32_t status_skip_mask = 0;
1041 size_t mslsz;
1042 int i;
1043 int mcg_ctl2_present;
1044 uint32_t cmci_capable = 0;
1045 if (gcpu == NULL)
1046 return;
1048 /* We add MCi_ADDR always for AMD Family 0xf and above */
1049 if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B))
1050 gcpu_force_addr_in_payload = 1;
1053 * Protect from some silly /etc/system settings.
1055 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
1056 gcpu_mca_telemetry_retries = 5;
1058 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
1059 return;
1062 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
1063 * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier
1064 * processors, which have their own more primitive way of doing
1065 * machine checks, will not have cmi_mca_init called since their
1066 * CPUID information will not indicate both MCA and MCE features.
1068 ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA));
1071 * Determine whether the IA32_MCG_CTL register is present. If it
1072 * is we will enable all features by writing -1 to it towards
1073 * the end of this initialization; if it is absent then volume 3A
1074 * says we must nonetheless continue to initialize the individual
1075 * banks.
1077 mcg_ctl_present = cap & MCG_CAP_CTL_P;
1078 mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
1081 * We squirrel values away for inspection/debugging.
1083 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
1084 if (mcg_ctl_present)
1085 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
1086 &mca->gcpu_mca_bioscfg.bios_mcg_ctl);
1089 * Determine the number of error-reporting banks implemented.
1091 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
1093 if (nbanks != 0 && global_nbanks == 0)
1094 global_nbanks = nbanks; /* no race - BSP will get here first */
1097 * If someone is hiding the number of banks (perhaps we are fully
1098 * virtualized?) or if this processor has more banks than the
1099 * first to set global_nbanks then bail. The latter requirement
1100 * is because we need to size our errorq data structure and we
1101 * don't want to have to grow the errorq (destroy and recreate)
1102 * which may just lose some telemetry.
1104 if (nbanks == 0 || nbanks > global_nbanks)
1105 return;
1107 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
1108 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
1111 * Calculate the size we need to allocate for a gcpu_logout_t
1112 * with a gcl_data array big enough for all banks of this cpu.
1113 * Add any space requested by the model-specific logout support.
1115 mslsz = cms_logout_size(hdl);
1116 mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
1117 (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
1119 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
1120 gcpu_logout_t *gcl;
1122 mca->gcpu_mca_logout[i] = gcl =
1123 kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
1124 gcl->gcl_gcpu = gcpu;
1125 gcl->gcl_nbanks = nbanks;
1126 gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
1127 (char *)(&gcl->gcl_data[0]) + nbanks *
1128 sizeof (gcpu_bank_logout_t);
1133 mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
1135 mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
1136 KM_SLEEP);
1139 * Create our errorq to transport the logout structures. This
1140 * can fail so users of gcpu_mca_queue must be prepared for NULL.
1142 gcpu_errorq_init(mca->gcpu_mca_lgsz);
1145 * Not knowing which, if any, banks are shared between cores we
1146 * assure serialization of MCA bank initialization by each cpu
1147 * on the chip. On chip architectures in which some banks are
1148 * shared this will mean the shared resource is initialized more
1149 * than once - we're simply aiming to avoid simultaneous MSR writes
1150 * to the shared resource.
1152 * Even with these precautions, some platforms may yield a GP fault
1153 * if a core other than a designated master tries to write anything
1154 * but all 0's to MCi_{STATUS,ADDR,CTL}. So we will perform
1155 * those writes under on_trap protection.
1157 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1160 * Initialize poller data, but don't start polling yet.
1162 gcpu_mca_poll_init(hdl);
1165 * Work out which MCA banks we will initialize. In MCA logout
1166 * code we will only read those banks which we initialize here.
1168 for (i = 0; i < nbanks; i++) {
1169 boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
1170 boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);
1172 if (!cms_present(hdl)) {
1174 * Model-specific support is not present, try to use
1175 * sane defaults.
1177 * On AMD family 6 processors, reports about spurious
1178 * machine checks indicate that bank 0 should be
1179 * skipped.
1181 * On Intel family 6 processors, the documentation tells
1182 * us not to write to MC0_CTL.
1185 if (i == 0 && family == 6) {
1186 switch (vendor) {
1187 case X86_VENDOR_AMD:
1188 skipstatus = B_TRUE;
1189 /*FALLTHRU*/
1190 case X86_VENDOR_Intel:
1191 skipctl = B_TRUE;
1192 break;
1197 ctl_skip_mask |= skipctl << i;
1198 status_skip_mask |= skipstatus << i;
1200 if (skipctl && skipstatus)
1201 continue;
1204 * Record which MCA banks were enabled, from the point of view
1205 * of the whole chip (if some cores share a bank we must be
1206 * sure either can logout from it).
1208 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
1211 * check CMCI capability
1213 if (mcg_ctl2_present) {
1214 uint64_t ctl2;
1215 uint32_t cap = 0;
1216 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1217 if (ctl2 & MSR_MC_CTL2_EN)
1218 continue;
1219 ctl2 |= MSR_MC_CTL2_EN;
1220 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1221 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1222 mca->gcpu_bank_cmci[i].cmci_cap = cap =
1223 (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
1224 if (cap)
1225 cmci_capable ++;
1227 * Set the threshold to 1 while clearing the EN field, to avoid a
1228 * CMCI being triggered before the APIC LVT entry is initialized.
1230 ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1;
1231 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1234 * init cmci related count
1236 mca->gcpu_bank_cmci[i].cmci_enabled = 0;
1237 mca->gcpu_bank_cmci[i].drtcmci = 0;
1238 mca->gcpu_bank_cmci[i].ncmci = 0;
1242 if (cmci_capable)
1243 cmi_enable_cmci = 1;
1246 * Log any valid telemetry lurking in the MCA banks, but do not
1247 * clear the status registers. Ignore the disposition returned -
1248 * we have already panicked or reset for any nasty errors found here.
1250 * Intel vol 3A says that we should not do this on family 0x6,
1251 * and that for any extended family the BIOS clears things
1252 * on power-on reset so you'll only potentially find valid telemetry
1253 * on warm reset (we do it for both - on power-on reset we should
1254 * just see zeroes).
1256 * AMD docs since K7 say we should process anything we find here.
1258 if (!gcpu_suppress_log_on_init &&
1259 (vendor == X86_VENDOR_Intel && family >= 0xf ||
1260 vendor == X86_VENDOR_AMD))
1261 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
1262 GCPU_MPT_WHAT_POKE_ERR);
1265 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
1266 * model-specific module the power of veto.
1268 for (i = 0; i < nbanks; i++) {
1269 struct gcpu_bios_bankcfg *bcfgp =
1270 mca->gcpu_mca_bioscfg.bios_bankcfg + i;
1273 * Stash inherited bank MCA state, even for banks we will
1274 * not initialize ourselves. Do not read the MISC register
1275 * unconditionally - on some processors that will #GP on
1276 * banks that do not implement the MISC register (would be
1277 * caught by on_trap, anyway).
1279 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
1280 &bcfgp->bios_bank_ctl);
1282 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1283 &bcfgp->bios_bank_status);
1285 if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) ||
1286 gcpu_force_addr_in_payload) {
1287 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
1288 &bcfgp->bios_bank_addr);
1292 * In some old BIOS the status value after boot can indicate
1293 * MISCV when there is actually no MISC register for
1294 * that bank. The following read could therefore
1295 * aggravate a general protection fault. This should be
1296 * caught by on_trap, but the #GP fault handler is busted
1297 * and can suffer a double fault even before we get to
1298 * trap() to check for on_trap protection. Until that
1299 * issue is fixed we remove the one access that we know
1300 * can cause a #GP.
1302 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
1303 * (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
1304 * &bcfgp->bios_bank_misc);
1306 bcfgp->bios_bank_misc = 0;
1308 if (!(ctl_skip_mask & (1 << i))) {
1309 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
1310 cms_bankctl_val(hdl, i, -1ULL));
1313 if (!(status_skip_mask & (1 << i))) {
1314 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
1315 cms_bankstatus_val(hdl, i, 0ULL));
1319 * Now let the model-specific support perform further initialization
1320 * of non-architectural features.
1322 cms_mca_init(hdl, nbanks);
1324 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
1325 membar_producer();
1327 /* enable all machine-check features */
1328 if (mcg_ctl_present)
1329 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
1330 cms_mcgctl_val(hdl, nbanks, -1ULL));
1332 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
1334 /* enable machine-check exception in CR4 */
1335 cmi_hdl_enable_mce(hdl);
1338 static uint64_t
1339 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
1340 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
1342 int curctxbad = 0, unconstrained = 0, forcefatal = 0;
1343 gcpu_mca_t *mca = &gcpu->gcpu_mca;
1344 int nbanks = mca->gcpu_mca_nbanks;
1345 gcpu_mce_status_t mce;
1346 gcpu_bank_logout_t *gbl;
1347 uint64_t disp = 0;
1348 int i;
1350 if (mcesp == NULL)
1351 mcesp = &mce;
1353 mcesp->mce_nerr = nerr;
1355 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
1356 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
1357 mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
1360 * If this a machine check then if the return instruction pointer
1361 * is not valid the current context is lost.
1363 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
1364 disp |= CMI_ERRDISP_RIPV_INVALID;
1365 gcl->ismc = ismc;
1367 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1368 uint64_t mcistatus = gbl->gbl_status;
1369 uint32_t ms_scope;
1370 int pcc, uc;
1371 int poisoned;
1373 if (!(mcistatus & MSR_MC_STATUS_VAL))
1374 continue;
1376 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
1377 continue;
1379 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
1380 uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
1381 mcesp->mce_npcc += pcc;
1382 mcesp->mce_nuc += uc;
1384 ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
1385 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
1387 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
1388 pcc = 0;
1389 mcesp->mce_npcc_ok++;
1390 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
1393 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
1394 uc = 0;
1395 mcesp->mce_nuc_ok++;
1396 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
1399 if (uc) {
1400 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
1401 if (poisoned) {
1402 mcesp->mce_nuc_poisoned++;
1403 gbl->gbl_disp |= CMI_ERRDISP_POISONED;
1407 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
1409 * We're not being instructed to ignore the error,
1410 * so apply our standard disposition logic to it.
1412 if (uc && !poisoned) {
1413 unconstrained++;
1414 gbl->gbl_disp |= disp |
1415 CMI_ERRDISP_UC_UNCONSTRAINED;
1418 if (pcc && ismc) {
1419 curctxbad++;
1420 gbl->gbl_disp |= disp |
1421 CMI_ERRDISP_CURCTXBAD;
1425 * Even if the above may not indicate that the error
1426 * is terminal, model-specific support may insist
1427 * that we treat it as such. Such errors wil be
1428 * fatal even if discovered via poll.
1430 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
1431 forcefatal++;
1432 mcesp->mce_forcefatal++;
1433 gbl->gbl_disp |= disp |
1434 CMI_ERRDISP_FORCEFATAL;
1436 } else {
1437 mcesp->mce_ignored++;
1438 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
1442 if (unconstrained > 0)
1443 disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
1445 if (curctxbad > 0)
1446 disp |= CMI_ERRDISP_CURCTXBAD;
1448 if (forcefatal > 0)
1449 disp |= CMI_ERRDISP_FORCEFATAL;
1451 if (gcpu_mca_queue != NULL) {
1452 int how;
1454 if (ismc) {
1455 how = cmi_mce_response(rp, disp) ?
1456 ERRORQ_ASYNC : /* no panic, so arrange drain */
1457 ERRORQ_SYNC; /* panic flow will drain */
1458 } else {
1459 how = (disp & CMI_ERRDISP_FORCEFATAL &&
1460 cmi_panic_on_ue()) ?
1461 ERRORQ_SYNC : /* poller will panic */
1462 ERRORQ_ASYNC; /* no panic */
1465 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
1466 } else if (disp != 0) {
1467 gcpu_bleat(hdl, gcl);
1470 mcesp->mce_disp = disp;
1472 return (disp);
/*
 * Gather error telemetry from our source, and then submit it for
 * processing.
 */

/*
 * True if an MCi_STATUS value has the ENabled bit set together with at
 * least one of UC or PCC.  The argument is evaluated twice, so it must
 * not have side effects.
 */
#define	IS_MCE_CANDIDATE(status) \
	((((status) & MSR_MC_STATUS_EN) != 0) && \
	(((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0))

/*
 * True if two MCi_STATUS values are identical once the OVerflow bit is
 * disregarded.
 */
#define	STATUS_EQV(s1, s2) \
	((((s1) ^ (s2)) & ~MSR_MC_STATUS_OVER) == 0)

/* Number of times clear_mc() started deferring the clear of a bank. */
static uint32_t gcpu_deferrred_polled_clears;
1488 static void
1489 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1490 uint64_t status, int what)
1492 uint64_t ctl2;
1494 if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
1495 (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
1496 !(status & MSR_MC_STATUS_CEC_MASK)))) {
1498 if (!(bank_cmci_p->cmci_enabled)) {
1500 * when cmci is disabled, and the bank has no error or
1501 * no corrected error for
1502 * gcpu_mca_cmci_reenable_threshold consecutive polls,
1503 * turn on this bank's cmci.
1506 bank_cmci_p->drtcmci ++;
1508 if (bank_cmci_p->drtcmci >=
1509 gcpu_mca_cmci_reenable_threshold) {
1511 /* turn on cmci */
1513 (void) cmi_hdl_rdmsr(hdl,
1514 IA32_MSR_MC_CTL2(bank), &ctl2);
1515 ctl2 |= MSR_MC_CTL2_EN;
1516 (void) cmi_hdl_wrmsr(hdl,
1517 IA32_MSR_MC_CTL2(bank), ctl2);
1519 /* reset counter and set flag */
1520 bank_cmci_p->drtcmci = 0;
1521 bank_cmci_p->cmci_enabled = 1;
1523 } else {
1525 * when cmci is enabled,if is in cyclic poll and the
1526 * bank has no error or no corrected error, reset ncmci
1527 * counter
1529 bank_cmci_p->ncmci = 0;
1534 static void
1535 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1536 int what)
1538 uint64_t ctl2 = 0;
1541 * if cmci of this bank occurred beyond
1542 * gcpu_mca_cmci_throttling_threshold between 2 polls,
1543 * turn off this bank's CMCI;
1545 if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
1547 /* if it is cmci trap, increase the count */
1548 bank_cmci_p->ncmci++;
1550 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
1552 /* turn off cmci */
1554 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
1555 &ctl2);
1556 ctl2 &= ~MSR_MC_CTL2_EN;
1557 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
1558 ctl2);
1560 /* clear the flag and count */
1562 bank_cmci_p->cmci_enabled = 0;
1563 bank_cmci_p->ncmci = 0;
1568 static void
1569 clear_mc(int first, int last, int ismc, boolean_t clrstatus,
1570 cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
1572 int i;
1573 gcpu_bank_logout_t *gbl, *pgbl;
1574 uint64_t status;
1576 if (first < 0 || last < 0)
1577 return;
1579 for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
1580 status = gbl->gbl_status;
1581 if (status == 0)
1582 continue;
1583 if (clrstatus == B_FALSE)
1584 goto serialize;
1587 * For i86xpv we always clear status in order to invalidate
1588 * the interposed telemetry.
1590 * For native machine checks we always clear status here. For
1591 * native polls we must be a little more cautious since there
1592 * is an outside chance that we may clear telemetry from a
1593 * shared MCA bank on which a sibling core is machine checking.
1595 * For polled observations of errors that look like they may
1596 * produce a machine check (UC/PCC and ENabled, although these
1597 * do not guarantee a machine check on error occurence)
1598 * we will not clear the status at this wakeup unless
1599 * we saw the same status at the previous poll. We will
1600 * always process and log the current observations - it
1601 * is only the clearing of MCi_STATUS which may be
1602 * deferred until the next wakeup.
1604 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
1605 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
1606 goto serialize;
1610 * We have a polled observation of a machine check
1611 * candidate. If we saw essentially the same status at the
1612 * last poll then clear the status now since this appears
1613 * not to be a #MC candidate after all. If we see quite
1614 * different status now then do not clear, but reconsider at
1615 * the next poll. In no actual machine check clears
1616 * the status in the interim then the status should not
1617 * keep changing forever (meaning we'd never clear it)
1618 * since before long we'll simply have latched the highest-
1619 * priority error and set the OVerflow bit. Nonetheless
1620 * we count how many times we defer clearing and after
1621 * a while insist on clearing the status.
1623 pgbl = &pgcl->gcl_data[i];
1624 if (pgbl->gbl_clrdefcnt != 0) {
1625 /* We deferred clear on this bank at last wakeup */
1626 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
1627 pgbl->gbl_clrdefcnt > 5) {
1629 * Status is unchanged so clear it now and,
1630 * since we have already logged this info,
1631 * avoid logging it again.
1633 gbl->gbl_status = 0;
1634 (void) cmi_hdl_wrmsr(hdl,
1635 IA32_MSR_MC(i, STATUS), 0ULL);
1636 } else {
1637 /* Record deferral for next wakeup */
1638 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
1640 } else {
1641 /* Record initial deferral for next wakeup */
1642 gbl->gbl_clrdefcnt = 1;
1643 gcpu_deferrred_polled_clears++;
1646 serialize:
1649 * Intel Vol 3A says to execute a serializing
1650 * instruction here, ie CPUID. Well WRMSR is also
1651 * defined to be serializing, so the status clear above
1652 * should suffice. To be a good citizen, and since
1653 * some clears are deferred, we'll execute a CPUID
1654 * instruction here.
1656 struct cpuid_regs tmp;
1657 (void) __cpuid_insn(&tmp);
1662 /*ARGSUSED5*/
1663 void
1664 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
1665 gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
1667 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1668 gcpu_mca_t *mca = &gcpu->gcpu_mca;
1669 int nbanks = mca->gcpu_mca_nbanks;
1670 gcpu_bank_logout_t *gbl, *pgbl;
1671 gcpu_logout_t *gcl, *pgcl;
1672 int ismc = (rp != NULL);
1673 int ispoll = !ismc;
1674 int i, nerr = 0;
1675 cmi_errno_t err;
1676 uint64_t mcg_status;
1677 uint64_t disp;
1678 uint64_t cap;
1679 int first = -1;
1680 int last = -1;
1681 int willpanic = 0;
1683 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1684 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
1685 CMI_SUCCESS) {
1686 if (mcesp != NULL)
1687 mcesp->mce_nerr = mcesp->mce_disp = 0;
1688 return;
1691 if (ismc) {
1692 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
1693 } else {
1694 int pidx = mca->gcpu_mca_nextpoll_idx;
1695 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
1696 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
1698 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */
1699 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */
1700 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */
1703 gcl->gcl_timestamp = gethrtime_waitfree();
1704 gcl->gcl_mcg_status = mcg_status;
1705 gcl->gcl_ip = rp ? rp->r_pc : 0;
1707 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
1708 if (cap & MCG_CAP_TES_P)
1709 gcl->gcl_flags |= GCPU_GCL_F_TES_P;
1711 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1712 uint64_t status, status2, addr, misc;
1713 int retries = gcpu_mca_telemetry_retries;
1715 gbl->gbl_status = 0;
1716 gbl->gbl_disp = 0;
1717 gbl->gbl_clrdefcnt = 0;
1720 * Only logout from MCA banks we have initialized from at
1721 * least one core. If a core shares an MCA bank with another
1722 * but perhaps lost the race to initialize it, then it must
1723 * still be allowed to logout from the shared bank.
1725 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
1726 continue;
1729 * On a poll look only at the banks we've been asked to check.
1731 if (rp == NULL && !(bankmask & 1 << i))
1732 continue;
1735 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
1736 CMI_SUCCESS)
1737 continue;
1739 gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
1741 retry:
1742 if (!(status & MSR_MC_STATUS_VAL))
1743 continue;
1745 /* First and last bank that have valid status */
1746 if (first < 0)
1747 first = i;
1748 last = i;
1750 addr = -1;
1751 misc = 0;
1753 if ((status & MSR_MC_STATUS_ADDRV) ||
1754 gcpu_force_addr_in_payload)
1755 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
1757 if (status & MSR_MC_STATUS_MISCV)
1758 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
1760 gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
1763 * Allow the model-specific code to extract bank telemetry.
1765 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
1768 * Not all cpu models assure us that the status/address/misc
1769 * data will not change during the above sequence of MSR reads,
1770 * or that it can only change by the addition of the OVerflow
1771 * bit to the status register. If the status has changed
1772 * other than in the overflow bit then we attempt to reread
1773 * for a consistent snapshot, but eventually give up and
1774 * go with what we've got. We only perform this check
1775 * for a poll - a further #MC during a #MC will reset, and
1776 * polled errors should not overwrite higher-priority
1777 * trapping errors (but could set the overflow bit).
1779 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1780 &status2)) == CMI_SUCCESS) {
1781 if (!STATUS_EQV(status, status2)) {
1782 if (retries-- > 0) {
1783 status = status2;
1784 goto retry;
1785 } else {
1786 gbl->gbl_disp |=
1787 CMI_ERRDISP_INCONSISTENT;
1790 } else if (ispoll && err != CMI_SUCCESS) {
1791 gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
1794 nerr++;
1795 gbl->gbl_status = status;
1796 gbl->gbl_addr = addr;
1797 gbl->gbl_misc = misc;
1800 * For polled observation, if the count of deferred status
1801 * clears updated in the clear_mc() is nonzero and the
1802 * MCi_STATUS has not changed, the last wakeup has produced
1803 * the ereport of the error. Therefore, clear the status in
1804 * this wakeup to avoid duplicate ereport.
1806 pgbl = &pgcl->gcl_data[i];
1807 if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
1808 pgbl->gbl_clrdefcnt != 0) {
1809 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
1810 gbl->gbl_status = 0;
1811 (void) cmi_hdl_wrmsr(hdl,
1812 IA32_MSR_MC(i, STATUS), 0ULL);
1817 if (gcpu_mca_stack_flag)
1818 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
1819 else
1820 gcl->gcl_stackdepth = 0;
1823 * Decide our disposition for this error or errors, and submit for
1824 * logging and subsequent diagnosis.
1826 if (nerr != 0) {
1827 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
1829 willpanic = (ismc && cmi_mce_response(rp, disp) == 0);
1831 if (!willpanic)
1832 clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
1833 } else {
1834 disp = 0;
1835 if (mcesp) {
1836 mcesp->mce_nerr = mcesp->mce_disp = 0;
1841 * Clear MCG_STATUS if MCIP is set (machine check in progress).
1842 * If a second #MC had occured before now the system would have
1843 * reset. We can only do thise once gcpu_mca_process has copied
1844 * the logout structure.
1846 if (ismc && mcg_status & MCG_STATUS_MCIP)
1847 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
1850 * At this point we have read and logged all telemetry that is visible
1851 * under the MCA. On architectures for which the NorthBridge is
1852 * on-chip this may include NB-observed errors, but where the NB
1853 * is off chip it may have been the source of the #MC request and
1854 * so we must call into the memory-controller driver to give it
1855 * a chance to log errors.
1857 if (ismc) {
1858 cmi_mc_logout(hdl, 1, willpanic);
1862 int gcpu_mca_trap_vomit_summary = 0;
1865 * On a native machine check exception we come here from mcetrap via
1866 * cmi_mca_trap. A machine check on one cpu of a chip does not trap others
1867 * cpus of the chip, so it is possible that another cpu on this chip could
1868 * initiate a poll while we're in the #mc handler; it is also possible that
1869 * this trap has occured during a poll on this cpu. So we must acquire
1870 * the chip-wide poll lock, but be careful to avoid deadlock.
1872 * The 'data' pointer cannot be NULL due to init order.
1874 uint64_t
1875 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
1877 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1878 kmutex_t *poll_lock = NULL;
1879 gcpu_mce_status_t mce;
1880 uint64_t mcg_status;
1881 int tooklock = 0;
1883 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1884 CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
1885 return (0);
1888 * Synchronize with any poller from another core that may happen
1889 * to share access to one or more of the MCA banks.
1891 if (gcpu->gcpu_shared != NULL)
1892 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
1894 if (poll_lock != NULL && !mutex_owned(poll_lock)) {
1896 * The lock is not owned by the thread we have
1897 * interrupted. Spin for this adaptive lock.
1899 while (!mutex_tryenter(poll_lock)) {
1900 while (mutex_owner(poll_lock) != NULL)
1903 tooklock = 1;
1906 gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
1908 if (tooklock)
1909 mutex_exit(poll_lock);
1912 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
1914 if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
1915 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
1916 "%u PCC (%u ok), "
1917 "%u UC (%d ok, %u poisoned), "
1918 "%u forcefatal, %u ignored",
1919 mce.mce_nerr, (u_longlong_t)mce.mce_disp,
1920 mce.mce_npcc, mce.mce_npcc_ok,
1921 mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
1922 mce.mce_forcefatal, mce.mce_ignored);
1925 return (mce.mce_disp);
1928 /*ARGSUSED*/
1929 void
1930 gcpu_faulted_enter(cmi_hdl_t hdl)
1932 /* Nothing to do here */
1935 /*ARGSUSED*/
1936 void
1937 gcpu_faulted_exit(cmi_hdl_t hdl)
1939 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1941 gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
1945 * Write the requested values to the indicated MSRs. Having no knowledge
1946 * of the model-specific requirements for writing to these model-specific
1947 * registers, we will only blindly write to those MSRs if the 'force'
1948 * argument is nonzero. That option should only be used in prototyping
1949 * and debugging.
1951 /*ARGSUSED*/
1952 cmi_errno_t
1953 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
1954 int force)
1956 int i, errs = 0;
1958 for (i = 0; i < nregs; i++) {
1959 uint_t msr = regs[i].cmr_msrnum;
1960 uint64_t val = regs[i].cmr_msrval;
1962 if (cms_present(hdl)) {
1963 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
1964 errs++;
1965 } else if (force) {
1966 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
1967 } else {
1968 errs++;
1972 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
1975 /* deconfigure gcpu_mca_init() */
1976 void
1977 gcpu_mca_fini(cmi_hdl_t hdl)
1979 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1980 gcpu_mca_t *mca = &gcpu->gcpu_mca;
1981 int i;
1984 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
1985 * both MCA and MCE support (i.e., X86FSET_MCA). P5, K6, and earlier
1986 * processors, which have their own more primitive way of doing
1987 * machine checks, will not have cmi_mca_init called since their
1988 * CPUID information will not indicate both MCA and MCE features.
1990 if (!is_x86_feature(x86_featureset, X86FSET_MCA))
1991 return;
1993 * disable machine check in CR4
1995 cmi_ntv_hwdisable_mce(hdl);
1996 mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1997 gcpu_mca_poll_fini(hdl);
1998 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
2001 * free resources allocated during init
2003 if (mca->gcpu_bank_cmci != NULL) {
2004 kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) *
2005 mca->gcpu_mca_nbanks);
2008 for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
2009 if (mca->gcpu_mca_logout[i] != NULL) {
2010 kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz);
2014 if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) {
2015 kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg,
2016 sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks);