usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /*
  26  * Copyright (c) 2010, Intel Corporation.
  27  * All rights reserved.
  28  */
  29
  30 #include <sys/mca_x86.h>
  31 #include <sys/cpu_module_impl.h>
  32 #include <sys/cpu_module_ms.h>
  33 #include <sys/cmn_err.h>
  34 #include <sys/cpuvar.h>
  35 #include <sys/pghw.h>
  36 #include <sys/x86_archext.h>
  37 #include <sys/sysmacros.h>
  38 #include <sys/regset.h>
  39 #include <sys/privregs.h>
  40 #include <sys/systm.h>
  41 #include <sys/types.h>
  42 #include <sys/log.h>
  43 #include <sys/psw.h>
  44 #include <sys/fm/protocol.h>
  45 #include <sys/fm/util.h>
  46 #include <sys/errorq.h>
  47 #include <sys/mca_x86.h>
  48 #include <sys/fm/cpu/GMCA.h>
  49 #include <sys/fm/smb/fmsmb.h>
  50 #include <sys/sysevent.h>
  51 #include <sys/ontrap.h>
  52
  53 #include "gcpu.h"
  54
extern int x86gentopo_legacy;   /* x86 generic topology support */

/*
 * When nonzero, MCi_ADDR is reported even if MCi_STATUS.ADDRV is clear:
 * gcpu_bleat() treats ADDRV as set for its console dump and
 * gcpu_ereport_add_logout() adds the address member to the ereport.
 */
static uint_t gcpu_force_addr_in_payload = 0;

/*
 * Clear to log telemetry found at initialization.  While processor docs
 * say you should process this telemetry on all but Intel family 0x6
 * there are way too many exceptions and we want to avoid bogus
 * diagnoses.
 */
int gcpu_suppress_log_on_init = 1;

/*
 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
 * error logout time.  The stack will be included in the ereport if the
 * error type selects stack inclusion, or in all cases if
 * gcpu_mca_stack_ereport_include is nonzero.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;

/*
 * The number of times to re-read MCA telemetry to try to obtain a
 * consistent snapshot if we find it to be changing under our feet.
 */
int gcpu_mca_telemetry_retries = 5;

/*
 * CMCI throttling tunables.  Their consumers are not in this part of the
 * file.  NOTE(review): presumably event-count thresholds for disabling and
 * re-enabling CMCI; confirm units and semantics against the users.
 */
int gcpu_mca_cmci_throttling_threshold = 10;
int gcpu_mca_cmci_reenable_threshold = 1000;
  84
/*
 * Table of known error dispositions, searched linearly by gcpu_disp_match();
 * the first entry whose mask-on bits are all set and whose mask-off bits are
 * all clear in the error code wins, so entry order is significant.
 *
 * Each initializer supplies, in order: the ereport class leaf, the compound
 * error name format (NULL if none), the ereport payload member flags, the
 * mask of error code bits that must be set and the mask of bits that must
 * be clear.  The %N$s positions in a compound format are expanded by
 * gcpu_mn_fmt().
 */
static gcpu_error_disp_t gcpu_errtypes[] = {

        /*
         * Unclassified
         */
        {
                FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
                NULL,
                FM_EREPORT_PAYLOAD_FLAGS_COMMON,
                MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
                MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
        },

        /*
         * Microcode ROM Parity Error
         */
        {
                FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
                NULL,
                FM_EREPORT_PAYLOAD_FLAGS_COMMON,
                MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
                MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
        },

        /*
         * External - BINIT# from another processor during power-on config
         */
        {
                FM_EREPORT_CPU_GENERIC_EXTERNAL,
                NULL,
                FM_EREPORT_PAYLOAD_FLAGS_COMMON,
                MCAX86_SIMPLE_EXTERNAL_MASKON,
                MCAX86_SIMPLE_EXTERNAL_MASKOFF
        },

        /*
         * Functional redundancy check master/slave error
         */
        {
                FM_EREPORT_CPU_GENERIC_FRC,
                NULL,
                FM_EREPORT_PAYLOAD_FLAGS_COMMON,
                MCAX86_SIMPLE_FRC_MASKON,
                MCAX86_SIMPLE_FRC_MASKOFF
        },

        /*
         * Internal parity error
         */
        {
                FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
                NULL,
                FM_EREPORT_PAYLOAD_FLAGS_COMMON,
                MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
                MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
        },


        /*
         * Internal timer error
         */
        {
                FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
                NULL,
                FM_EREPORT_PAYLOAD_FLAGS_COMMON,
                MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
                MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
        },

        /*
         * Internal unclassified
         */
        {
                FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
                NULL,
                FM_EREPORT_PAYLOAD_FLAGS_COMMON,
                MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
                MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
        },

        /*
         * Compound error codes - generic memory hierarchy
         */
        {
                FM_EREPORT_CPU_GENERIC_GENMEMHIER,
                NULL,
                FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
                MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
                MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
        },

        /*
         * Compound error codes - TLB errors
         */
        {
                FM_EREPORT_CPU_GENERIC_TLB,
                "%1$s" "TLB" "%2$s" "_ERR",
                FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
                MCAX86_COMPOUND_TLB_MASKON,
                MCAX86_COMPOUND_TLB_MASKOFF
        },

        /*
         * Compound error codes - memory hierarchy
         */
        {
                FM_EREPORT_CPU_GENERIC_MEMHIER,
                "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
                FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
                MCAX86_COMPOUND_MEMHIER_MASKON,
                MCAX86_COMPOUND_MEMHIER_MASKOFF
        },

        /*
         * Compound error codes - bus and interconnect errors
         */
        {
                FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
                "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
                FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
                MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
                MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
        },
        /*
         * Compound error codes - memory controller errors
         */
        {
                FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
                "MC" "_" "%8$s" "_" "%9$s" "_ERR",
                FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
                MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
                MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
        },
};
 219
/*
 * Disposition describing an unrecognized error code.  It is not part of
 * gcpu_errtypes, so gcpu_disp_match() never returns it.
 * NOTE(review): presumably substituted by callers when gcpu_disp_match()
 * returns NULL; the users are outside this part of the file.
 */
static gcpu_error_disp_t gcpu_unknown = {
        FM_EREPORT_CPU_GENERIC_UNKNOWN,
        "UNKNOWN",
        FM_EREPORT_PAYLOAD_FLAGS_COMMON,
        0,
        0
};

/*
 * Error queue for MCA logout data and its associated lock.
 * NOTE(review): creation and use are outside this part of the file;
 * confirm what gcpu_mca_queue_lock actually serializes.
 */
static errorq_t *gcpu_mca_queue;
static kmutex_t gcpu_mca_queue_lock;

/*
 * NOTE(review): name suggests "running under the xVM hypervisor"; it is
 * only initialized to 0 here, confirm against where it is set.
 */
static int isxpv = 0;
 232
 233 static const gcpu_error_disp_t *
 234 gcpu_disp_match(uint16_t code)
 235 {
 236         const gcpu_error_disp_t *ged = gcpu_errtypes;
 237         int i;
 238
 239         for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
 240             i++, ged++) {
 241                 uint16_t on = ged->ged_errcode_mask_on;
 242                 uint16_t off = ged->ged_errcode_mask_off;
 243
 244                 if ((code & on) == on && (code & off) == 0)
 245                         return (ged);
 246         }
 247
 248         return (NULL);
 249 }
 250
 251 static uint16_t
 252 bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
 253 {
 254         return ((code & mask) >> shift);
 255 }
 256
/*
 * Extract the error code sub-field 'name' (e.g. TT, LL, RRRR) from 'code'
 * using the matching MCAX86_ERRCODE_<name>_MASK and _SHIFT definitions.
 */
#define BIT_STRIP(code, name) \
        bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
        MCAX86_ERRCODE_##name##_SHIFT)

/*
 * Placeholder mnemonics: UNDEF is what gcpu_mnemonic() returns for a value
 * with no table entry; RESVD marks table slots for reserved encodings.
 */
#define GCPU_MNEMONIC_UNDEF     "undefined"
#define GCPU_MNEMONIC_RESVD     "reserved"
 263
 264 /*
 265  * Mappings of TT, LL, RRRR, PP, II and T values to compound error name
 266  * mnemonics and to ereport class name components.
 267  */
 268
/*
 * One mnemonic table entry: the same field value has one spelling for the
 * compound error name and another for the ereport class.  gcpu_mnemonic()
 * selects between the two members according to the requested namespace.
 */
struct gcpu_mnexp {
        const char *mne_compound;       /* used in expanding compound errname */
        const char *mne_ereport;        /* used in expanding ereport class */
};
 273
/*
 * Transaction type mnemonics, indexed by the TT field value extracted
 * with BIT_STRIP().  The last slot has no defined meaning and expands to
 * "undefined" / the empty string.
 */
static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
        { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },               /* INSTR */
        { "D", FM_EREPORT_CPU_GENERIC_TT_DATA },                /* DATA */
        { "G", FM_EREPORT_CPU_GENERIC_TT_GEN },                 /* GEN */
        { GCPU_MNEMONIC_UNDEF, "" }
};
 280
 281 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
 282         { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 },                 /* L0 */
 283         { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 },                 /* L1 */
 284         { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 },                 /* L2 */
 285         { "LG", FM_EREPORT_CPU_GENERIC_LL_LG }                  /* LG */
 286 };
 287
/*
 * Request type mnemonics, indexed by the RRRR field value.  Values beyond
 * the end of the table are reported as "undefined" by gcpu_mnemonic().
 */
static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
        { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },             /* ERR */
        { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD },               /* RD */
        { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },               /* WR */
        { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },             /* DRD */
        { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },             /* DWR */
        { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },             /* IRD */
        { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },   /* PREFETCH */
        { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },         /* EVICT */
        { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },         /* SNOOP */
};

/*
 * Participation mnemonics, indexed by the PP field value.  The generic
 * case has an empty compound spelling.
 */
static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
        { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },               /* SRC */
        { "RES", FM_EREPORT_CPU_GENERIC_PP_RES },               /* RES */
        { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },               /* OBS */
        { "", FM_EREPORT_CPU_GENERIC_PP_GEN }                   /* GEN */
};

/*
 * Memory-or-I/O mnemonics, indexed by the II field value.  Index 1 is a
 * reserved encoding; the generic case has an empty compound spelling.
 */
static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
        { "M", FM_EREPORT_CPU_GENERIC_II_MEM },                 /* MEM */
        { GCPU_MNEMONIC_RESVD, "" },
        { "IO", FM_EREPORT_CPU_GENERIC_II_IO },                 /* IO */
        { "", FM_EREPORT_CPU_GENERIC_II_GEN }                   /* GEN */
};

/*
 * Timeout mnemonics, indexed by the T field value.
 */
static struct gcpu_mnexp gcpu_T_mnemonics[] = {  /* MCAX86_ERRCODE_T_* */
        { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },    /* NONE */
        { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }         /* TIMEOUT */
};

/*
 * Memory channel mnemonics, indexed by the CCCC field value.  Every entry
 * shares the same ereport class component; only the compound spelling
 * distinguishes the channel, with the last entry being the generic
 * (unspecified channel) case.
 */
static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
        { "CH0", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH0 */
        { "CH1", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH1 */
        { "CH2", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH2 */
        { "CH3", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH3 */
        { "CH4", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH4 */
        { "CH5", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH5 */
        { "CH6", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH6 */
        { "CH7", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH7 */
        { "CH8", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH8 */
        { "CH9", FM_EREPORT_CPU_GENERIC_CCCC },         /* CH9 */
        { "CH10", FM_EREPORT_CPU_GENERIC_CCCC },        /* CH10 */
        { "CH11", FM_EREPORT_CPU_GENERIC_CCCC },        /* CH11 */
        { "CH12", FM_EREPORT_CPU_GENERIC_CCCC },        /* CH12 */
        { "CH13", FM_EREPORT_CPU_GENERIC_CCCC },        /* CH13 */
        { "CH14", FM_EREPORT_CPU_GENERIC_CCCC },        /* CH14 */
        { "CH", FM_EREPORT_CPU_GENERIC_CCCC }           /* GEN */
};

/*
 * Memory transaction type mnemonics, indexed by the MMM field value.  The
 * last three encodings are reserved.
 */
static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
        { "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR },      /* GEN ERR */
        { "RD", FM_EREPORT_CPU_GENERIC_MMM_RD },        /* READ  */
        { "WR", FM_EREPORT_CPU_GENERIC_MMM_WR },        /* WRITE  */
        { "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD },      /* ADDR, CMD  */
        { "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
        { GCPU_MNEMONIC_RESVD, ""},                     /* RESERVED  */
        { GCPU_MNEMONIC_RESVD, ""},                     /* RESERVED  */
        { GCPU_MNEMONIC_RESVD, ""}                      /* RESERVED  */
};
 348
/*
 * Selects which spelling of a mnemonic gcpu_mnemonic() returns: the one
 * used in compound error names or the one used in ereport class names.
 */
enum gcpu_mn_namespace {
        GCPU_MN_NAMESPACE_COMPOUND,
        GCPU_MN_NAMESPACE_EREPORT
};
 353
 354 static const char *
 355 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val,
 356     enum gcpu_mn_namespace nspace)
 357 {
 358         if (val >= tbl_sz || val > 0xff)
 359                 return (GCPU_MNEMONIC_UNDEF);   /* for all namespaces */
 360
 361         switch (nspace) {
 362         case GCPU_MN_NAMESPACE_COMPOUND:
 363                 return (tbl[val].mne_compound);
 364                 /*NOTREACHED*/
 365
 366         case GCPU_MN_NAMESPACE_EREPORT:
 367                 return (tbl[val].mne_ereport);
 368                 /*NOTREACHED*/
 369
 370         default:
 371                 return (GCPU_MNEMONIC_UNDEF);
 372                 /*NOTREACHED*/
 373         }
 374 }
 375
 376 /*
 377  * The ereport class leaf component is either a simple string with no
 378  * format specifiers, or a string with one or more embedded %n$s specifiers -
 379  * positional selection for string arguments.  The kernel snprintf does
 380  * not support %n$ (and teaching it to do so is too big a headache) so
 381  * we will expand this restricted format string ourselves.
 382  */
 383
/*
 * Number of substitutable components, i.e. the highest position N that may
 * appear in a %N$s specifier handled by gcpu_mn_fmt().
 */
#define GCPU_CLASS_VARCOMPS     9

/*
 * Look up the mnemonic for sub-field 'name' of error code 'code' in the
 * matching gcpu_<name>_mnemonics table, for the given namespace.
 */
#define GCPU_MNEMONIC(code, name, nspace) \
        gcpu_mnemonic(gcpu_##name##_mnemonics, \
        sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
        BIT_STRIP(code, name), nspace)
 390
 391 static void
 392 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
 393     enum gcpu_mn_namespace nspace)
 394 {
 395         uint16_t code = MCAX86_ERRCODE(status);
 396         const char *mn[GCPU_CLASS_VARCOMPS];
 397         char *p = buf;                  /* current position in buf */
 398         char *q = buf + buflen;         /* pointer past last char in buf */
 399         int which, expfmtchar, error;
 400         char c;
 401
 402         mn[0] = GCPU_MNEMONIC(code, TT, nspace);
 403         mn[1] = GCPU_MNEMONIC(code, LL, nspace);
 404         mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
 405         mn[3] = GCPU_MNEMONIC(code, PP, nspace);
 406         mn[4] = GCPU_MNEMONIC(code, II, nspace);
 407         mn[5] = GCPU_MNEMONIC(code, T, nspace);
 408         mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
 409         mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
 410         mn[8] = GCPU_MNEMONIC(code, MMM, nspace);
 411
 412         while (p < q - 1 && (c = *fmt++) != '\0') {
 413                 if (c != '%') {
 414                         /* not the beginning of a format specifier - copy */
 415                         *p++ = c;
 416                         continue;
 417                 }
 418
 419                 error = 0;
 420                 which = -1;
 421                 expfmtchar = -1;
 422
 423 nextfmt:
 424                 if ((c = *fmt++) == '\0')
 425                         break;  /* early termination of fmt specifier */
 426
 427                 switch (c) {
 428                 case '1':
 429                 case '2':
 430                 case '3':
 431                 case '4':
 432                 case '5':
 433                 case '6':
 434                 case '7':
 435                 case '8':
 436                 case '9':
 437                         if (which != -1) { /* allow only one positional digit */
 438                                 error++;
 439                                 break;
 440                         }
 441                         which = c - '1';
 442                         goto nextfmt;
 443                         /*NOTREACHED*/
 444
 445                 case '$':
 446                         if (which == -1) { /* no position specified */
 447                                 error++;
 448                                 break;
 449                         }
 450                         expfmtchar = 's';
 451                         goto nextfmt;
 452                         /*NOTREACHED*/
 453
 454                 case 's':
 455                         if (expfmtchar != 's') {
 456                                 error++;
 457                                 break;
 458                         }
 459                         (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
 460                             mn[which]);
 461                         p += strlen(p);
 462                         break;
 463
 464                 default:
 465                         error++;
 466                         break;
 467                 }
 468
 469                 if (error)
 470                         break;
 471         }
 472
 473         *p = '\0';      /* NUL termination */
 474 }
 475
 476 static void
 477 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
 478     const char *cpuclass, const char *leafclass)
 479 {
 480         char *p = buf;                  /* current position in buf */
 481         char *q = buf + buflen;         /* pointer past last char in buf */
 482
 483         (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
 484             FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
 485
 486         p += strlen(p);
 487         if (p >= q)
 488                 return;
 489
 490         if (leafclass == NULL) {
 491                 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
 492                     GCPU_MN_NAMESPACE_EREPORT);
 493         } else {
 494                 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
 495                     leafclass);
 496         }
 497 }
 498
 499 /*
 500  * Create an "hc" scheme FMRI identifying the given cpu with
 501  * motherboard/chip/core/strand instance numbers.
 502  */
 503 static nvlist_t *
 504 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
 505 {
 506         nvlist_t *nvl, *fmri;
 507
 508         if ((nvl = fm_nvlist_create(nva)) == NULL)
 509                 return (NULL);
 510
 511         if (!x86gentopo_legacy) {
 512                 fmri = cmi_hdl_smb_bboard(hdl);
 513                 if (fmri == NULL)
 514                         return (NULL);
 515
 516                 fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
 517                     NULL, NULL, fmri, 3,
 518                     "chip", cmi_hdl_smb_chipid(hdl),
 519                     "core", cmi_hdl_coreid(hdl),
 520                     "strand", cmi_hdl_strandid(hdl));
 521         } else {
 522                 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
 523                     "motherboard", 0,
 524                     "chip", cmi_hdl_chipid(hdl),
 525                     "core", cmi_hdl_coreid(hdl),
 526                     "strand", cmi_hdl_strandid(hdl));
 527         }
 528
 529         return (nvl);
 530 }
 531
/*
 * Console throttling tunables for gcpu_bleat(): the number of complaints
 * allowed back-to-back, and the minimum interval (compared against
 * gethrtime_waitfree() deltas) required before complaining again once
 * that count has been exceeded.
 */
int gcpu_bleat_count_thresh = 5;
hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
 534
 535 /*
 536  * Called when we are unable to propogate a logout structure onto an
 537  * errorq for subsequent ereport preparation and logging etc.  The caller
 538  * should usually only decide to call this for severe errors - those we
 539  * suspect we may need to panic for.
 540  */
 541 static void
 542 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
 543 {
 544         hrtime_t now  = gethrtime_waitfree();
 545         static hrtime_t gcpu_last_bleat;
 546         gcpu_bank_logout_t *gbl;
 547         static int bleatcount;
 548         int i;
 549
 550         /*
 551          * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
 552          * can come as fast as we like, but once we've spammed that many
 553          * to the console we require a minimum interval to pass before
 554          * any more complaints.
 555          */
 556         if (++bleatcount > gcpu_bleat_count_thresh) {
 557                 if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
 558                         return;
 559                 else
 560                         bleatcount = 0;
 561         }
 562         gcpu_last_bleat = now;
 563
 564         cmn_err(CE_WARN,
 565             "Machine-Check Errors unlogged on chip %d core %d strand %d, "
 566             "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
 567             cmi_hdl_strandid(hdl));
 568         cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
 569             (u_longlong_t)gcl->gcl_mcg_status);
 570         for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
 571                 uint64_t status = gbl->gbl_status;
 572
 573                 if (!(status & MSR_MC_STATUS_VAL))
 574                         continue;
 575
 576                 /* Force ADDRV for AMD Family 0xf and above */
 577                 if (gcpu_force_addr_in_payload)
 578                         status = status | MSR_MC_STATUS_ADDRV;
 579
 580                 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
 581                 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
 582                         cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
 583                             "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
 584                             i, IA32_MSR_MC(i, STATUS),
 585                             (u_longlong_t)gbl->gbl_status,
 586                             (u_longlong_t)gbl->gbl_addr,
 587                             (u_longlong_t)gbl->gbl_misc);
 588                         break;
 589
 590                 case MSR_MC_STATUS_ADDRV:
 591                         cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
 592                             "STAT 0x%016llx ADDR 0x%016llx",
 593                             i, IA32_MSR_MC(i, STATUS),
 594                             (u_longlong_t)gbl->gbl_status,
 595                             (u_longlong_t)gbl->gbl_addr);
 596                         break;
 597
 598                 case MSR_MC_STATUS_MISCV:
 599                         cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
 600                             "STAT 0x%016llx MISC 0x%016llx",
 601                             i, IA32_MSR_MC(i, STATUS),
 602                             (u_longlong_t)gbl->gbl_status,
 603                             (u_longlong_t)gbl->gbl_misc);
 604                         break;
 605
 606                 default:
 607                         cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
 608                             "STAT 0x%016llx",
 609                             i, IA32_MSR_MC(i, STATUS),
 610                             (u_longlong_t)gbl->gbl_status);
 611                         break;
 612
 613                 }
 614         }
 615 }
 616
/*
 * Expands to the three fm_payload_set() arguments (member name, data type,
 * value) for a boolean payload member that reports whether the
 * MSR_MC_STATUS_<what> bit is set in 'status'.
 */
#define _GCPU_BSTATUS(status, what) \
        FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
        (status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
 620
/*
 * Add payload members describing one MCA bank's logout data to 'ereport'.
 *
 *      ereport         the ereport nvlist to add members to
 *      gcl             logout area holding MCG_STATUS, flags and bank data
 *      bankno          index into gcl->gcl_data of the bank being reported
 *      ged             disposition selecting which members to include;
 *                      if NULL, FM_EREPORT_PAYLOAD_FLAGS_COMMON is used
 *      code            error code used to expand the compound error name
 *
 * Each member is added only if its flag is selected and, where applicable,
 * the hardware marked the corresponding value as valid.
 */
static void
gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
    uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
{
        uint64_t members = ged ? ged->ged_ereport_members :
            FM_EREPORT_PAYLOAD_FLAGS_COMMON;
        uint64_t mcg = gcl->gcl_mcg_status;
        int mcip = mcg & MCG_STATUS_MCIP;
        const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
        uint64_t bstat = gbl->gbl_status;

        /*
         * Include the compound error name if requested and if this
         * is a compound error type.
         *
         * NOTE(review): the 16-bit error code, not the full bank status,
         * is passed as the status argument, so gcpu_mn_fmt() sees no
         * status bits above bit 15.
         */
        if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
            ged->ged_compound_fmt != NULL) {
                char buf[FM_MAX_CLASS];

                gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
                    GCPU_MN_NAMESPACE_COMPOUND);
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
                    DATA_TYPE_STRING, buf, NULL);
        }

        /*
         * Include disposition information for this error, as a
         * comma-separated list of the names of the disposition bits set.
         * The list is truncated if it does not fit in buf.
         */
        if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
            gbl->gbl_disp != 0) {
                int i, empty = 1;
                char buf[128];
                char *p = buf, *q = buf + 128;
                static struct _gcpu_disp_name {
                        uint64_t dv;
                        const char *dn;
                } disp_names[] = {
                        { CMI_ERRDISP_CURCTXBAD,
                            "processor_context_corrupt" },
                        { CMI_ERRDISP_RIPV_INVALID,
                            "return_ip_invalid" },
                        { CMI_ERRDISP_UC_UNCONSTRAINED,
                            "unconstrained" },
                        { CMI_ERRDISP_FORCEFATAL,
                            "forcefatal" },
                        { CMI_ERRDISP_IGNORED,
                            "ignored" },
                        { CMI_ERRDISP_PCC_CLEARED,
                            "corrupt_context_cleared" },
                        { CMI_ERRDISP_UC_CLEARED,
                            "uncorrected_data_cleared" },
                        { CMI_ERRDISP_POISONED,
                            "poisoned" },
                        { CMI_ERRDISP_INCONSISTENT,
                            "telemetry_unstable" },
                };

                for (i = 0; i < sizeof (disp_names) /
                    sizeof (struct _gcpu_disp_name); i++) {
                        if ((gbl->gbl_disp & disp_names[i].dv) == 0)
                                continue;

                        (void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
                            "%s%s", empty ? "" : ",", disp_names[i].dn);
                        p += strlen(p);
                        empty = 0;
                }

                /* Skip the member if no known disposition bit was set */
                if (p != buf)
                        fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
                            DATA_TYPE_STRING, buf, NULL);
        }

        /*
         * If MCG_STATUS is included add that and an indication of whether
         * this ereport was the result of a machine check or poll.
         */
        if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
                    DATA_TYPE_UINT64, mcg, NULL);

                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
                    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
        }

        /*
         * If an instruction pointer is to be included add one provided
         * MCG_STATUS indicated it is valid; meaningless for polled events.
         */
        if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
            mcg & MCG_STATUS_EIPV) {
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
                    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
        }

        /*
         * Add an indication of whether the trap occurred during privileged
         * code.
         */
        if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
                    DATA_TYPE_BOOLEAN_VALUE,
                    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
        }

        /*
         * If requested, add the index of the MCA bank.  This indicates the
         * n'th bank of 4 MCA registers, and does not necessarily correspond
         * to MCi_* - use the bank offset to correlate
         */
        if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
                fm_payload_set(ereport,
                    /* Bank number */
                    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
                    /* Offset of MCi_CTL */
                    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
                    IA32_MSR_MC(bankno, CTL),
                    NULL);
        }

        /*
         * Add MCi_STATUS if requested, and decode it.
         */
        if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
                /* Threshold-based error status strings, indexed by value */
                const char *tbes[] = {
                        "No tracking",                  /* 00 */
                        "Green - below threshold",      /* 01 */
                        "Yellow - above threshold",     /* 10 */
                        "Reserved"                      /* 11 */
                };

                fm_payload_set(ereport,
                    /* Bank MCi_STATUS */
                    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
                    /* Overflow? */
                    _GCPU_BSTATUS(bstat, OVER),
                    /* Uncorrected? */
                    _GCPU_BSTATUS(bstat, UC),
                    /* Enabled? */
                    _GCPU_BSTATUS(bstat, EN),
                    /* Processor context corrupt? */
                    _GCPU_BSTATUS(bstat, PCC),
                    /* Error code */
                    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
                    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
                    /* Model-specific error code */
                    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
                    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
                    NULL);

                /*
                 * If MCG_CAP.TES_P indicates that thresholding info
                 * is present in the architectural component of the bank
                 * status then include threshold information for this bank.
                 */
                if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
                        fm_payload_set(ereport,
                            FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
                            DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
                            NULL);
                }
        }

        /*
         * Add MCi_ADDR info if requested and valid. We force addition of
         * MCi_ADDR, even if it's not valid on AMD family 0xf and above,
         * to aid in analysis of ereports, for WatchDog errors.
         */
        if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
            ((bstat & MSR_MC_STATUS_ADDRV) ||
            gcpu_force_addr_in_payload)) {
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
                    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
        }

        /*
         * Add MCi_MISC if requested and valid (MCi_STATUS.MISCV is set).
         */
        if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
            bstat & MSR_MC_STATUS_MISCV) {
                fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
                    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
        }

}
 805
 806 /*
 807  * Construct and post an ereport based on the logout information from a
 808  * single MCA bank.  We are not necessarily running on the cpu that
 809  * detected the error.
 810  */
 811 static void
 812 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
 813     const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
 814 {
 815         gcpu_data_t *gcpu = gcl->gcl_gcpu;
 816         cmi_hdl_t hdl = gcpu->gcpu_hdl;
 817         const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
 818         const char *cpuclass = NULL, *leafclass = NULL;
 819         uint16_t code = MCAX86_ERRCODE(status);
 820         errorq_elem_t *eqep, *scr_eqep;
 821         nvlist_t *ereport, *detector;
 822         char buf[FM_MAX_CLASS];
 823         const char *classfmt;
 824         nv_alloc_t *nva;
 825
 826         if (panicstr) {
 827                 if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
 828                         return;
 829                 ereport = errorq_elem_nvl(ereport_errorq, eqep);
 830
 831                 /*
 832                  * Allocate another element for scratch space, but fallback
 833                  * to the one we have if that fails.  We'd like to use the
 834                  * additional scratch space for nvlist construction.
 835                  */
 836                 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
 837                         nva = errorq_elem_nva(ereport_errorq, scr_eqep);
 838                 else
 839                         nva = errorq_elem_nva(ereport_errorq, eqep);
 840         } else {
 841                 ereport = fm_nvlist_create(NULL);
 842                 nva = NULL;
 843         }
 844
 845         if (ereport == NULL)
 846                 return;
 847
 848         /*
 849          * Common payload data required by the protocol:
 850          *      - ereport class
 851          *      - detector
 852          *      - ENA
 853          */
 854
 855         /*
 856          * Ereport class - call into model-specific support to allow it to
 857          * provide a cpu class or leaf class, otherwise calculate our own.
 858          */
 859         cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
 860         classfmt = ged ?  ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
 861         gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
 862             leafclass);
 863
 864         /*
 865          * The detector FMRI.
 866          */
 867         if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
 868             nva)) == NULL)
 869                 detector = gcpu_fmri_create(hdl, nva);
 870
 871         /*
 872          * Should we define a new ENA format 3?? for chip/core/strand?
 873          * It will be better when virtualized.
 874          */
 875         fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
 876             fm_ena_generate_cpu(gcl->gcl_timestamp,
 877             cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
 878             cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
 879
 880         if (panicstr) {
 881                 fm_nvlist_destroy(detector, FM_NVA_RETAIN);
 882                 nv_alloc_reset(nva);
 883         } else {
 884                 fm_nvlist_destroy(detector, FM_NVA_FREE);
 885         }
 886
 887         /*
 888          * Add the architectural ereport class-specific payload data.
 889          */
 890         gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
 891
 892         /*
 893          * Allow model-specific code to add ereport members.
 894          */
 895         cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
 896             gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
 897
 898         /*
 899          * Include stack if options is turned on and either selected in
 900          * the payload member bitmask or inclusion is forced.
 901          */
 902         if (gcpu_mca_stack_flag &&
 903             (cms_ereport_includestack(hdl, mscookie) ==
 904             B_TRUE || gcpu_mca_stack_ereport_include)) {
 905                 fm_payload_stack_add(ereport, gcl->gcl_stack,
 906                     gcl->gcl_stackdepth);
 907         }
 908
 909         /*
 910          * If injection has taken place anytime in the past then note this
 911          * on the ereport.
 912          */
 913         if (cmi_inj_tainted() == B_TRUE) {
 914                 fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
 915                     B_TRUE, NULL);
 916         }
 917
 918         /*
 919          * Post ereport.
 920          */
 921         if (panicstr) {
 922                 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
 923                 if (scr_eqep)
 924                         errorq_cancel(ereport_errorq, scr_eqep);
 925         } else {
 926                 (void) fm_ereport_post(ereport, EVCH_TRYHARD);
 927                 fm_nvlist_destroy(ereport, FM_NVA_FREE);
 928         }
 929
 930 }
 931
 932 /*ARGSUSED*/
 933 void
 934 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
 935 {
 936         const gcpu_logout_t *gcl = data;
 937         const gcpu_bank_logout_t *gbl;
 938         int ismc;
 939         int i;
 940
 941         ismc = gcl->ismc;
 942         for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
 943                 const gcpu_error_disp_t *gened;
 944                 cms_cookie_t mscookie;
 945
 946                 if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
 947                     !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
 948                         uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
 949
 950                         /*
 951                          * Perform a match based on IA32 MCA architectural
 952                          * components alone.
 953                          */
 954                         gened = gcpu_disp_match(code); /* may be NULL */
 955
 956                         /*
 957                          * Now see if an model-specific match can be made.
 958                          */
 959                         mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc,
 960                             i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
 961                             gcl->gcl_ms_logout);
 962
 963                         /*
 964                          * Prepare and dispatch an ereport for logging and
 965                          * diagnosis.
 966                          */
 967                         gcpu_ereport_post(gcl, i, gened, mscookie,
 968                             gbl->gbl_status);
 969                 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
 970                     (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
 971                         /*
 972                          * Telemetry kept changing as we tried to read
 973                          * it.  Force an unknown ereport leafclass but
 974                          * keep the telemetry unchanged for logging.
 975                          */
 976                         gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
 977                             gbl->gbl_status);
 978                 }
 979         }
 980 }
 981
 982 static size_t gcpu_mca_queue_datasz = 0;
 983
 984 /*
 985  * The following code is ready to make a weak attempt at growing the
 986  * errorq structure size.  Since it is not foolproof (we don't know
 987  * who may already be producing to the outgoing errorq) our caller
 988  * instead assures that we'll always be called with no greater data
 989  * size than on our first call.
 990  */
 991 static void
 992 gcpu_errorq_init(size_t datasz)
 993 {
 994         int slots;
 995
 996         mutex_enter(&gcpu_mca_queue_lock);
 997
 998         if (gcpu_mca_queue_datasz >= datasz) {
 999                 mutex_exit(&gcpu_mca_queue_lock);
1000                 return;
1001         }
1002
1003         membar_producer();
1004         if (gcpu_mca_queue) {
1005                 gcpu_mca_queue_datasz = 0;
1006                 errorq_destroy(gcpu_mca_queue);
1007         }
1008
1009         slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
1010         slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
1011
1012         gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
1013             NULL, slots, datasz, 1, ERRORQ_VITAL);
1014
1015         if (gcpu_mca_queue != NULL)
1016                 gcpu_mca_queue_datasz = datasz;
1017
1018         mutex_exit(&gcpu_mca_queue_lock);
1019 }
1020
1021 /*
1022  * Perform MCA initialization as described in section 14.6 of Intel 64
1023  * and IA-32 Architectures Software Developer's Manual Volume 3A.
1024  */
1025
1026 static uint_t global_nbanks;
1027
1028 void
1029 gcpu_mca_init(cmi_hdl_t hdl)
1030 {
1031         gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1032         uint64_t cap;
1033         uint_t vendor = cmi_hdl_vendor(hdl);
1034         uint_t family = cmi_hdl_family(hdl);
1035         uint_t rev = cmi_hdl_chiprev(hdl);
1036         gcpu_mca_t *mca = &gcpu->gcpu_mca;
1037         int mcg_ctl_present;
1038         uint_t nbanks;
1039         uint32_t ctl_skip_mask = 0;
1040         uint32_t status_skip_mask = 0;
1041         size_t mslsz;
1042         int i;
1043         int mcg_ctl2_present;
1044         uint32_t cmci_capable = 0;
1045         if (gcpu == NULL)
1046                 return;
1047
1048         /* We add MCi_ADDR always for AMD Family 0xf and above */
1049         if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B))
1050                 gcpu_force_addr_in_payload = 1;
1051
1052         /*
1053          * Protect from some silly /etc/system settings.
1054          */
1055         if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
1056                 gcpu_mca_telemetry_retries = 5;
1057
1058         if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
1059                 return;
1060
1061         /*
1062          * CPU startup code only calls cmi_mca_init if x86_featureset indicates
1063          * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
1064          * processors, which have their own more primitive way of doing
1065          * machine checks, will not have cmi_mca_init called since their
1066          * CPUID information will not indicate both MCA and MCE features.
1067          */
1068         ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA));
1069
1070         /*
1071          * Determine whether the IA32_MCG_CTL register is present.  If it
1072          * is we will enable all features by writing -1 to it towards
1073          * the end of this initialization;  if it is absent then volume 3A
1074          * says we must nonetheless continue to initialize the individual
1075          * banks.
1076          */
1077         mcg_ctl_present = cap & MCG_CAP_CTL_P;
1078         mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
1079
1080         /*
1081          * We squirell values away for inspection/debugging.
1082          */
1083         mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
1084         if (mcg_ctl_present)
1085                 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
1086                     &mca->gcpu_mca_bioscfg.bios_mcg_ctl);
1087
1088         /*
1089          * Determine the number of error-reporting banks implemented.
1090          */
1091         mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
1092
1093         if (nbanks != 0 && global_nbanks == 0)
1094                 global_nbanks = nbanks; /* no race - BSP will get here first */
1095
1096         /*
1097          * If someone is hiding the number of banks (perhaps we are fully
1098          * virtualized?) or if this processor has more banks than the
1099          * first to set global_nbanks then bail.  The latter requirement
1100          * is because we need to size our errorq data structure and we
1101          * don't want to have to grow the errorq (destroy and recreate)
1102          * which may just lose some telemetry.
1103          */
1104         if (nbanks == 0 || nbanks > global_nbanks)
1105                 return;
1106
1107         mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
1108             sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
1109
1110         /*
1111          * Calculate the size we need to allocate for a gcpu_logout_t
1112          * with a gcl_data array big enough for all banks of this cpu.
1113          * Add any space requested by the model-specific logout support.
1114          */
1115         mslsz = cms_logout_size(hdl);
1116         mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
1117             (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
1118
1119         for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
1120                 gcpu_logout_t *gcl;
1121
1122                 mca->gcpu_mca_logout[i] = gcl =
1123                     kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
1124                 gcl->gcl_gcpu = gcpu;
1125                 gcl->gcl_nbanks = nbanks;
1126                 gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
1127                     (char *)(&gcl->gcl_data[0]) + nbanks *
1128                     sizeof (gcpu_bank_logout_t);
1129
1130         }
1131
1132
1133         mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
1134
1135         mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
1136             KM_SLEEP);
1137
1138         /*
1139          * Create our errorq to transport the logout structures.  This
1140          * can fail so users of gcpu_mca_queue must be prepared for NULL.
1141          */
1142         gcpu_errorq_init(mca->gcpu_mca_lgsz);
1143
1144         /*
1145          * Not knowing which, if any, banks are shared between cores we
1146          * assure serialization of MCA bank initialization by each cpu
1147          * on the chip.  On chip architectures in which some banks are
1148          * shared this will mean the shared resource is initialized more
1149          * than once - we're simply aiming to avoid simultaneous MSR writes
1150          * to the shared resource.
1151          *
1152          * Even with these precautions, some platforms may yield a GP fault
1153          * if a core other than a designated master tries to write anything
1154          * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
1155          * those writes under on_trap protection.
1156          */
1157         mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1158
1159         /*
1160          * Initialize poller data, but don't start polling yet.
1161          */
1162         gcpu_mca_poll_init(hdl);
1163
1164         /*
1165          * Work out which MCA banks we will initialize.  In MCA logout
1166          * code we will only read those banks which we initialize here.
1167          */
1168         for (i = 0; i < nbanks; i++) {
1169                 boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
1170                 boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);
1171
1172                 if (!cms_present(hdl)) {
1173                         /*
1174                          * Model-specific support is not present, try to use
1175                          * sane defaults.
1176                          *
1177                          * On AMD family 6 processors, reports about spurious
1178                          * machine checks indicate that bank 0 should be
1179                          * skipped.
1180                          *
1181                          * On Intel family 6 processors, the documentation tells
1182                          * us not to write to MC0_CTL.
1183                          *
1184                          */
1185                         if (i == 0 && family == 6) {
1186                                 switch (vendor) {
1187                                 case X86_VENDOR_AMD:
1188                                         skipstatus = B_TRUE;
1189                                         /*FALLTHRU*/
1190                                 case X86_VENDOR_Intel:
1191                                         skipctl = B_TRUE;
1192                                         break;
1193                                 }
1194                         }
1195                 }
1196
1197                 ctl_skip_mask |= skipctl << i;
1198                 status_skip_mask |= skipstatus << i;
1199
1200                 if (skipctl && skipstatus)
1201                         continue;
1202
1203                 /*
1204                  * Record which MCA banks were enabled, from the point of view
1205                  * of the whole chip (if some cores share a bank we must be
1206                  * sure either can logout from it).
1207                  */
1208                 atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
1209
1210                 /*
1211                  * check CMCI capability
1212                  */
1213                 if (mcg_ctl2_present) {
1214                         uint64_t ctl2;
1215                         uint32_t cap = 0;
1216                         (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1217                         if (ctl2 & MSR_MC_CTL2_EN)
1218                                 continue;
1219                         ctl2 |= MSR_MC_CTL2_EN;
1220                         (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1221                         (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1222                         mca->gcpu_bank_cmci[i].cmci_cap = cap =
1223                             (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
1224                         if (cap)
1225                                 cmci_capable ++;
1226                         /*
1227                          * Set threshold to 1 while unset the en field, to avoid
1228                          * CMCI trigged before APIC LVT entry init.
1229                          */
1230                         ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1;
1231                         (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1232
1233                         /*
1234                          * init cmci related count
1235                          */
1236                         mca->gcpu_bank_cmci[i].cmci_enabled = 0;
1237                         mca->gcpu_bank_cmci[i].drtcmci = 0;
1238                         mca->gcpu_bank_cmci[i].ncmci = 0;
1239                 }
1240         }
1241
1242         if (cmci_capable)
1243                 cmi_enable_cmci = 1;
1244
1245         /*
1246          * Log any valid telemetry lurking in the MCA banks, but do not
1247          * clear the status registers.  Ignore the disposition returned -
1248          * we have already paniced or reset for any nasty errors found here.
1249          *
1250          * Intel vol 3A says that we should not do this on family 0x6,
1251          * and that for any extended family the BIOS clears things
1252          * on power-on reset so you'll only potentially find valid telemetry
1253          * on warm reset (we do it for both - on power-on reset we should
1254          * just see zeroes).
1255          *
1256          * AMD docs since K7 say we should process anything we find here.
1257          */
1258         if (!gcpu_suppress_log_on_init &&
1259             (vendor == X86_VENDOR_Intel && family >= 0xf ||
1260             vendor == X86_VENDOR_AMD))
1261                 gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
1262                     GCPU_MPT_WHAT_POKE_ERR);
1263
1264         /*
1265          * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
1266          * model-specific module the power of veto.
1267          */
1268         for (i = 0; i < nbanks; i++) {
1269                 struct gcpu_bios_bankcfg *bcfgp =
1270                     mca->gcpu_mca_bioscfg.bios_bankcfg + i;
1271
1272                 /*
1273                  * Stash inherited bank MCA state, even for banks we will
1274                  * not initialize ourselves.  Do not read the MISC register
1275                  * unconditionally - on some processors that will #GP on
1276                  * banks that do not implement the MISC register (would be
1277                  * caught by on_trap, anyway).
1278                  */
1279                 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
1280                     &bcfgp->bios_bank_ctl);
1281
1282                 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1283                     &bcfgp->bios_bank_status);
1284
1285                 if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) ||
1286                     gcpu_force_addr_in_payload) {
1287                         (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
1288                             &bcfgp->bios_bank_addr);
1289                 }
1290
1291                 /*
1292                  * In some old BIOS the status value after boot can indicate
1293                  * MISCV when there is actually no MISC register for
1294                  * that bank.  The following read could therefore
1295                  * aggravate a general protection fault.  This should be
1296                  * caught by on_trap, but the #GP fault handler is busted
1297                  * and can suffer a double fault even before we get to
1298                  * trap() to check for on_trap protection.  Until that
1299                  * issue is fixed we remove the one access that we know
1300                  * can cause a #GP.
1301                  *
1302                  * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
1303                  *      (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
1304                  *          &bcfgp->bios_bank_misc);
1305                  */
1306                 bcfgp->bios_bank_misc = 0;
1307
1308                 if (!(ctl_skip_mask & (1 << i))) {
1309                         (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
1310                             cms_bankctl_val(hdl, i, -1ULL));
1311                 }
1312
1313                 if (!(status_skip_mask & (1 << i))) {
1314                         (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
1315                             cms_bankstatus_val(hdl, i, 0ULL));
1316                 }
1317         }
1318         /*
1319          * Now let the model-specific support perform further initialization
1320          * of non-architectural features.
1321          */
1322         cms_mca_init(hdl, nbanks);
1323
1324         (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
1325         membar_producer();
1326
1327         /* enable all machine-check features */
1328         if (mcg_ctl_present)
1329                 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
1330                     cms_mcgctl_val(hdl, nbanks, -1ULL));
1331
1332         mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
1333
1334         /* enable machine-check exception in CR4 */
1335         cmi_hdl_enable_mce(hdl);
1336 }
1337
1338 static uint64_t
1339 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
1340     gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
1341 {
1342         int curctxbad = 0, unconstrained = 0, forcefatal = 0;
1343         gcpu_mca_t *mca = &gcpu->gcpu_mca;
1344         int nbanks = mca->gcpu_mca_nbanks;
1345         gcpu_mce_status_t mce;
1346         gcpu_bank_logout_t *gbl;
1347         uint64_t disp = 0;
1348         int i;
1349
1350         if (mcesp == NULL)
1351                 mcesp = &mce;
1352
1353         mcesp->mce_nerr = nerr;
1354
1355         mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
1356             mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
1357             mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
1358
1359         /*
1360          * If this a machine check then if the return instruction pointer
1361          * is not valid the current context is lost.
1362          */
1363         if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
1364                 disp |= CMI_ERRDISP_RIPV_INVALID;
1365         gcl->ismc = ismc;
1366
1367         for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1368                 uint64_t mcistatus = gbl->gbl_status;
1369                 uint32_t ms_scope;
1370                 int pcc, uc;
1371                 int poisoned;
1372
1373                 if (!(mcistatus & MSR_MC_STATUS_VAL))
1374                         continue;
1375
1376                 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
1377                         continue;
1378
1379                 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
1380                 uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
1381                 mcesp->mce_npcc += pcc;
1382                 mcesp->mce_nuc += uc;
1383
1384                 ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
1385                     gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
1386
1387                 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
1388                         pcc = 0;
1389                         mcesp->mce_npcc_ok++;
1390                         gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
1391                 }
1392
1393                 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
1394                         uc = 0;
1395                         mcesp->mce_nuc_ok++;
1396                         gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
1397                 }
1398
1399                 if (uc) {
1400                         poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
1401                         if (poisoned) {
1402                                 mcesp->mce_nuc_poisoned++;
1403                                 gbl->gbl_disp |= CMI_ERRDISP_POISONED;
1404                         }
1405                 }
1406
1407                 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
1408                         /*
1409                          * We're not being instructed to ignore the error,
1410                          * so apply our standard disposition logic to it.
1411                          */
1412                         if (uc && !poisoned) {
1413                                 unconstrained++;
1414                                 gbl->gbl_disp |= disp |
1415                                     CMI_ERRDISP_UC_UNCONSTRAINED;
1416                         }
1417
1418                         if (pcc && ismc) {
1419                                 curctxbad++;
1420                                 gbl->gbl_disp |= disp |
1421                                     CMI_ERRDISP_CURCTXBAD;
1422                         }
1423
1424                         /*
1425                          * Even if the above may not indicate that the error
1426                          * is terminal, model-specific support may insist
1427                          * that we treat it as such.  Such errors wil be
1428                          * fatal even if discovered via poll.
1429                          */
1430                         if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
1431                                 forcefatal++;
1432                                 mcesp->mce_forcefatal++;
1433                                 gbl->gbl_disp |= disp |
1434                                     CMI_ERRDISP_FORCEFATAL;
1435                         }
1436                 } else {
1437                         mcesp->mce_ignored++;
1438                         gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
1439                 }
1440         }
1441
1442         if (unconstrained > 0)
1443                 disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
1444
1445         if (curctxbad > 0)
1446                 disp |= CMI_ERRDISP_CURCTXBAD;
1447
1448         if (forcefatal > 0)
1449                 disp |= CMI_ERRDISP_FORCEFATAL;
1450
1451         if (gcpu_mca_queue != NULL) {
1452                 int how;
1453
1454                 if (ismc) {
1455                         how = cmi_mce_response(rp, disp) ?
1456                             ERRORQ_ASYNC :      /* no panic, so arrange drain */
1457                             ERRORQ_SYNC;        /* panic flow will drain */
1458                 } else {
1459                         how = (disp & CMI_ERRDISP_FORCEFATAL &&
1460                             cmi_panic_on_ue()) ?
1461                             ERRORQ_SYNC :       /* poller will panic */
1462                             ERRORQ_ASYNC;       /* no panic */
1463                 }
1464
1465                 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
1466         } else if (disp != 0) {
1467                 gcpu_bleat(hdl, gcl);
1468         }
1469
1470         mcesp->mce_disp = disp;
1471
1472         return (disp);
1473 }
1474
1475 /*
1476  * Gather error telemetry from our source, and then submit it for
1477  * processing.
1478  */
1479
/*
 * True (1) if an MCi_STATUS value has EN set together with at least one
 * of UC or PCC; 0 otherwise.
 */
#define IS_MCE_CANDIDATE(status) \
        (((status) & MSR_MC_STATUS_EN) && \
        ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)))

/*
 * True (1) if two MCi_STATUS values are identical once the OVER bit is
 * disregarded, i.e. they differ in no bit other than OVER.
 */
#define STATUS_EQV(s1, s2) \
        ((((s1) ^ (s2)) & ~MSR_MC_STATUS_OVER) == 0)

static uint32_t gcpu_deferrred_polled_clears;
1487
1488 static void
1489 gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1490     uint64_t status, int what)
1491 {
1492         uint64_t ctl2;
1493
1494         if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
1495             (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
1496             !(status & MSR_MC_STATUS_CEC_MASK)))) {
1497
1498                 if (!(bank_cmci_p->cmci_enabled)) {
1499                         /*
1500                          * when cmci is disabled, and the bank has no error or
1501                          * no corrected error for
1502                          * gcpu_mca_cmci_reenable_threshold consecutive polls,
1503                          * turn on this bank's cmci.
1504                          */
1505
1506                         bank_cmci_p->drtcmci ++;
1507
1508                         if (bank_cmci_p->drtcmci >=
1509                             gcpu_mca_cmci_reenable_threshold) {
1510
1511                                 /* turn on cmci */
1512
1513                                 (void) cmi_hdl_rdmsr(hdl,
1514                                     IA32_MSR_MC_CTL2(bank), &ctl2);
1515                                 ctl2 |= MSR_MC_CTL2_EN;
1516                                 (void) cmi_hdl_wrmsr(hdl,
1517                                     IA32_MSR_MC_CTL2(bank), ctl2);
1518
1519                                 /* reset counter and set flag */
1520                                 bank_cmci_p->drtcmci = 0;
1521                                 bank_cmci_p->cmci_enabled = 1;
1522                         }
1523                 } else {
1524                         /*
1525                          * when cmci is enabled,if is in cyclic poll and the
1526                          * bank has no error or no corrected error, reset ncmci
1527                          * counter
1528                          */
1529                         bank_cmci_p->ncmci = 0;
1530                 }
1531         }
1532 }
1533
1534 static void
1535 gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1536     int what)
1537 {
1538         uint64_t ctl2 = 0;
1539
1540         /*
1541          * if cmci of this bank occurred beyond
1542          * gcpu_mca_cmci_throttling_threshold between 2 polls,
1543          * turn off this bank's CMCI;
1544          */
1545         if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
1546
1547                 /* if it is cmci trap, increase the count */
1548                 bank_cmci_p->ncmci++;
1549
1550                 if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
1551
1552                         /* turn off cmci */
1553
1554                         (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
1555                             &ctl2);
1556                         ctl2 &= ~MSR_MC_CTL2_EN;
1557                         (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
1558                             ctl2);
1559
1560                         /* clear the flag and count */
1561
1562                         bank_cmci_p->cmci_enabled = 0;
1563                         bank_cmci_p->ncmci = 0;
1564                 }
1565         }
1566 }
1567
1568 static void
1569 clear_mc(int first, int last, int ismc, boolean_t clrstatus,
1570     cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
1571 {
1572         int i;
1573         gcpu_bank_logout_t *gbl, *pgbl;
1574         uint64_t status;
1575
1576         if (first < 0 || last < 0)
1577                 return;
1578
1579         for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
1580                 status = gbl->gbl_status;
1581                 if (status == 0)
1582                         continue;
1583                 if (clrstatus == B_FALSE)
1584                         goto serialize;
1585
1586                 /*
1587                  * For i86xpv we always clear status in order to invalidate
1588                  * the interposed telemetry.
1589                  *
1590                  * For native machine checks we always clear status here.  For
1591                  * native polls we must be a little more cautious since there
1592                  * is an outside chance that we may clear telemetry from a
1593                  * shared MCA bank on which a sibling core is machine checking.
1594                  *
1595                  * For polled observations of errors that look like they may
1596                  * produce a machine check (UC/PCC and ENabled, although these
1597                  * do not guarantee a machine check on error occurence)
1598                  * we will not clear the status at this wakeup unless
1599                  * we saw the same status at the previous poll.  We will
1600                  * always process and log the current observations - it
1601                  * is only the clearing of MCi_STATUS which may be
1602                  * deferred until the next wakeup.
1603                  */
1604                 if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
1605                         (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
1606                         goto serialize;
1607                 }
1608
1609                 /*
1610                  * We have a polled observation of a machine check
1611                  * candidate.  If we saw essentially the same status at the
1612                  * last poll then clear the status now since this appears
1613                  * not to be a #MC candidate after all.  If we see quite
1614                  * different status now then do not clear, but reconsider at
1615                  * the next poll.  In no actual machine check clears
1616                  * the status in the interim then the status should not
1617                  * keep changing forever (meaning we'd never clear it)
1618                  * since before long we'll simply have latched the highest-
1619                  * priority error and set the OVerflow bit.  Nonetheless
1620                  * we count how many times we defer clearing and after
1621                  * a while insist on clearing the status.
1622                  */
1623                 pgbl = &pgcl->gcl_data[i];
1624                 if (pgbl->gbl_clrdefcnt != 0) {
1625                         /* We deferred clear on this bank at last wakeup */
1626                         if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
1627                             pgbl->gbl_clrdefcnt > 5) {
1628                                 /*
1629                                  * Status is unchanged so clear it now and,
1630                                  * since we have already logged this info,
1631                                  * avoid logging it again.
1632                                  */
1633                                 gbl->gbl_status = 0;
1634                                 (void) cmi_hdl_wrmsr(hdl,
1635                                     IA32_MSR_MC(i, STATUS), 0ULL);
1636                         } else {
1637                                 /* Record deferral for next wakeup */
1638                                 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
1639                         }
1640                 } else {
1641                         /* Record initial deferral for next wakeup */
1642                         gbl->gbl_clrdefcnt = 1;
1643                         gcpu_deferrred_polled_clears++;
1644                 }
1645
1646 serialize:
1647                 {
1648                         /*
1649                          * Intel Vol 3A says to execute a serializing
1650                          * instruction here, ie CPUID.  Well WRMSR is also
1651                          * defined to be serializing, so the status clear above
1652                          * should suffice.  To be a good citizen, and since
1653                          * some clears are deferred, we'll execute a CPUID
1654                          * instruction here.
1655                          */
1656                         struct cpuid_regs tmp;
1657                         (void) __cpuid_insn(&tmp);
1658                 }
1659         }
1660 }
1661
1662 /*ARGSUSED5*/
1663 void
1664 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
1665     gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
1666 {
1667         gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1668         gcpu_mca_t *mca = &gcpu->gcpu_mca;
1669         int nbanks = mca->gcpu_mca_nbanks;
1670         gcpu_bank_logout_t *gbl, *pgbl;
1671         gcpu_logout_t *gcl, *pgcl;
1672         int ismc = (rp != NULL);
1673         int ispoll = !ismc;
1674         int i, nerr = 0;
1675         cmi_errno_t err;
1676         uint64_t mcg_status;
1677         uint64_t disp;
1678         uint64_t cap;
1679         int first = -1;
1680         int last = -1;
1681         int willpanic = 0;
1682
1683         if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1684             CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
1685             CMI_SUCCESS) {
1686                 if (mcesp != NULL)
1687                         mcesp->mce_nerr = mcesp->mce_disp = 0;
1688                 return;
1689         }
1690
1691         if (ismc) {
1692                 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
1693         } else {
1694                 int pidx = mca->gcpu_mca_nextpoll_idx;
1695                 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
1696                     GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
1697
1698                 gcl = mca->gcpu_mca_logout[pidx];       /* current logout */
1699                 pgcl = mca->gcpu_mca_logout[ppidx];     /* previous logout */
1700                 mca->gcpu_mca_nextpoll_idx = ppidx;     /* switch next time */
1701         }
1702
1703         gcl->gcl_timestamp = gethrtime_waitfree();
1704         gcl->gcl_mcg_status = mcg_status;
1705         gcl->gcl_ip = rp ? rp->r_pc : 0;
1706
1707         gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
1708         if (cap & MCG_CAP_TES_P)
1709                 gcl->gcl_flags |= GCPU_GCL_F_TES_P;
1710
1711         for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1712                 uint64_t status, status2, addr, misc;
1713                 int retries = gcpu_mca_telemetry_retries;
1714
1715                 gbl->gbl_status = 0;
1716                 gbl->gbl_disp = 0;
1717                 gbl->gbl_clrdefcnt = 0;
1718
1719                 /*
1720                  * Only logout from MCA banks we have initialized from at
1721                  * least one core.  If a core shares an MCA bank with another
1722                  * but perhaps lost the race to initialize it, then it must
1723                  * still be allowed to logout from the shared bank.
1724                  */
1725                 if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
1726                         continue;
1727
1728                 /*
1729                  * On a poll look only at the banks we've been asked to check.
1730                  */
1731                 if (rp == NULL && !(bankmask & 1 << i))
1732                         continue;
1733
1734
1735                 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
1736                     CMI_SUCCESS)
1737                         continue;
1738
1739                 gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
1740
1741 retry:
1742                 if (!(status & MSR_MC_STATUS_VAL))
1743                         continue;
1744
1745                 /* First and last bank that have valid status */
1746                 if (first < 0)
1747                         first = i;
1748                 last = i;
1749
1750                 addr = -1;
1751                 misc = 0;
1752
1753                 if ((status & MSR_MC_STATUS_ADDRV) ||
1754                     gcpu_force_addr_in_payload)
1755                         (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
1756
1757                 if (status & MSR_MC_STATUS_MISCV)
1758                         (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
1759
1760                 gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
1761
1762                 /*
1763                  * Allow the model-specific code to extract bank telemetry.
1764                  */
1765                 cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
1766
1767                 /*
1768                  * Not all cpu models assure us that the status/address/misc
1769                  * data will not change during the above sequence of MSR reads,
1770                  * or that it can only change by the addition of the OVerflow
1771                  * bit to the status register.  If the status has changed
1772                  * other than in the overflow bit then we attempt to reread
1773                  * for a consistent snapshot, but eventually give up and
1774                  * go with what we've got.  We only perform this check
1775                  * for a poll - a further #MC during a #MC will reset, and
1776                  * polled errors should not overwrite higher-priority
1777                  * trapping errors (but could set the overflow bit).
1778                  */
1779                 if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1780                     &status2)) == CMI_SUCCESS) {
1781                         if (!STATUS_EQV(status, status2)) {
1782                                 if (retries-- > 0) {
1783                                         status = status2;
1784                                         goto retry;
1785                                 } else {
1786                                         gbl->gbl_disp |=
1787                                             CMI_ERRDISP_INCONSISTENT;
1788                                 }
1789                         }
1790                 } else if (ispoll && err != CMI_SUCCESS) {
1791                         gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
1792                 }
1793
1794                 nerr++;
1795                 gbl->gbl_status = status;
1796                 gbl->gbl_addr = addr;
1797                 gbl->gbl_misc = misc;
1798
1799                 /*
1800                  * For polled observation, if the count of deferred status
1801                  * clears updated in the clear_mc() is nonzero and the
1802                  * MCi_STATUS has not changed, the last wakeup has produced
1803                  * the ereport of the error. Therefore, clear the status in
1804                  * this wakeup to avoid duplicate ereport.
1805                  */
1806                 pgbl = &pgcl->gcl_data[i];
1807                 if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
1808                     pgbl->gbl_clrdefcnt != 0) {
1809                         if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
1810                                 gbl->gbl_status = 0;
1811                                 (void) cmi_hdl_wrmsr(hdl,
1812                                     IA32_MSR_MC(i, STATUS), 0ULL);
1813                         }
1814                 }
1815         }
1816
1817         if (gcpu_mca_stack_flag)
1818                 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
1819         else
1820                 gcl->gcl_stackdepth = 0;
1821
1822         /*
1823          * Decide our disposition for this error or errors, and submit for
1824          * logging and subsequent diagnosis.
1825          */
1826         if (nerr != 0) {
1827                 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
1828
1829                 willpanic = (ismc && cmi_mce_response(rp, disp) == 0);
1830
1831                 if (!willpanic)
1832                         clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
1833         } else {
1834                 disp = 0;
1835                 if (mcesp) {
1836                         mcesp->mce_nerr = mcesp->mce_disp = 0;
1837                 }
1838         }
1839
1840         /*
1841          * Clear MCG_STATUS if MCIP is set (machine check in progress).
1842          * If a second #MC had occured before now the system would have
1843          * reset.  We can only do thise once gcpu_mca_process has copied
1844          * the logout structure.
1845          */
1846         if (ismc && mcg_status & MCG_STATUS_MCIP)
1847                 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
1848
1849         /*
1850          * At this point we have read and logged all telemetry that is visible
1851          * under the MCA.  On architectures for which the NorthBridge is
1852          * on-chip this may include NB-observed errors, but where the NB
1853          * is off chip it may have been the source of the #MC request and
1854          * so we must call into the memory-controller driver to give it
1855          * a chance to log errors.
1856          */
1857         if (ismc) {
1858                 cmi_mc_logout(hdl, 1, willpanic);
1859         }
1860 }
1861
1862 int gcpu_mca_trap_vomit_summary = 0;
1863
1864 /*
1865  * On a native machine check exception we come here from mcetrap via
1866  * cmi_mca_trap.  A machine check on one cpu of a chip does not trap others
1867  * cpus of the chip, so it is possible that another cpu on this chip could
1868  * initiate a poll while we're in the #mc handler;  it is also possible that
1869  * this trap has occured during a poll on this cpu.  So we must acquire
1870  * the chip-wide poll lock, but be careful to avoid deadlock.
1871  *
1872  * The 'data' pointer cannot be NULL due to init order.
1873  */
1874 uint64_t
1875 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
1876 {
1877         gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1878         kmutex_t *poll_lock = NULL;
1879         gcpu_mce_status_t mce;
1880         uint64_t mcg_status;
1881         int tooklock = 0;
1882
1883         if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1884             CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
1885                 return (0);
1886
1887         /*
1888          * Synchronize with any poller from another core that may happen
1889          * to share access to one or more of the MCA banks.
1890          */
1891         if (gcpu->gcpu_shared != NULL)
1892                 poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
1893
1894         if (poll_lock != NULL && !mutex_owned(poll_lock)) {
1895                 /*
1896                  * The lock is not owned by the thread we have
1897                  * interrupted.  Spin for this adaptive lock.
1898                  */
1899                 while (!mutex_tryenter(poll_lock)) {
1900                         while (mutex_owner(poll_lock) != NULL)
1901                                 ;
1902                 }
1903                 tooklock = 1;
1904         }
1905
1906         gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
1907
1908         if (tooklock)
1909                 mutex_exit(poll_lock);
1910
1911         /*
1912          * gcpu_mca_trap_vomit_summary may be set for debug assistance.
1913          */
1914         if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
1915                 cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
1916                     "%u PCC (%u ok), "
1917                     "%u UC (%d ok, %u poisoned), "
1918                     "%u forcefatal, %u ignored",
1919                     mce.mce_nerr, (u_longlong_t)mce.mce_disp,
1920                     mce.mce_npcc, mce.mce_npcc_ok,
1921                     mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
1922                     mce.mce_forcefatal, mce.mce_ignored);
1923         }
1924
1925         return (mce.mce_disp);
1926 }
1927
1928 /*ARGSUSED*/
1929 void
1930 gcpu_faulted_enter(cmi_hdl_t hdl)
1931 {
1932         /* Nothing to do here */
1933 }
1934
1935 /*ARGSUSED*/
1936 void
1937 gcpu_faulted_exit(cmi_hdl_t hdl)
1938 {
1939         gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1940
1941         gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
1942 }
1943
1944 /*
1945  * Write the requested values to the indicated MSRs.  Having no knowledge
1946  * of the model-specific requirements for writing to these model-specific
1947  * registers, we will only blindly write to those MSRs if the 'force'
1948  * argument is nonzero.  That option should only be used in prototyping
1949  * and debugging.
1950  */
1951 /*ARGSUSED*/
1952 cmi_errno_t
1953 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
1954     int force)
1955 {
1956         int i, errs = 0;
1957
1958         for (i = 0; i < nregs; i++) {
1959                 uint_t msr = regs[i].cmr_msrnum;
1960                 uint64_t val = regs[i].cmr_msrval;
1961
1962                 if (cms_present(hdl)) {
1963                         if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
1964                                 errs++;
1965                 } else if (force) {
1966                         errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
1967                 } else {
1968                         errs++;
1969                 }
1970         }
1971
1972         return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
1973 }
1974
1975 /* deconfigure gcpu_mca_init() */
1976 void
1977 gcpu_mca_fini(cmi_hdl_t hdl)
1978 {
1979         gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1980         gcpu_mca_t *mca = &gcpu->gcpu_mca;
1981         int i;
1982
1983         /*
1984          * CPU startup code only calls cmi_mca_init if x86_featureset indicates
1985          * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
1986          * processors, which have their own more primitive way of doing
1987          * machine checks, will not have cmi_mca_init called since their
1988          * CPUID information will not indicate both MCA and MCE features.
1989          */
1990         if (!is_x86_feature(x86_featureset, X86FSET_MCA))
1991                 return;
1992         /*
1993          * disable machine check in CR4
1994          */
1995         cmi_ntv_hwdisable_mce(hdl);
1996         mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1997         gcpu_mca_poll_fini(hdl);
1998         mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
1999
2000         /*
2001          * free resources allocated during init
2002          */
2003         if (mca->gcpu_bank_cmci != NULL) {
2004                 kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) *
2005                     mca->gcpu_mca_nbanks);
2006         }
2007
2008         for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
2009                 if (mca->gcpu_mca_logout[i] != NULL) {
2010                         kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz);
2011                 }
2012         }
2013
2014         if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) {
2015                 kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg,
2016                     sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks);
2017         }
2018 }