drm/radeon: remove cayman_gpu_is_lockup
[linux-2.6/libata-dev.git] / drivers / edac / mce_amd.c
blobd0c372e30de41766cac54568c12d2994738cbf38
1 #include <linux/module.h>
2 #include <linux/slab.h>
4 #include "mce_amd.h"
6 static struct amd_decoder_ops *fam_ops;
8 static u8 xec_mask = 0xf;
9 static u8 nb_err_cpumask = 0xf;
11 static bool report_gart_errors;
12 static void (*nb_bus_decoder)(int node_id, struct mce *m);
14 void amd_report_gart_errors(bool v)
16 report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
22 nb_bus_decoder = f;
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
28 if (nb_bus_decoder) {
29 WARN_ON(nb_bus_decoder != f);
31 nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
/*
 * String representations of the different MCA reported error types, see
 * F3x48 or MSR0000_0411. Each table is indexed by the value of the
 * corresponding error-code field.
 */

/* transaction type */
const char * const tt_msgs[] = {
	"INSN",
	"DATA",
	"GEN",
	"RESV"
};
EXPORT_SYMBOL_GPL(tt_msgs);

/* cache level */
const char * const ll_msgs[] = {
	"RESV",
	"L1",
	"L2",
	"L3/GEN"
};
EXPORT_SYMBOL_GPL(ll_msgs);

/* memory transaction type */
const char * const rrrr_msgs[] = {
	"GEN",
	"RD",
	"WR",
	"DRD",
	"DWR",
	"IRD",
	"PRF",
	"EV",
	"SNP"
};
EXPORT_SYMBOL_GPL(rrrr_msgs);

/* participating processor */
const char * const pp_msgs[] = {
	"SRC",
	"RES",
	"OBS",
	"GEN"
};
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
const char * const to_msgs[] = {
	"no timeout",
	"timed out"
};
EXPORT_SYMBOL_GPL(to_msgs);

/* memory or i/o */
const char * const ii_msgs[] = {
	"MEM",
	"RESV",
	"IO",
	"GEN"
};
EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Family 0x15 instruction cache error descriptions. The table is packed:
 * f15h_ic_mce() indexes it with xec directly for 0x0-0xa, with xec - 2 for
 * 0xd and with xec - 4 for 0x10-0x14.
 */
static const char * const f15h_ic_mce_desc[] = {
	[0]  = "UC during a demand linefill from L2",		/* xec = 0x0 */
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",		/* xec = 0xa */
	[11] = "PFB valid bit parity error",			/* xec = 0xd */
	[12] = "Microcode Patch Buffer",			/* xec = 0x10 */
	[13] = "uop queue",
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO"				/* xec = 0x14 */
};
/*
 * Family 0x15 combined unit memory-error descriptions. The table is packed:
 * amd_decode_cu_mce() indexes it with xec - 0x4 for 0x4-0xc and with
 * xec - 0x7 for 0x10-0x14.
 */
static const char * const f15h_cu_mce_desc[] = {
	[0]  = "Fill ECC error on data fills",			/* xec = 0x4 */
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",			/* xec = 0xc */
	[9]  = "L2 Tag ECC error",				/* xec = 0x10 */
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error"			/* xec = 0x14 */
};
/*
 * Northbridge error descriptions. amd_decode_nb_mce() indexes the table with
 * xec directly for 0x0-0xe and with xec - 13 for 0x1c-0x1f.
 */
static const char * const nb_mce_desc[] = {
	[0]  = "DRAM ECC error detected on the NB",		/* xec = 0x0 */
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",			/* xec = 0xe */
	[15] = "L3 data cache ECC error",			/* xec = 0x1c */
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory"	/* xec = 0x1f */
};
/*
 * Bank 5 (FIROB / Execution Unit) error descriptions, indexed directly by
 * the extended error code (0x0-0xc) in amd_decode_fr_mce().
 */
static const char * const fr_ex_mce_desc[] = {
	[0x0] = "CPU Watchdog timer expire",
	[0x1] = "Wakeup array dest tag",
	[0x2] = "AG payload array",
	[0x3] = "EX payload array",
	[0x4] = "IDRF array",
	[0x5] = "Retire dispatch queue",
	[0x6] = "Mapper checkpoint array",
	[0x7] = "Physical register file EX0 port",
	[0x8] = "Physical register file EX1 port",
	[0x9] = "Physical register file AG0 port",
	[0xa] = "Physical register file AG1 port",
	[0xb] = "Flag register file",
	[0xc] = "DE error occurred"
};
142 static bool f12h_dc_mce(u16 ec, u8 xec)
144 bool ret = false;
146 if (MEM_ERROR(ec)) {
147 u8 ll = LL(ec);
148 ret = true;
150 if (ll == LL_L2)
151 pr_cont("during L1 linefill from L2.\n");
152 else if (ll == LL_L1)
153 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
154 else
155 ret = false;
157 return ret;
160 static bool f10h_dc_mce(u16 ec, u8 xec)
162 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
163 pr_cont("during data scrub.\n");
164 return true;
166 return f12h_dc_mce(ec, xec);
169 static bool k8_dc_mce(u16 ec, u8 xec)
171 if (BUS_ERROR(ec)) {
172 pr_cont("during system linefill.\n");
173 return true;
176 return f10h_dc_mce(ec, xec);
179 static bool f14h_dc_mce(u16 ec, u8 xec)
181 u8 r4 = R4(ec);
182 bool ret = true;
184 if (MEM_ERROR(ec)) {
186 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
187 return false;
189 switch (r4) {
190 case R4_DRD:
191 case R4_DWR:
192 pr_cont("Data/Tag parity error due to %s.\n",
193 (r4 == R4_DRD ? "load/hw prf" : "store"));
194 break;
195 case R4_EVICT:
196 pr_cont("Copyback parity error on a tag miss.\n");
197 break;
198 case R4_SNOOP:
199 pr_cont("Tag parity error during snoop.\n");
200 break;
201 default:
202 ret = false;
204 } else if (BUS_ERROR(ec)) {
206 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
207 return false;
209 pr_cont("System read data error on a ");
211 switch (r4) {
212 case R4_RD:
213 pr_cont("TLB reload.\n");
214 break;
215 case R4_DWR:
216 pr_cont("store.\n");
217 break;
218 case R4_DRD:
219 pr_cont("load.\n");
220 break;
221 default:
222 ret = false;
224 } else {
225 ret = false;
228 return ret;
231 static bool f15h_dc_mce(u16 ec, u8 xec)
233 bool ret = true;
235 if (MEM_ERROR(ec)) {
237 switch (xec) {
238 case 0x0:
239 pr_cont("Data Array access error.\n");
240 break;
242 case 0x1:
243 pr_cont("UC error during a linefill from L2/NB.\n");
244 break;
246 case 0x2:
247 case 0x11:
248 pr_cont("STQ access error.\n");
249 break;
251 case 0x3:
252 pr_cont("SCB access error.\n");
253 break;
255 case 0x10:
256 pr_cont("Tag error.\n");
257 break;
259 case 0x12:
260 pr_cont("LDQ access error.\n");
261 break;
263 default:
264 ret = false;
266 } else if (BUS_ERROR(ec)) {
268 if (!xec)
269 pr_cont("System Read Data Error.\n");
270 else
271 pr_cont(" Internal error condition type %d.\n", xec);
272 } else
273 ret = false;
275 return ret;
278 static void amd_decode_dc_mce(struct mce *m)
280 u16 ec = EC(m->status);
281 u8 xec = XEC(m->status, xec_mask);
283 pr_emerg(HW_ERR "Data Cache Error: ");
285 /* TLB error signatures are the same across families */
286 if (TLB_ERROR(ec)) {
287 if (TT(ec) == TT_DATA) {
288 pr_cont("%s TLB %s.\n", LL_MSG(ec),
289 ((xec == 2) ? "locked miss"
290 : (xec ? "multimatch" : "parity")));
291 return;
293 } else if (fam_ops->dc_mce(ec, xec))
295 else
296 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
299 static bool k8_ic_mce(u16 ec, u8 xec)
301 u8 ll = LL(ec);
302 bool ret = true;
304 if (!MEM_ERROR(ec))
305 return false;
307 if (ll == 0x2)
308 pr_cont("during a linefill from L2.\n");
309 else if (ll == 0x1) {
310 switch (R4(ec)) {
311 case R4_IRD:
312 pr_cont("Parity error during data load.\n");
313 break;
315 case R4_EVICT:
316 pr_cont("Copyback Parity/Victim error.\n");
317 break;
319 case R4_SNOOP:
320 pr_cont("Tag Snoop error.\n");
321 break;
323 default:
324 ret = false;
325 break;
327 } else
328 ret = false;
330 return ret;
333 static bool f14h_ic_mce(u16 ec, u8 xec)
335 u8 r4 = R4(ec);
336 bool ret = true;
338 if (MEM_ERROR(ec)) {
339 if (TT(ec) != 0 || LL(ec) != 1)
340 ret = false;
342 if (r4 == R4_IRD)
343 pr_cont("Data/tag array parity error for a tag hit.\n");
344 else if (r4 == R4_SNOOP)
345 pr_cont("Tag error during snoop/victimization.\n");
346 else
347 ret = false;
349 return ret;
352 static bool f15h_ic_mce(u16 ec, u8 xec)
354 bool ret = true;
356 if (!MEM_ERROR(ec))
357 return false;
359 switch (xec) {
360 case 0x0 ... 0xa:
361 pr_cont("%s.\n", f15h_ic_mce_desc[xec]);
362 break;
364 case 0xd:
365 pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]);
366 break;
368 case 0x10:
369 pr_cont("%s.\n", f15h_ic_mce_desc[xec-4]);
370 break;
372 case 0x11 ... 0x14:
373 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]);
374 break;
376 default:
377 ret = false;
379 return ret;
382 static void amd_decode_ic_mce(struct mce *m)
384 u16 ec = EC(m->status);
385 u8 xec = XEC(m->status, xec_mask);
387 pr_emerg(HW_ERR "Instruction Cache Error: ");
389 if (TLB_ERROR(ec))
390 pr_cont("%s TLB %s.\n", LL_MSG(ec),
391 (xec ? "multimatch" : "parity error"));
392 else if (BUS_ERROR(ec)) {
393 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
395 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
396 } else if (fam_ops->ic_mce(ec, xec))
398 else
399 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
402 static void amd_decode_bu_mce(struct mce *m)
404 u16 ec = EC(m->status);
405 u8 xec = XEC(m->status, xec_mask);
407 pr_emerg(HW_ERR "Bus Unit Error");
409 if (xec == 0x1)
410 pr_cont(" in the write data buffers.\n");
411 else if (xec == 0x3)
412 pr_cont(" in the victim data buffers.\n");
413 else if (xec == 0x2 && MEM_ERROR(ec))
414 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
415 else if (xec == 0x0) {
416 if (TLB_ERROR(ec))
417 pr_cont(": %s error in a Page Descriptor Cache or "
418 "Guest TLB.\n", TT_MSG(ec));
419 else if (BUS_ERROR(ec))
420 pr_cont(": %s/ECC error in data read from NB: %s.\n",
421 R4_MSG(ec), PP_MSG(ec));
422 else if (MEM_ERROR(ec)) {
423 u8 r4 = R4(ec);
425 if (r4 >= 0x7)
426 pr_cont(": %s error during data copyback.\n",
427 R4_MSG(ec));
428 else if (r4 <= 0x1)
429 pr_cont(": %s parity/ECC error during data "
430 "access from L2.\n", R4_MSG(ec));
431 else
432 goto wrong_bu_mce;
433 } else
434 goto wrong_bu_mce;
435 } else
436 goto wrong_bu_mce;
438 return;
440 wrong_bu_mce:
441 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
444 static void amd_decode_cu_mce(struct mce *m)
446 u16 ec = EC(m->status);
447 u8 xec = XEC(m->status, xec_mask);
449 pr_emerg(HW_ERR "Combined Unit Error: ");
451 if (TLB_ERROR(ec)) {
452 if (xec == 0x0)
453 pr_cont("Data parity TLB read error.\n");
454 else if (xec == 0x1)
455 pr_cont("Poison data provided for TLB fill.\n");
456 else
457 goto wrong_cu_mce;
458 } else if (BUS_ERROR(ec)) {
459 if (xec > 2)
460 goto wrong_cu_mce;
462 pr_cont("Error during attempted NB data read.\n");
463 } else if (MEM_ERROR(ec)) {
464 switch (xec) {
465 case 0x4 ... 0xc:
466 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]);
467 break;
469 case 0x10 ... 0x14:
470 pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]);
471 break;
473 default:
474 goto wrong_cu_mce;
478 return;
480 wrong_cu_mce:
481 pr_emerg(HW_ERR "Corrupted CU MCE info?\n");
484 static void amd_decode_ls_mce(struct mce *m)
486 u16 ec = EC(m->status);
487 u8 xec = XEC(m->status, xec_mask);
489 if (boot_cpu_data.x86 >= 0x14) {
490 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
491 " please report on LKML.\n");
492 return;
495 pr_emerg(HW_ERR "Load Store Error");
497 if (xec == 0x0) {
498 u8 r4 = R4(ec);
500 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
501 goto wrong_ls_mce;
503 pr_cont(" during %s.\n", R4_MSG(ec));
504 } else
505 goto wrong_ls_mce;
507 return;
509 wrong_ls_mce:
510 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
513 void amd_decode_nb_mce(struct mce *m)
515 struct cpuinfo_x86 *c = &boot_cpu_data;
516 int node_id = amd_get_nb_id(m->extcpu);
517 u16 ec = EC(m->status);
518 u8 xec = XEC(m->status, 0x1f);
519 u8 offset = 0;
521 pr_emerg(HW_ERR "Northbridge Error (node %d): ", node_id);
523 switch (xec) {
524 case 0x0 ... 0xe:
526 /* special handling for DRAM ECCs */
527 if (xec == 0x0 || xec == 0x8) {
528 /* no ECCs on F11h */
529 if (c->x86 == 0x11)
530 goto wrong_nb_mce;
532 pr_cont("%s.\n", nb_mce_desc[xec]);
534 if (nb_bus_decoder)
535 nb_bus_decoder(node_id, m);
536 return;
538 break;
540 case 0xf:
541 if (TLB_ERROR(ec))
542 pr_cont("GART Table Walk data error.\n");
543 else if (BUS_ERROR(ec))
544 pr_cont("DMA Exclusion Vector Table Walk error.\n");
545 else
546 goto wrong_nb_mce;
547 return;
549 case 0x19:
550 if (boot_cpu_data.x86 == 0x15)
551 pr_cont("Compute Unit Data Error.\n");
552 else
553 goto wrong_nb_mce;
554 return;
556 case 0x1c ... 0x1f:
557 offset = 13;
558 break;
560 default:
561 goto wrong_nb_mce;
564 pr_cont("%s.\n", nb_mce_desc[xec - offset]);
565 return;
567 wrong_nb_mce:
568 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
570 EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
572 static void amd_decode_fr_mce(struct mce *m)
574 struct cpuinfo_x86 *c = &boot_cpu_data;
575 u8 xec = XEC(m->status, xec_mask);
577 if (c->x86 == 0xf || c->x86 == 0x11)
578 goto wrong_fr_mce;
580 pr_emerg(HW_ERR "%s Error: ",
581 (c->x86 == 0x15 ? "Execution Unit" : "FIROB"));
583 if (xec == 0x0 || xec == 0xc)
584 pr_cont("%s.\n", fr_ex_mce_desc[xec]);
585 else if (xec < 0xd)
586 pr_cont("%s parity error.\n", fr_ex_mce_desc[xec]);
587 else
588 goto wrong_fr_mce;
590 return;
592 wrong_fr_mce:
593 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
596 static void amd_decode_fp_mce(struct mce *m)
598 u8 xec = XEC(m->status, xec_mask);
600 pr_emerg(HW_ERR "Floating Point Unit Error: ");
602 switch (xec) {
603 case 0x1:
604 pr_cont("Free List");
605 break;
607 case 0x2:
608 pr_cont("Physical Register File");
609 break;
611 case 0x3:
612 pr_cont("Retire Queue");
613 break;
615 case 0x4:
616 pr_cont("Scheduler table");
617 break;
619 case 0x5:
620 pr_cont("Status Register File");
621 break;
623 default:
624 goto wrong_fp_mce;
625 break;
628 pr_cont(" parity error.\n");
630 return;
632 wrong_fp_mce:
633 pr_emerg(HW_ERR "Corrupted FP MCE info?\n");
636 static inline void amd_decode_err_code(u16 ec)
639 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
641 if (BUS_ERROR(ec))
642 pr_cont(", mem/io: %s", II_MSG(ec));
643 else
644 pr_cont(", tx: %s", TT_MSG(ec));
646 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
647 pr_cont(", mem-tx: %s", R4_MSG(ec));
649 if (BUS_ERROR(ec))
650 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
653 pr_cont("\n");
657 * Filter out unwanted MCE signatures here.
659 static bool amd_filter_mce(struct mce *m)
661 u8 xec = (m->status >> 16) & 0x1f;
664 * NB GART TLB error reporting is disabled by default.
666 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
667 return true;
669 return false;
672 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
674 struct mce *m = (struct mce *)data;
675 struct cpuinfo_x86 *c = &boot_cpu_data;
676 int ecc;
678 if (amd_filter_mce(m))
679 return NOTIFY_STOP;
681 pr_emerg(HW_ERR "CPU:%d\tMC%d_STATUS[%s|%s|%s|%s|%s",
682 m->extcpu, m->bank,
683 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
684 ((m->status & MCI_STATUS_UC) ? "UE" : "CE"),
685 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
686 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
687 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
689 if (c->x86 == 0x15)
690 pr_cont("|%s|%s",
691 ((m->status & BIT_64(44)) ? "Deferred" : "-"),
692 ((m->status & BIT_64(43)) ? "Poison" : "-"));
694 /* do the two bits[14:13] together */
695 ecc = (m->status >> 45) & 0x3;
696 if (ecc)
697 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
699 pr_cont("]: 0x%016llx\n", m->status);
701 if (m->status & MCI_STATUS_ADDRV)
702 pr_emerg(HW_ERR "\tMC%d_ADDR: 0x%016llx\n", m->bank, m->addr);
704 switch (m->bank) {
705 case 0:
706 amd_decode_dc_mce(m);
707 break;
709 case 1:
710 amd_decode_ic_mce(m);
711 break;
713 case 2:
714 if (c->x86 == 0x15)
715 amd_decode_cu_mce(m);
716 else
717 amd_decode_bu_mce(m);
718 break;
720 case 3:
721 amd_decode_ls_mce(m);
722 break;
724 case 4:
725 amd_decode_nb_mce(m);
726 break;
728 case 5:
729 amd_decode_fr_mce(m);
730 break;
732 case 6:
733 amd_decode_fp_mce(m);
734 break;
736 default:
737 break;
740 amd_decode_err_code(m->status & 0xffff);
742 return NOTIFY_STOP;
744 EXPORT_SYMBOL_GPL(amd_decode_mce);
746 static struct notifier_block amd_mce_dec_nb = {
747 .notifier_call = amd_decode_mce,
750 static int __init mce_amd_init(void)
752 struct cpuinfo_x86 *c = &boot_cpu_data;
754 if (c->x86_vendor != X86_VENDOR_AMD)
755 return 0;
757 if (c->x86 < 0xf || c->x86 > 0x15)
758 return 0;
760 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
761 if (!fam_ops)
762 return -ENOMEM;
764 switch (c->x86) {
765 case 0xf:
766 fam_ops->dc_mce = k8_dc_mce;
767 fam_ops->ic_mce = k8_ic_mce;
768 break;
770 case 0x10:
771 fam_ops->dc_mce = f10h_dc_mce;
772 fam_ops->ic_mce = k8_ic_mce;
773 break;
775 case 0x11:
776 fam_ops->dc_mce = k8_dc_mce;
777 fam_ops->ic_mce = k8_ic_mce;
778 break;
780 case 0x12:
781 fam_ops->dc_mce = f12h_dc_mce;
782 fam_ops->ic_mce = k8_ic_mce;
783 break;
785 case 0x14:
786 nb_err_cpumask = 0x3;
787 fam_ops->dc_mce = f14h_dc_mce;
788 fam_ops->ic_mce = f14h_ic_mce;
789 break;
791 case 0x15:
792 xec_mask = 0x1f;
793 fam_ops->dc_mce = f15h_dc_mce;
794 fam_ops->ic_mce = f15h_ic_mce;
795 break;
797 default:
798 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
799 kfree(fam_ops);
800 return -EINVAL;
803 pr_info("MCE: In-kernel MCE decoding enabled.\n");
805 mce_register_decode_chain(&amd_mce_dec_nb);
807 return 0;
809 early_initcall(mce_amd_init);
#ifdef MODULE
/*
 * Module teardown: take the decoder off the MCE decode chain, then free the
 * per-family ops allocated by mce_amd_init().
 */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif