slab/slub: struct memcg_params
[linux-2.6.git] / drivers / edac / mce_amd.c
blobad637572d8c77420c6da8e213890adb4f077e2fe
1 #include <linux/module.h>
2 #include <linux/slab.h>
4 #include "mce_amd.h"
6 static struct amd_decoder_ops *fam_ops;
8 static u8 xec_mask = 0xf;
9 static u8 nb_err_cpumask = 0xf;
11 static bool report_gart_errors;
12 static void (*nb_bus_decoder)(int node_id, struct mce *m);
14 void amd_report_gart_errors(bool v)
16 report_gart_errors = v;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
20 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
22 nb_bus_decoder = f;
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
26 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
28 if (nb_bus_decoder) {
29 WARN_ON(nb_bus_decoder != f);
31 nb_bus_decoder = NULL;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
/*
 * String representations of the different MCA reported error types, see
 * F3x48 or MSR0000_0411.
 */

/* transaction type */
const char * const tt_msgs[] = {
	"INSN",
	"DATA",
	"GEN",
	"RESV",
};
EXPORT_SYMBOL_GPL(tt_msgs);

/* cache level */
const char * const ll_msgs[] = {
	"RESV",
	"L1",
	"L2",
	"L3/GEN",
};
EXPORT_SYMBOL_GPL(ll_msgs);

/* memory transaction type */
const char * const rrrr_msgs[] = {
	"GEN",
	"RD",
	"WR",
	"DRD",
	"DWR",
	"IRD",
	"PRF",
	"EV",
	"SNP",
};
EXPORT_SYMBOL_GPL(rrrr_msgs);

/* participating processor */
const char * const pp_msgs[] = {
	"SRC",
	"RES",
	"OBS",
	"GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
const char * const to_msgs[] = {
	"no timeout",
	"timed out",
};
EXPORT_SYMBOL_GPL(to_msgs);

/* memory or i/o */
const char * const ii_msgs[] = {
	"MEM",
	"RESV",
	"IO",
	"GEN",
};
EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Family 0x15 MC1 error descriptions. The table is packed, so
 * f15h_mc1_mce() subtracts an offset from xec for the ranges that follow
 * a gap in the encoding (0xd -> index 11, 0x10..0x14 -> index 12..16).
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",		/* xec = 0x0 */
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",		/* xec = 0xa */
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 0x10 */
	"uop queue",					/* xec = 0x11 */
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",				/* xec = 0x14 */
};
/*
 * Family 0x15 MC2 memory-error descriptions. decode_f15_mc2_mce() maps
 * xec 0x4..0xc onto indices 0..8 and xec 0x10..0x14 onto indices 9..13.
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",			/* xec = 0xc */
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error",			/* xec = 0x14 */
};
/*
 * MC4 (northbridge) error descriptions. decode_mc4_mce() indexes the
 * first 15 entries directly with xec 0x0..0xe and reaches the last four
 * through xec 0x1c..0x1f minus an offset of 13.
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",		/* xec = 0x0 */
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",		/* xec = 0x8 */
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",			/* xec = 0xe */
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory",	/* xec = 0x1f */
};
/*
 * MC5 error descriptions, indexed directly by xec (0x0..0xc) in
 * decode_mc5_mce(). Entries 0x1..0xb are printed with a " parity error"
 * suffix there; the first and last entry are printed as-is.
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",			/* xec = 0x0 */
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",				/* xec = 0xc */
};
142 static bool f12h_mc0_mce(u16 ec, u8 xec)
144 bool ret = false;
146 if (MEM_ERROR(ec)) {
147 u8 ll = LL(ec);
148 ret = true;
150 if (ll == LL_L2)
151 pr_cont("during L1 linefill from L2.\n");
152 else if (ll == LL_L1)
153 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
154 else
155 ret = false;
157 return ret;
160 static bool f10h_mc0_mce(u16 ec, u8 xec)
162 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
163 pr_cont("during data scrub.\n");
164 return true;
166 return f12h_mc0_mce(ec, xec);
169 static bool k8_mc0_mce(u16 ec, u8 xec)
171 if (BUS_ERROR(ec)) {
172 pr_cont("during system linefill.\n");
173 return true;
176 return f10h_mc0_mce(ec, xec);
179 static bool f14h_mc0_mce(u16 ec, u8 xec)
181 u8 r4 = R4(ec);
182 bool ret = true;
184 if (MEM_ERROR(ec)) {
186 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
187 return false;
189 switch (r4) {
190 case R4_DRD:
191 case R4_DWR:
192 pr_cont("Data/Tag parity error due to %s.\n",
193 (r4 == R4_DRD ? "load/hw prf" : "store"));
194 break;
195 case R4_EVICT:
196 pr_cont("Copyback parity error on a tag miss.\n");
197 break;
198 case R4_SNOOP:
199 pr_cont("Tag parity error during snoop.\n");
200 break;
201 default:
202 ret = false;
204 } else if (BUS_ERROR(ec)) {
206 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
207 return false;
209 pr_cont("System read data error on a ");
211 switch (r4) {
212 case R4_RD:
213 pr_cont("TLB reload.\n");
214 break;
215 case R4_DWR:
216 pr_cont("store.\n");
217 break;
218 case R4_DRD:
219 pr_cont("load.\n");
220 break;
221 default:
222 ret = false;
224 } else {
225 ret = false;
228 return ret;
231 static bool f15h_mc0_mce(u16 ec, u8 xec)
233 bool ret = true;
235 if (MEM_ERROR(ec)) {
237 switch (xec) {
238 case 0x0:
239 pr_cont("Data Array access error.\n");
240 break;
242 case 0x1:
243 pr_cont("UC error during a linefill from L2/NB.\n");
244 break;
246 case 0x2:
247 case 0x11:
248 pr_cont("STQ access error.\n");
249 break;
251 case 0x3:
252 pr_cont("SCB access error.\n");
253 break;
255 case 0x10:
256 pr_cont("Tag error.\n");
257 break;
259 case 0x12:
260 pr_cont("LDQ access error.\n");
261 break;
263 default:
264 ret = false;
266 } else if (BUS_ERROR(ec)) {
268 if (!xec)
269 pr_cont("System Read Data Error.\n");
270 else
271 pr_cont(" Internal error condition type %d.\n", xec);
272 } else
273 ret = false;
275 return ret;
278 static void decode_mc0_mce(struct mce *m)
280 u16 ec = EC(m->status);
281 u8 xec = XEC(m->status, xec_mask);
283 pr_emerg(HW_ERR "MC0 Error: ");
285 /* TLB error signatures are the same across families */
286 if (TLB_ERROR(ec)) {
287 if (TT(ec) == TT_DATA) {
288 pr_cont("%s TLB %s.\n", LL_MSG(ec),
289 ((xec == 2) ? "locked miss"
290 : (xec ? "multimatch" : "parity")));
291 return;
293 } else if (fam_ops->mc0_mce(ec, xec))
295 else
296 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
299 static bool k8_mc1_mce(u16 ec, u8 xec)
301 u8 ll = LL(ec);
302 bool ret = true;
304 if (!MEM_ERROR(ec))
305 return false;
307 if (ll == 0x2)
308 pr_cont("during a linefill from L2.\n");
309 else if (ll == 0x1) {
310 switch (R4(ec)) {
311 case R4_IRD:
312 pr_cont("Parity error during data load.\n");
313 break;
315 case R4_EVICT:
316 pr_cont("Copyback Parity/Victim error.\n");
317 break;
319 case R4_SNOOP:
320 pr_cont("Tag Snoop error.\n");
321 break;
323 default:
324 ret = false;
325 break;
327 } else
328 ret = false;
330 return ret;
333 static bool f14h_mc1_mce(u16 ec, u8 xec)
335 u8 r4 = R4(ec);
336 bool ret = true;
338 if (MEM_ERROR(ec)) {
339 if (TT(ec) != 0 || LL(ec) != 1)
340 ret = false;
342 if (r4 == R4_IRD)
343 pr_cont("Data/tag array parity error for a tag hit.\n");
344 else if (r4 == R4_SNOOP)
345 pr_cont("Tag error during snoop/victimization.\n");
346 else
347 ret = false;
349 return ret;
352 static bool f15h_mc1_mce(u16 ec, u8 xec)
354 bool ret = true;
356 if (!MEM_ERROR(ec))
357 return false;
359 switch (xec) {
360 case 0x0 ... 0xa:
361 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
362 break;
364 case 0xd:
365 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
366 break;
368 case 0x10:
369 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
370 break;
372 case 0x11 ... 0x14:
373 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
374 break;
376 default:
377 ret = false;
379 return ret;
382 static void decode_mc1_mce(struct mce *m)
384 u16 ec = EC(m->status);
385 u8 xec = XEC(m->status, xec_mask);
387 pr_emerg(HW_ERR "MC1 Error: ");
389 if (TLB_ERROR(ec))
390 pr_cont("%s TLB %s.\n", LL_MSG(ec),
391 (xec ? "multimatch" : "parity error"));
392 else if (BUS_ERROR(ec)) {
393 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
395 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
396 } else if (fam_ops->mc1_mce(ec, xec))
398 else
399 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
402 static void decode_mc2_mce(struct mce *m)
404 u16 ec = EC(m->status);
405 u8 xec = XEC(m->status, xec_mask);
407 pr_emerg(HW_ERR "MC2 Error");
409 if (xec == 0x1)
410 pr_cont(" in the write data buffers.\n");
411 else if (xec == 0x3)
412 pr_cont(" in the victim data buffers.\n");
413 else if (xec == 0x2 && MEM_ERROR(ec))
414 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
415 else if (xec == 0x0) {
416 if (TLB_ERROR(ec))
417 pr_cont(": %s error in a Page Descriptor Cache or "
418 "Guest TLB.\n", TT_MSG(ec));
419 else if (BUS_ERROR(ec))
420 pr_cont(": %s/ECC error in data read from NB: %s.\n",
421 R4_MSG(ec), PP_MSG(ec));
422 else if (MEM_ERROR(ec)) {
423 u8 r4 = R4(ec);
425 if (r4 >= 0x7)
426 pr_cont(": %s error during data copyback.\n",
427 R4_MSG(ec));
428 else if (r4 <= 0x1)
429 pr_cont(": %s parity/ECC error during data "
430 "access from L2.\n", R4_MSG(ec));
431 else
432 goto wrong_mc2_mce;
433 } else
434 goto wrong_mc2_mce;
435 } else
436 goto wrong_mc2_mce;
438 return;
440 wrong_mc2_mce:
441 pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n");
444 static void decode_f15_mc2_mce(struct mce *m)
446 u16 ec = EC(m->status);
447 u8 xec = XEC(m->status, xec_mask);
449 pr_emerg(HW_ERR "MC2 Error: ");
451 if (TLB_ERROR(ec)) {
452 if (xec == 0x0)
453 pr_cont("Data parity TLB read error.\n");
454 else if (xec == 0x1)
455 pr_cont("Poison data provided for TLB fill.\n");
456 else
457 goto wrong_f15_mc2_mce;
458 } else if (BUS_ERROR(ec)) {
459 if (xec > 2)
460 goto wrong_f15_mc2_mce;
462 pr_cont("Error during attempted NB data read.\n");
463 } else if (MEM_ERROR(ec)) {
464 switch (xec) {
465 case 0x4 ... 0xc:
466 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
467 break;
469 case 0x10 ... 0x14:
470 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
471 break;
473 default:
474 goto wrong_f15_mc2_mce;
478 return;
480 wrong_f15_mc2_mce:
481 pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n");
484 static void decode_mc3_mce(struct mce *m)
486 u16 ec = EC(m->status);
487 u8 xec = XEC(m->status, xec_mask);
489 if (boot_cpu_data.x86 >= 0x14) {
490 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
491 " please report on LKML.\n");
492 return;
495 pr_emerg(HW_ERR "MC3 Error");
497 if (xec == 0x0) {
498 u8 r4 = R4(ec);
500 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
501 goto wrong_mc3_mce;
503 pr_cont(" during %s.\n", R4_MSG(ec));
504 } else
505 goto wrong_mc3_mce;
507 return;
509 wrong_mc3_mce:
510 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
513 static void decode_mc4_mce(struct mce *m)
515 struct cpuinfo_x86 *c = &boot_cpu_data;
516 int node_id = amd_get_nb_id(m->extcpu);
517 u16 ec = EC(m->status);
518 u8 xec = XEC(m->status, 0x1f);
519 u8 offset = 0;
521 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
523 switch (xec) {
524 case 0x0 ... 0xe:
526 /* special handling for DRAM ECCs */
527 if (xec == 0x0 || xec == 0x8) {
528 /* no ECCs on F11h */
529 if (c->x86 == 0x11)
530 goto wrong_mc4_mce;
532 pr_cont("%s.\n", mc4_mce_desc[xec]);
534 if (nb_bus_decoder)
535 nb_bus_decoder(node_id, m);
536 return;
538 break;
540 case 0xf:
541 if (TLB_ERROR(ec))
542 pr_cont("GART Table Walk data error.\n");
543 else if (BUS_ERROR(ec))
544 pr_cont("DMA Exclusion Vector Table Walk error.\n");
545 else
546 goto wrong_mc4_mce;
547 return;
549 case 0x19:
550 if (boot_cpu_data.x86 == 0x15)
551 pr_cont("Compute Unit Data Error.\n");
552 else
553 goto wrong_mc4_mce;
554 return;
556 case 0x1c ... 0x1f:
557 offset = 13;
558 break;
560 default:
561 goto wrong_mc4_mce;
564 pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
565 return;
567 wrong_mc4_mce:
568 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
571 static void decode_mc5_mce(struct mce *m)
573 struct cpuinfo_x86 *c = &boot_cpu_data;
574 u8 xec = XEC(m->status, xec_mask);
576 if (c->x86 == 0xf || c->x86 == 0x11)
577 goto wrong_mc5_mce;
579 pr_emerg(HW_ERR "MC5 Error: ");
581 if (xec == 0x0 || xec == 0xc)
582 pr_cont("%s.\n", mc5_mce_desc[xec]);
583 else if (xec < 0xd)
584 pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
585 else
586 goto wrong_mc5_mce;
588 return;
590 wrong_mc5_mce:
591 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
594 static void decode_mc6_mce(struct mce *m)
596 u8 xec = XEC(m->status, xec_mask);
598 pr_emerg(HW_ERR "MC6 Error: ");
600 switch (xec) {
601 case 0x1:
602 pr_cont("Free List");
603 break;
605 case 0x2:
606 pr_cont("Physical Register File");
607 break;
609 case 0x3:
610 pr_cont("Retire Queue");
611 break;
613 case 0x4:
614 pr_cont("Scheduler table");
615 break;
617 case 0x5:
618 pr_cont("Status Register File");
619 break;
621 default:
622 goto wrong_mc6_mce;
623 break;
626 pr_cont(" parity error.\n");
628 return;
630 wrong_mc6_mce:
631 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
634 static inline void amd_decode_err_code(u16 ec)
637 pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
639 if (BUS_ERROR(ec))
640 pr_cont(", mem/io: %s", II_MSG(ec));
641 else
642 pr_cont(", tx: %s", TT_MSG(ec));
644 if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
645 pr_cont(", mem-tx: %s", R4_MSG(ec));
647 if (BUS_ERROR(ec))
648 pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
651 pr_cont("\n");
655 * Filter out unwanted MCE signatures here.
657 static bool amd_filter_mce(struct mce *m)
659 u8 xec = (m->status >> 16) & 0x1f;
662 * NB GART TLB error reporting is disabled by default.
664 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
665 return true;
667 return false;
670 static const char *decode_error_status(struct mce *m)
672 if (m->status & MCI_STATUS_UC) {
673 if (m->status & MCI_STATUS_PCC)
674 return "System Fatal error.";
675 if (m->mcgstatus & MCG_STATUS_RIPV)
676 return "Uncorrected, software restartable error.";
677 return "Uncorrected, software containable error.";
680 if (m->status & MCI_STATUS_DEFERRED)
681 return "Deferred error.";
683 return "Corrected error, no action required.";
686 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
688 struct mce *m = (struct mce *)data;
689 struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
690 int ecc;
692 if (amd_filter_mce(m))
693 return NOTIFY_STOP;
695 switch (m->bank) {
696 case 0:
697 decode_mc0_mce(m);
698 break;
700 case 1:
701 decode_mc1_mce(m);
702 break;
704 case 2:
705 if (c->x86 == 0x15)
706 decode_f15_mc2_mce(m);
707 else
708 decode_mc2_mce(m);
709 break;
711 case 3:
712 decode_mc3_mce(m);
713 break;
715 case 4:
716 decode_mc4_mce(m);
717 break;
719 case 5:
720 decode_mc5_mce(m);
721 break;
723 case 6:
724 decode_mc6_mce(m);
725 break;
727 default:
728 break;
731 pr_emerg(HW_ERR "Error Status: %s\n", decode_error_status(m));
733 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
734 m->extcpu,
735 c->x86, c->x86_model, c->x86_mask,
736 m->bank,
737 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
738 ((m->status & MCI_STATUS_UC) ? "UE" : "CE"),
739 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
740 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
741 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
743 if (c->x86 == 0x15)
744 pr_cont("|%s|%s",
745 ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
746 ((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));
748 /* do the two bits[14:13] together */
749 ecc = (m->status >> 45) & 0x3;
750 if (ecc)
751 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
753 pr_cont("]: 0x%016llx\n", m->status);
755 if (m->status & MCI_STATUS_ADDRV)
756 pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr);
758 amd_decode_err_code(m->status & 0xffff);
760 return NOTIFY_STOP;
762 EXPORT_SYMBOL_GPL(amd_decode_mce);
764 static struct notifier_block amd_mce_dec_nb = {
765 .notifier_call = amd_decode_mce,
768 static int __init mce_amd_init(void)
770 struct cpuinfo_x86 *c = &boot_cpu_data;
772 if (c->x86_vendor != X86_VENDOR_AMD)
773 return 0;
775 if (c->x86 < 0xf || c->x86 > 0x15)
776 return 0;
778 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
779 if (!fam_ops)
780 return -ENOMEM;
782 switch (c->x86) {
783 case 0xf:
784 fam_ops->mc0_mce = k8_mc0_mce;
785 fam_ops->mc1_mce = k8_mc1_mce;
786 break;
788 case 0x10:
789 fam_ops->mc0_mce = f10h_mc0_mce;
790 fam_ops->mc1_mce = k8_mc1_mce;
791 break;
793 case 0x11:
794 fam_ops->mc0_mce = k8_mc0_mce;
795 fam_ops->mc1_mce = k8_mc1_mce;
796 break;
798 case 0x12:
799 fam_ops->mc0_mce = f12h_mc0_mce;
800 fam_ops->mc1_mce = k8_mc1_mce;
801 break;
803 case 0x14:
804 nb_err_cpumask = 0x3;
805 fam_ops->mc0_mce = f14h_mc0_mce;
806 fam_ops->mc1_mce = f14h_mc1_mce;
807 break;
809 case 0x15:
810 xec_mask = 0x1f;
811 fam_ops->mc0_mce = f15h_mc0_mce;
812 fam_ops->mc1_mce = f15h_mc1_mce;
813 break;
815 default:
816 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
817 kfree(fam_ops);
818 return -EINVAL;
821 pr_info("MCE: In-kernel MCE decoding enabled.\n");
823 mce_register_decode_chain(&amd_mce_dec_nb);
825 return 0;
827 early_initcall(mce_amd_init);
#ifdef MODULE
/*
 * Module teardown: detach from the MCE decode chain first so that no
 * further callbacks are dispatched through it, then release the
 * per-family ops allocated by mce_amd_init().
 */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}
module_exit(mce_amd_exit);

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
#endif