1 #include <linux/module.h>
2 #include <linux/slab.h>
6 static struct amd_decoder_ops
*fam_ops
;
8 static u8 xec_mask
= 0xf;
9 static u8 nb_err_cpumask
= 0xf;
11 static bool report_gart_errors
;
12 static void (*nb_bus_decoder
)(int node_id
, struct mce
*m
);
14 void amd_report_gart_errors(bool v
)
16 report_gart_errors
= v
;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors
);
20 void amd_register_ecc_decoder(void (*f
)(int, struct mce
*))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder
);
26 void amd_unregister_ecc_decoder(void (*f
)(int, struct mce
*))
29 WARN_ON(nb_bus_decoder
!= f
);
31 nb_bus_decoder
= NULL
;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder
);
37 * string representation for the different MCA reported error types, see F3x48
/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
/* memory transaction type */
static const char * const rrrr_msgs[] = {
       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};
/* participating processor */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);
/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };
/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
65 static const char * const f15h_mc1_mce_desc
[] = {
66 "UC during a demand linefill from L2",
67 "Parity error during data load from IC",
68 "Parity error for IC valid bit",
69 "Main tag parity error",
70 "Parity error in prediction queue",
71 "PFB data/address parity error",
72 "Parity error in the branch status reg",
73 "PFB promotion address error",
74 "Tag error during probe/victimization",
75 "Parity error for IC probe tag valid bit",
76 "PFB non-cacheable bit parity error",
77 "PFB valid bit parity error", /* xec = 0xd */
78 "Microcode Patch Buffer", /* xec = 0x10 */
85 static const char * const f15h_mc2_mce_desc
[] = {
86 "Fill ECC error on data fills", /* xec = 0x4 */
87 "Fill parity error on insn fills",
88 "Prefetcher request FIFO parity error",
89 "PRQ address parity error",
90 "PRQ data parity error",
93 "WCB Data parity error",
94 "VB Data ECC or parity error",
95 "L2 Tag ECC error", /* xec = 0x10 */
96 "Hard L2 Tag ECC error",
97 "Multiple hits on L2 tag",
99 "PRB address parity error"
102 static const char * const mc4_mce_desc
[] = {
103 "DRAM ECC error detected on the NB",
104 "CRC error detected on HT link",
105 "Link-defined sync error packets detected on HT link",
108 "Invalid GART PTE entry during GART table walk",
109 "Unsupported atomic RMW received from an IO link",
110 "Watchdog timeout due to lack of progress",
111 "DRAM ECC error detected on the NB",
112 "SVM DMA Exclusion Vector error",
113 "HT data error detected on link",
114 "Protocol error (link, L3, probe filter)",
115 "NB internal arrays parity error",
116 "DRAM addr/ctl signals parity error",
117 "IO link transmission error",
118 "L3 data cache ECC error", /* xec = 0x1c */
119 "L3 cache tag error",
120 "L3 LRU parity bits error",
121 "ECC Error in the Probe Filter directory"
124 static const char * const mc5_mce_desc
[] = {
125 "CPU Watchdog timer expire",
126 "Wakeup array dest tag",
130 "Retire dispatch queue",
131 "Mapper checkpoint array",
132 "Physical register file EX0 port",
133 "Physical register file EX1 port",
134 "Physical register file AG0 port",
135 "Physical register file AG1 port",
136 "Flag register file",
140 static bool f12h_mc0_mce(u16 ec
, u8 xec
)
149 pr_cont("during L1 linefill from L2.\n");
150 else if (ll
== LL_L1
)
151 pr_cont("Data/Tag %s error.\n", R4_MSG(ec
));
158 static bool f10h_mc0_mce(u16 ec
, u8 xec
)
160 if (R4(ec
) == R4_GEN
&& LL(ec
) == LL_L1
) {
161 pr_cont("during data scrub.\n");
164 return f12h_mc0_mce(ec
, xec
);
167 static bool k8_mc0_mce(u16 ec
, u8 xec
)
170 pr_cont("during system linefill.\n");
174 return f10h_mc0_mce(ec
, xec
);
177 static bool cat_mc0_mce(u16 ec
, u8 xec
)
184 if (TT(ec
) != TT_DATA
|| LL(ec
) != LL_L1
)
190 pr_cont("Data/Tag parity error due to %s.\n",
191 (r4
== R4_DRD
? "load/hw prf" : "store"));
194 pr_cont("Copyback parity error on a tag miss.\n");
197 pr_cont("Tag parity error during snoop.\n");
202 } else if (BUS_ERROR(ec
)) {
204 if ((II(ec
) != II_MEM
&& II(ec
) != II_IO
) || LL(ec
) != LL_LG
)
207 pr_cont("System read data error on a ");
211 pr_cont("TLB reload.\n");
229 static bool f15h_mc0_mce(u16 ec
, u8 xec
)
237 pr_cont("Data Array access error.\n");
241 pr_cont("UC error during a linefill from L2/NB.\n");
246 pr_cont("STQ access error.\n");
250 pr_cont("SCB access error.\n");
254 pr_cont("Tag error.\n");
258 pr_cont("LDQ access error.\n");
264 } else if (BUS_ERROR(ec
)) {
267 pr_cont("System Read Data Error.\n");
269 pr_cont(" Internal error condition type %d.\n", xec
);
276 static void decode_mc0_mce(struct mce
*m
)
278 u16 ec
= EC(m
->status
);
279 u8 xec
= XEC(m
->status
, xec_mask
);
281 pr_emerg(HW_ERR
"MC0 Error: ");
283 /* TLB error signatures are the same across families */
285 if (TT(ec
) == TT_DATA
) {
286 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
287 ((xec
== 2) ? "locked miss"
288 : (xec
? "multimatch" : "parity")));
291 } else if (fam_ops
->mc0_mce(ec
, xec
))
294 pr_emerg(HW_ERR
"Corrupted MC0 MCE info?\n");
297 static bool k8_mc1_mce(u16 ec
, u8 xec
)
306 pr_cont("during a linefill from L2.\n");
307 else if (ll
== 0x1) {
310 pr_cont("Parity error during data load.\n");
314 pr_cont("Copyback Parity/Victim error.\n");
318 pr_cont("Tag Snoop error.\n");
331 static bool cat_mc1_mce(u16 ec
, u8 xec
)
339 if (TT(ec
) != TT_INSTR
)
343 pr_cont("Data/tag array parity error for a tag hit.\n");
344 else if (r4
== R4_SNOOP
)
345 pr_cont("Tag error during snoop/victimization.\n");
347 pr_cont("Tag parity error from victim castout.\n");
349 pr_cont("Microcode patch RAM parity error.\n");
356 static bool f15h_mc1_mce(u16 ec
, u8 xec
)
365 pr_cont("%s.\n", f15h_mc1_mce_desc
[xec
]);
369 pr_cont("%s.\n", f15h_mc1_mce_desc
[xec
-2]);
373 pr_cont("%s.\n", f15h_mc1_mce_desc
[xec
-4]);
377 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc
[xec
-4]);
386 static void decode_mc1_mce(struct mce
*m
)
388 u16 ec
= EC(m
->status
);
389 u8 xec
= XEC(m
->status
, xec_mask
);
391 pr_emerg(HW_ERR
"MC1 Error: ");
394 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
395 (xec
? "multimatch" : "parity error"));
396 else if (BUS_ERROR(ec
)) {
397 bool k8
= (boot_cpu_data
.x86
== 0xf && (m
->status
& BIT_64(58)));
399 pr_cont("during %s.\n", (k8
? "system linefill" : "NB data read"));
400 } else if (fam_ops
->mc1_mce(ec
, xec
))
403 pr_emerg(HW_ERR
"Corrupted MC1 MCE info?\n");
406 static bool k8_mc2_mce(u16 ec
, u8 xec
)
411 pr_cont(" in the write data buffers.\n");
413 pr_cont(" in the victim data buffers.\n");
414 else if (xec
== 0x2 && MEM_ERROR(ec
))
415 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec
));
416 else if (xec
== 0x0) {
418 pr_cont(": %s error in a Page Descriptor Cache or "
419 "Guest TLB.\n", TT_MSG(ec
));
420 else if (BUS_ERROR(ec
))
421 pr_cont(": %s/ECC error in data read from NB: %s.\n",
422 R4_MSG(ec
), PP_MSG(ec
));
423 else if (MEM_ERROR(ec
)) {
427 pr_cont(": %s error during data copyback.\n",
430 pr_cont(": %s parity/ECC error during data "
431 "access from L2.\n", R4_MSG(ec
));
442 static bool f15h_mc2_mce(u16 ec
, u8 xec
)
448 pr_cont("Data parity TLB read error.\n");
450 pr_cont("Poison data provided for TLB fill.\n");
453 } else if (BUS_ERROR(ec
)) {
457 pr_cont("Error during attempted NB data read.\n");
458 } else if (MEM_ERROR(ec
)) {
461 pr_cont("%s.\n", f15h_mc2_mce_desc
[xec
- 0x4]);
465 pr_cont("%s.\n", f15h_mc2_mce_desc
[xec
- 0x7]);
476 static bool f16h_mc2_mce(u16 ec
, u8 xec
)
485 pr_cont("%cBUFF parity error.\n", (r4
== R4_RD
) ? 'I' : 'O');
490 pr_cont("ECC error in L2 tag (%s).\n",
491 ((r4
== R4_GEN
) ? "BankReq" :
492 ((r4
== R4_SNOOP
) ? "Prb" : "Fill")));
497 pr_cont("ECC error in L2 data array (%s).\n",
498 (((r4
== R4_RD
) && !(xec
& 0x3)) ? "Hit" :
499 ((r4
== R4_GEN
) ? "Attr" :
500 ((r4
== R4_EVICT
) ? "Vict" : "Fill"))));
505 pr_cont("Parity error in L2 attribute bits (%s).\n",
506 ((r4
== R4_RD
) ? "Hit" :
507 ((r4
== R4_GEN
) ? "Attr" : "Fill")));
517 static void decode_mc2_mce(struct mce
*m
)
519 u16 ec
= EC(m
->status
);
520 u8 xec
= XEC(m
->status
, xec_mask
);
522 pr_emerg(HW_ERR
"MC2 Error: ");
524 if (!fam_ops
->mc2_mce(ec
, xec
))
525 pr_cont(HW_ERR
"Corrupted MC2 MCE info?\n");
528 static void decode_mc3_mce(struct mce
*m
)
530 u16 ec
= EC(m
->status
);
531 u8 xec
= XEC(m
->status
, xec_mask
);
533 if (boot_cpu_data
.x86
>= 0x14) {
534 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
535 " please report on LKML.\n");
539 pr_emerg(HW_ERR
"MC3 Error");
544 if (!BUS_ERROR(ec
) || (r4
!= R4_DRD
&& r4
!= R4_DWR
))
547 pr_cont(" during %s.\n", R4_MSG(ec
));
554 pr_emerg(HW_ERR
"Corrupted MC3 MCE info?\n");
557 static void decode_mc4_mce(struct mce
*m
)
559 struct cpuinfo_x86
*c
= &boot_cpu_data
;
560 int node_id
= amd_get_nb_id(m
->extcpu
);
561 u16 ec
= EC(m
->status
);
562 u8 xec
= XEC(m
->status
, 0x1f);
565 pr_emerg(HW_ERR
"MC4 Error (node %d): ", node_id
);
570 /* special handling for DRAM ECCs */
571 if (xec
== 0x0 || xec
== 0x8) {
572 /* no ECCs on F11h */
576 pr_cont("%s.\n", mc4_mce_desc
[xec
]);
579 nb_bus_decoder(node_id
, m
);
586 pr_cont("GART Table Walk data error.\n");
587 else if (BUS_ERROR(ec
))
588 pr_cont("DMA Exclusion Vector Table Walk error.\n");
594 if (boot_cpu_data
.x86
== 0x15 || boot_cpu_data
.x86
== 0x16)
595 pr_cont("Compute Unit Data Error.\n");
608 pr_cont("%s.\n", mc4_mce_desc
[xec
- offset
]);
612 pr_emerg(HW_ERR
"Corrupted MC4 MCE info?\n");
615 static void decode_mc5_mce(struct mce
*m
)
617 struct cpuinfo_x86
*c
= &boot_cpu_data
;
618 u8 xec
= XEC(m
->status
, xec_mask
);
620 if (c
->x86
== 0xf || c
->x86
== 0x11)
623 pr_emerg(HW_ERR
"MC5 Error: ");
625 if (xec
== 0x0 || xec
== 0xc)
626 pr_cont("%s.\n", mc5_mce_desc
[xec
]);
628 pr_cont("%s parity error.\n", mc5_mce_desc
[xec
]);
635 pr_emerg(HW_ERR
"Corrupted MC5 MCE info?\n");
638 static void decode_mc6_mce(struct mce
*m
)
640 u8 xec
= XEC(m
->status
, xec_mask
);
642 pr_emerg(HW_ERR
"MC6 Error: ");
646 pr_cont("Free List");
650 pr_cont("Physical Register File");
654 pr_cont("Retire Queue");
658 pr_cont("Scheduler table");
662 pr_cont("Status Register File");
670 pr_cont(" parity error.\n");
675 pr_emerg(HW_ERR
"Corrupted MC6 MCE info?\n");
678 static inline void amd_decode_err_code(u16 ec
)
681 pr_emerg(HW_ERR
"internal: %s\n", UU_MSG(ec
));
685 pr_emerg(HW_ERR
"cache level: %s", LL_MSG(ec
));
688 pr_cont(", mem/io: %s", II_MSG(ec
));
690 pr_cont(", tx: %s", TT_MSG(ec
));
692 if (MEM_ERROR(ec
) || BUS_ERROR(ec
)) {
693 pr_cont(", mem-tx: %s", R4_MSG(ec
));
696 pr_cont(", part-proc: %s (%s)", PP_MSG(ec
), TO_MSG(ec
));
703 * Filter out unwanted MCE signatures here.
705 static bool amd_filter_mce(struct mce
*m
)
707 u8 xec
= (m
->status
>> 16) & 0x1f;
710 * NB GART TLB error reporting is disabled by default.
712 if (m
->bank
== 4 && xec
== 0x5 && !report_gart_errors
)
718 static const char *decode_error_status(struct mce
*m
)
720 if (m
->status
& MCI_STATUS_UC
) {
721 if (m
->status
& MCI_STATUS_PCC
)
722 return "System Fatal error.";
723 if (m
->mcgstatus
& MCG_STATUS_RIPV
)
724 return "Uncorrected, software restartable error.";
725 return "Uncorrected, software containable error.";
728 if (m
->status
& MCI_STATUS_DEFERRED
)
729 return "Deferred error.";
731 return "Corrected error, no action required.";
734 int amd_decode_mce(struct notifier_block
*nb
, unsigned long val
, void *data
)
736 struct mce
*m
= (struct mce
*)data
;
737 struct cpuinfo_x86
*c
= &cpu_data(m
->extcpu
);
740 if (amd_filter_mce(m
))
776 pr_emerg(HW_ERR
"Error Status: %s\n", decode_error_status(m
));
778 pr_emerg(HW_ERR
"CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
780 c
->x86
, c
->x86_model
, c
->x86_mask
,
782 ((m
->status
& MCI_STATUS_OVER
) ? "Over" : "-"),
783 ((m
->status
& MCI_STATUS_UC
) ? "UE" : "CE"),
784 ((m
->status
& MCI_STATUS_MISCV
) ? "MiscV" : "-"),
785 ((m
->status
& MCI_STATUS_PCC
) ? "PCC" : "-"),
786 ((m
->status
& MCI_STATUS_ADDRV
) ? "AddrV" : "-"));
788 if (c
->x86
== 0x15 || c
->x86
== 0x16)
790 ((m
->status
& MCI_STATUS_DEFERRED
) ? "Deferred" : "-"),
791 ((m
->status
& MCI_STATUS_POISON
) ? "Poison" : "-"));
793 /* do the two bits[14:13] together */
794 ecc
= (m
->status
>> 45) & 0x3;
796 pr_cont("|%sECC", ((ecc
== 2) ? "C" : "U"));
798 pr_cont("]: 0x%016llx\n", m
->status
);
800 if (m
->status
& MCI_STATUS_ADDRV
)
801 pr_emerg(HW_ERR
"MC%d_ADDR: 0x%016llx\n", m
->bank
, m
->addr
);
803 amd_decode_err_code(m
->status
& 0xffff);
807 EXPORT_SYMBOL_GPL(amd_decode_mce
);
809 static struct notifier_block amd_mce_dec_nb
= {
810 .notifier_call
= amd_decode_mce
,
813 static int __init
mce_amd_init(void)
815 struct cpuinfo_x86
*c
= &boot_cpu_data
;
817 if (c
->x86_vendor
!= X86_VENDOR_AMD
)
820 if (c
->x86
< 0xf || c
->x86
> 0x16)
823 fam_ops
= kzalloc(sizeof(struct amd_decoder_ops
), GFP_KERNEL
);
829 fam_ops
->mc0_mce
= k8_mc0_mce
;
830 fam_ops
->mc1_mce
= k8_mc1_mce
;
831 fam_ops
->mc2_mce
= k8_mc2_mce
;
835 fam_ops
->mc0_mce
= f10h_mc0_mce
;
836 fam_ops
->mc1_mce
= k8_mc1_mce
;
837 fam_ops
->mc2_mce
= k8_mc2_mce
;
841 fam_ops
->mc0_mce
= k8_mc0_mce
;
842 fam_ops
->mc1_mce
= k8_mc1_mce
;
843 fam_ops
->mc2_mce
= k8_mc2_mce
;
847 fam_ops
->mc0_mce
= f12h_mc0_mce
;
848 fam_ops
->mc1_mce
= k8_mc1_mce
;
849 fam_ops
->mc2_mce
= k8_mc2_mce
;
853 nb_err_cpumask
= 0x3;
854 fam_ops
->mc0_mce
= cat_mc0_mce
;
855 fam_ops
->mc1_mce
= cat_mc1_mce
;
856 fam_ops
->mc2_mce
= k8_mc2_mce
;
861 fam_ops
->mc0_mce
= f15h_mc0_mce
;
862 fam_ops
->mc1_mce
= f15h_mc1_mce
;
863 fam_ops
->mc2_mce
= f15h_mc2_mce
;
868 fam_ops
->mc0_mce
= cat_mc0_mce
;
869 fam_ops
->mc1_mce
= cat_mc1_mce
;
870 fam_ops
->mc2_mce
= f16h_mc2_mce
;
874 printk(KERN_WARNING
"Huh? What family is it: 0x%x?!\n", c
->x86
);
879 pr_info("MCE: In-kernel MCE decoding enabled.\n");
881 mce_register_decode_chain(&amd_mce_dec_nb
);
885 early_initcall(mce_amd_init
);
888 static void __exit
mce_amd_exit(void)
890 mce_unregister_decode_chain(&amd_mce_dec_nb
);
894 MODULE_DESCRIPTION("AMD MCE decoder");
895 MODULE_ALIAS("edac-mce-amd");
896 MODULE_LICENSE("GPL");
897 module_exit(mce_amd_exit
);