1 #include <linux/module.h>
2 #include <linux/slab.h>
/*
 * Per-family decode callbacks. mce_amd_init() allocates this with kzalloc()
 * and fills in ->dc_mce, ->ic_mce and ->nb_mce based on boot_cpu_data.x86.
 */
6 static struct amd_decoder_ops
*fam_ops
;
/* Mask applied to (m->status >> 16) by the bank decoders to obtain xec. */
8 static u8 xec_mask
= 0xf;
/*
 * Mask applied to the upper 32 bits of the NB status in amd_decode_nb_mce()
 * to extract the core field; mce_amd_init() narrows it to 0x3 in one of its
 * family cases.
 */
9 static u8 nb_err_cpumask
= 0xf;
/*
 * Set through amd_report_gart_errors(); while false, amd_filter_mce()
 * matches bank 4 records whose xec is 0x5.
 */
11 static bool report_gart_errors
;
/*
 * Optional external NB decoder hook. amd_decode_nb_mce() calls it when it is
 * non-NULL; amd_unregister_ecc_decoder() resets it to NULL.
 */
12 static void (*nb_bus_decoder
)(int node_id
, struct mce
*m
, u32 nbcfg
);
/*
 * amd_report_gart_errors - set the report_gart_errors flag
 * @v: new value of the flag
 *
 * The flag is read by amd_filter_mce() for bank 4 records with xec 0x5.
 * Exported (GPL) for use by other modules.
 */
14 void amd_report_gart_errors(bool v
)
16 report_gart_errors
= v
;
18 EXPORT_SYMBOL_GPL(amd_report_gart_errors
);
/*
 * amd_register_ecc_decoder - exported (GPL) registration entry point
 * @f: callback with the same parameter list as nb_bus_decoder
 *
 * NOTE(review): the function body is not visible in this view; presumably it
 * stores @f in nb_bus_decoder - confirm against the full source.
 */
20 void amd_register_ecc_decoder(void (*f
)(int, struct mce
*, u32
))
24 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder
);
/*
 * amd_unregister_ecc_decoder - drop the NB decoder hook
 * @f: the callback the caller expects to be registered
 *
 * Emits a WARN_ON() when @f differs from the stored nb_bus_decoder and then
 * resets nb_bus_decoder to NULL. Exported (GPL).
 * NOTE(review): any guard around these two statements is not visible here.
 */
26 void amd_unregister_ecc_decoder(void (*f
)(int, struct mce
*, u32
))
29 WARN_ON(nb_bus_decoder
!= f
);
31 nb_bus_decoder
= NULL
;
34 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder
);
37 * string representation for the different MCA reported error types, see F3x48
41 /* transaction type names (4 entries); exported, presumably what TT_MSG() looks up - confirm */
42 const char *tt_msgs
[] = { "INSN", "DATA", "GEN", "RESV" };
43 EXPORT_SYMBOL_GPL(tt_msgs
);
/*
 * Cache level names (4 entries); exported. Presumably what LL_MSG() looks up,
 * which this file prints as "Cache Level: %s" - confirm the macro.
 */
46 const char *ll_msgs
[] = { "RESV", "L1", "L2", "L3/GEN" };
47 EXPORT_SYMBOL_GPL(ll_msgs
);
49 /* memory transaction type names; exported */
/*
 * NOTE(review): nine names are visible here, while this file extracts the
 * r4 field as (ec >> 4) & 0xf, i.e. up to 15. Confirm that the lookup macro
 * (RRRR_MSG(), defined outside this view) bounds the index.
 */
50 const char *rrrr_msgs
[] = {
51 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
53 EXPORT_SYMBOL_GPL(rrrr_msgs
);
55 /* participating processor names (4 entries); exported, presumably what PP_MSG() looks up - confirm */
56 const char *pp_msgs
[] = { "SRC", "RES", "OBS", "GEN" };
57 EXPORT_SYMBOL_GPL(pp_msgs
);
/*
 * Timeout state names (2 entries); exported. Presumably what TO_MSG() looks
 * up in amd_decode_err_code()'s bus-error line - confirm the macro.
 */
60 const char *to_msgs
[] = { "no timeout", "timed out" };
61 EXPORT_SYMBOL_GPL(to_msgs
);
/*
 * Memory-or-I/O names (4 entries); exported. Presumably what II_MSG() looks
 * up in amd_decode_err_code()'s bus-error line - confirm the macro.
 */
64 const char *ii_msgs
[] = { "MEM", "RESV", "IO", "GEN" };
65 EXPORT_SYMBOL_GPL(ii_msgs
);
67 static const char *f10h_nb_mce_desc
[] = {
69 "Protocol error (link, L3, probe filter, etc.)",
70 "Parity error in NB-internal arrays",
71 "Link Retry due to IO link transmission error",
72 "L3 ECC data cache error",
73 "ECC error in L3 cache tag",
74 "L3 LRU parity bits error",
75 "ECC Error in the Probe Filter directory"
/*
 * Instruction cache error descriptions printed by f15h_ic_mce(), which
 * indexes this table with xec, xec-2 or xec-4 depending on the code, so the
 * table is packed and the xec markers below note where the numbering jumps.
 */
78 static const char * const f15h_ic_mce_desc
[] = {
79 "UC during a demand linefill from L2",
80 "Parity error during data load from IC",
81 "Parity error for IC valid bit",
82 "Main tag parity error",
83 "Parity error in prediction queue",
84 "PFB data/address parity error",
85 "Parity error in the branch status reg",
86 "PFB promotion address error",
87 "Tag error during probe/victimization",
88 "Parity error for IC probe tag valid bit",
89 "PFB non-cacheable bit parity error",
90 "PFB valid bit parity error", /* xec = 0xd */
91 "patch RAM", /* xec = 0x10 */
98 static const char * const f15h_cu_mce_desc
[] = {
99 "Fill ECC error on data fills", /* xec = 0x4 */
100 "Fill parity error on insn fills",
101 "Prefetcher request FIFO parity error",
102 "PRQ address parity error",
103 "PRQ data parity error",
105 "WCC Data ECC error",
106 "WCB Data parity error",
108 "L2 Tag ECC error", /* xec = 0x10 */
109 "Hard L2 Tag ECC error",
110 "Multiple hits on L2 tag",
112 "PRB address parity error"
115 static const char * const fr_ex_mce_desc
[] = {
116 "CPU Watchdog timer expire",
117 "Wakeup array dest tag",
121 "Retire dispatch queue",
122 "Mapper checkpoint array",
123 "Physical register file EX0 port",
124 "Physical register file EX1 port",
125 "Physical register file AG0 port",
126 "Physical register file AG1 port",
127 "Flag register file",
128 "DE correctable error could not be corrected"
131 static bool f12h_dc_mce(u16 ec
, u8 xec
)
140 pr_cont("during L1 linefill from L2.\n");
141 else if (ll
== LL_L1
)
142 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec
));
149 static bool f10h_dc_mce(u16 ec
, u8 xec
)
151 u8 r4
= (ec
>> 4) & 0xf;
154 if (r4
== R4_GEN
&& ll
== LL_L1
) {
155 pr_cont("during data scrub.\n");
158 return f12h_dc_mce(ec
, xec
);
161 static bool k8_dc_mce(u16 ec
, u8 xec
)
164 pr_cont("during system linefill.\n");
168 return f10h_dc_mce(ec
, xec
);
171 static bool f14h_dc_mce(u16 ec
, u8 xec
)
173 u8 r4
= (ec
>> 4) & 0xf;
175 u8 tt
= (ec
>> 2) & 0x3;
181 if (tt
!= TT_DATA
|| ll
!= LL_L1
)
187 pr_cont("Data/Tag parity error due to %s.\n",
188 (r4
== R4_DRD
? "load/hw prf" : "store"));
191 pr_cont("Copyback parity error on a tag miss.\n");
194 pr_cont("Tag parity error during snoop.\n");
199 } else if (BUS_ERROR(ec
)) {
201 if ((ii
!= II_MEM
&& ii
!= II_IO
) || ll
!= LL_LG
)
204 pr_cont("System read data error on a ");
208 pr_cont("TLB reload.\n");
226 static bool f15h_dc_mce(u16 ec
, u8 xec
)
234 pr_cont("Data Array access error.\n");
238 pr_cont("UC error during a linefill from L2/NB.\n");
243 pr_cont("STQ access error.\n");
247 pr_cont("SCB access error.\n");
251 pr_cont("Tag error.\n");
255 pr_cont("LDQ access error.\n");
261 } else if (BUS_ERROR(ec
)) {
264 pr_cont("during system linefill.\n");
266 pr_cont(" Internal %s condition.\n",
267 ((xec
== 1) ? "livelock" : "deadlock"));
/*
 * amd_decode_dc_mce - print a decoded data cache MCE
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * Prints a "Data Cache Error: " header, decodes TLB signatures itself and
 * hands the remaining signatures to the family-specific ->dc_mce() callback.
 * Prints "Corrupted DC MCE info?" for signatures nothing recognised (the
 * exact condition leading there is not fully visible in this view).
 */
274 static void amd_decode_dc_mce(struct mce
*m
)
/* ec: low 16 bits of the status word */
276 u16 ec
= m
->status
& 0xffff;
/* xec: status bits from 16 upwards, limited by xec_mask */
277 u8 xec
= (m
->status
>> 16) & xec_mask
;
279 pr_emerg(HW_ERR
"Data Cache Error: ");
281 /* TLB error signatures are the same across families */
283 u8 tt
= (ec
>> 2) & 0x3;
/* xec 2 -> "locked miss", any other non-zero xec -> "multimatch", 0 -> "parity" */
286 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
287 ((xec
== 2) ? "locked miss"
288 : (xec
? "multimatch" : "parity")));
/* everything else goes through the per-family callback chosen in mce_amd_init() */
291 } else if (fam_ops
->dc_mce(ec
, xec
))
294 pr_emerg(HW_ERR
"Corrupted DC MCE info?\n");
297 static bool k8_ic_mce(u16 ec
, u8 xec
)
300 u8 r4
= (ec
>> 4) & 0xf;
307 pr_cont("during a linefill from L2.\n");
308 else if (ll
== 0x1) {
311 pr_cont("Parity error during data load.\n");
315 pr_cont("Copyback Parity/Victim error.\n");
319 pr_cont("Tag Snoop error.\n");
332 static bool f14h_ic_mce(u16 ec
, u8 xec
)
335 u8 tt
= (ec
>> 2) & 0x3;
336 u8 r4
= (ec
>> 4) & 0xf;
340 if (tt
!= 0 || ll
!= 1)
344 pr_cont("Data/tag array parity error for a tag hit.\n");
345 else if (r4
== R4_SNOOP
)
346 pr_cont("Tag error during snoop/victimization.\n");
353 static bool f15h_ic_mce(u16 ec
, u8 xec
)
362 pr_cont("%s.\n", f15h_ic_mce_desc
[xec
]);
366 pr_cont("%s.\n", f15h_ic_mce_desc
[xec
-2]);
370 pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc
[xec
-4]);
/*
 * amd_decode_ic_mce - print a decoded instruction cache MCE
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * Prints an "Instruction Cache Error: " header, then handles TLB and bus
 * error signatures directly and defers the rest to the family-specific
 * ->ic_mce() callback.
 */
379 static void amd_decode_ic_mce(struct mce
*m
)
/* ec: low 16 bits of the status word */
381 u16 ec
= m
->status
& 0xffff;
/* xec: status bits from 16 upwards, limited by xec_mask */
382 u8 xec
= (m
->status
>> 16) & xec_mask
;
384 pr_emerg(HW_ERR
"Instruction Cache Error: ");
/* TLB case: non-zero xec -> "multimatch", zero -> "parity error" */
387 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
388 (xec
? "multimatch" : "parity error"));
389 else if (BUS_ERROR(ec
)) {
/* k8 is true only on family 0xf with status bit 58 set; it picks the message wording */
390 bool k8
= (boot_cpu_data
.x86
== 0xf && (m
->status
& BIT_64(58)));
392 pr_cont("during %s.\n", (k8
? "system linefill" : "NB data read"));
/* remaining signatures: per-family callback chosen in mce_amd_init() */
393 } else if (fam_ops
->ic_mce(ec
, xec
))
396 pr_emerg(HW_ERR
"Corrupted IC MCE info?\n");
399 static void amd_decode_bu_mce(struct mce
*m
)
401 u32 ec
= m
->status
& 0xffff;
402 u32 xec
= (m
->status
>> 16) & xec_mask
;
404 pr_emerg(HW_ERR
"Bus Unit Error");
407 pr_cont(" in the write data buffers.\n");
409 pr_cont(" in the victim data buffers.\n");
410 else if (xec
== 0x2 && MEM_ERROR(ec
))
411 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec
));
412 else if (xec
== 0x0) {
414 pr_cont(": %s error in a Page Descriptor Cache or "
415 "Guest TLB.\n", TT_MSG(ec
));
416 else if (BUS_ERROR(ec
))
417 pr_cont(": %s/ECC error in data read from NB: %s.\n",
418 RRRR_MSG(ec
), PP_MSG(ec
));
419 else if (MEM_ERROR(ec
)) {
420 u8 rrrr
= (ec
>> 4) & 0xf;
423 pr_cont(": %s error during data copyback.\n",
425 else if (rrrr
<= 0x1)
426 pr_cont(": %s parity/ECC error during data "
427 "access from L2.\n", RRRR_MSG(ec
));
438 pr_emerg(HW_ERR
"Corrupted BU MCE info?\n");
441 static void amd_decode_cu_mce(struct mce
*m
)
443 u16 ec
= m
->status
& 0xffff;
444 u8 xec
= (m
->status
>> 16) & xec_mask
;
446 pr_emerg(HW_ERR
"Combined Unit Error: ");
450 pr_cont("Data parity TLB read error.\n");
452 pr_cont("Poison data provided for TLB fill.\n");
455 } else if (BUS_ERROR(ec
)) {
459 pr_cont("Error during attempted NB data read.\n");
460 } else if (MEM_ERROR(ec
)) {
463 pr_cont("%s.\n", f15h_cu_mce_desc
[xec
- 0x4]);
467 pr_cont("%s.\n", f15h_cu_mce_desc
[xec
- 0x7]);
478 pr_emerg(HW_ERR
"Corrupted CU MCE info?\n");
/*
 * amd_decode_ls_mce - print a decoded load/store unit MCE
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * On families >= 0x14 an LS MCE is unexpected and a "please report" message
 * is printed instead. Otherwise prints "Load Store Error" and, for the
 * accepted signatures, the transaction it happened during.
 */
481 static void amd_decode_ls_mce(struct mce
*m
)
/* ec: low 16 bits of the status word */
483 u16 ec
= m
->status
& 0xffff;
/* xec: status bits from 16 upwards, limited by xec_mask */
484 u8 xec
= (m
->status
>> 16) & xec_mask
;
486 if (boot_cpu_data
.x86
>= 0x14) {
/*
 * NOTE(review): unlike the other pr_emerg() calls visible in this file, this
 * one does not carry the HW_ERR prefix - confirm whether that is intended.
 */
487 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
488 " please report on LKML.\n");
492 pr_emerg(HW_ERR
"Load Store Error");
/* r4: 4-bit field at bits 7:4 of ec */
495 u8 r4
= (ec
>> 4) & 0xf;
/* condition is true for anything that is not a bus error with r4 of R4_DRD or R4_DWR */
497 if (!BUS_ERROR(ec
) || (r4
!= R4_DRD
&& r4
!= R4_DWR
))
500 pr_cont(" during %s.\n", RRRR_MSG(ec
));
507 pr_emerg(HW_ERR
"Corrupted LS MCE info?\n");
510 static bool k8_nb_mce(u16 ec
, u8 xec
)
516 pr_cont("CRC error detected on HT link.\n");
520 pr_cont("Invalid GART PTE entry during GART table walk.\n");
524 pr_cont("Unsupported atomic RMW received from an IO link.\n");
529 if (boot_cpu_data
.x86
== 0x11)
532 pr_cont("DRAM ECC error detected on the NB.\n");
536 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
547 static bool f10h_nb_mce(u16 ec
, u8 xec
)
552 if (k8_nb_mce(ec
, xec
))
566 pr_cont("GART Table Walk data error.\n");
567 else if (BUS_ERROR(ec
))
568 pr_cont("DMA Exclusion Vector Table Walk error.\n");
576 if (boot_cpu_data
.x86
== 0x15)
577 pr_cont("Compute Unit Data Error.\n");
595 pr_cont("%s.\n", f10h_nb_mce_desc
[xec
- offset
]);
601 static bool nb_noop_mce(u16 ec
, u8 xec
)
/*
 * amd_decode_nb_mce - print a decoded northbridge MCE
 * @node_id: node number printed in the header and passed to nb_bus_decoder
 * @m:       the MCE record; only m->status is read in the visible lines
 * @nbcfg:   not interpreted here; forwarded unchanged to nb_bus_decoder
 *
 * Prints the "Northbridge Error" header and the associated core, decodes a
 * set of signatures itself, defers the rest to the family-specific
 * ->nb_mce() callback and finally gives a registered nb_bus_decoder a chance
 * to decode further. Exported (GPL).
 */
606 void amd_decode_nb_mce(int node_id
, struct mce
*m
, u32 nbcfg
)
/* NOTE(review): fixed 5-bit mask here, not xec_mask as in the other bank decoders */
608 u8 xec
= (m
->status
>> 16) & 0x1f;
/* ec: low 16 bits of the status word */
609 u16 ec
= m
->status
& 0xffff;
/* nbsh: upper 32 bits of the status word */
610 u32 nbsh
= (u32
)(m
->status
>> 32);
612 pr_emerg(HW_ERR
"Northbridge Error, node %d: ", node_id
);
615 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
616 * value encoding has changed so interpret those differently
618 if ((boot_cpu_data
.x86
== 0x10) &&
619 (boot_cpu_data
.x86_model
> 7)) {
620 if (nbsh
& K8_NBSH_ERR_CPU_VAL
)
621 pr_cont(", core: %u", (u8
)(nbsh
& nb_err_cpumask
));
/* here the masked field is reduced to the index of its highest set bit (fls() - 1) */
623 u8 assoc_cpus
= nbsh
& nb_err_cpumask
;
626 pr_cont(", core: %d", fls(assoc_cpus
) - 1);
631 pr_cont("Sync error (sync packets on HT link detected).\n");
635 pr_cont("HT Master abort.\n");
639 pr_cont("HT Target abort.\n");
643 pr_cont("NB Watchdog timeout.\n");
647 pr_cont("SVM DMA Exclusion Vector error.\n");
/* signatures not handled above go to the per-family callback chosen in mce_amd_init() */
654 if (!fam_ops
->nb_mce(ec
, xec
))
/* families 0xf and 0x10 only: xec 0x8 and 0x0 are also passed to a registered nb_bus_decoder */
657 if (boot_cpu_data
.x86
== 0xf || boot_cpu_data
.x86
== 0x10)
658 if ((xec
== 0x8 || xec
== 0x0) && nb_bus_decoder
)
659 nb_bus_decoder(node_id
, m
, nbcfg
);
664 pr_emerg(HW_ERR
"Corrupted NB MCE info?\n");
666 EXPORT_SYMBOL_GPL(amd_decode_nb_mce
);
/*
 * amd_decode_fr_mce - print a decoded FIROB / Execution Unit MCE
 * @m: the MCE record; only m->status is read in the visible lines
 *
 * The header names the bank "Execution Unit" on family 0x15 and "FIROB" on
 * every other family. The description text comes from fr_ex_mce_desc[xec].
 */
668 static void amd_decode_fr_mce(struct mce
*m
)
670 struct cpuinfo_x86
*c
= &boot_cpu_data
;
/* xec: status bits from 16 upwards, limited by xec_mask */
671 u8 xec
= (m
->status
>> 16) & xec_mask
;
/* families 0xf and 0x11 are singled out first (the guarded statement is not visible here) */
673 if (c
->x86
== 0xf || c
->x86
== 0x11)
/* outside family 0x15 only xec 0x0 passes this check */
676 if (c
->x86
!= 0x15 && xec
!= 0x0)
679 pr_emerg(HW_ERR
"%s Error: ",
680 (c
->x86
== 0x15 ? "Execution Unit" : "FIROB"));
/*
 * xec 0x0 and 0xc print the description as-is; the other visible branch
 * appends " parity error".
 * NOTE(review): the table is indexed with xec directly; confirm that the
 * range checks (not visible in this view) keep it inside fr_ex_mce_desc.
 */
682 if (xec
== 0x0 || xec
== 0xc)
683 pr_cont("%s.\n", fr_ex_mce_desc
[xec
]);
685 pr_cont("%s parity error.\n", fr_ex_mce_desc
[xec
]);
692 pr_emerg(HW_ERR
"Corrupted FR MCE info?\n");
695 static void amd_decode_fp_mce(struct mce
*m
)
697 u8 xec
= (m
->status
>> 16) & xec_mask
;
699 pr_emerg(HW_ERR
"Floating Point Unit Error: ");
703 pr_cont("Free List");
707 pr_cont("Physical Register File");
711 pr_cont("Retire Queue");
715 pr_cont("Scheduler table");
719 pr_cont("Status Register File");
727 pr_cont(" parity error.\n");
732 pr_emerg(HW_ERR
"Corrupted FP MCE info?\n");
735 static inline void amd_decode_err_code(u16 ec
)
738 pr_emerg(HW_ERR
"Transaction: %s, Cache Level: %s\n",
739 TT_MSG(ec
), LL_MSG(ec
));
740 } else if (MEM_ERROR(ec
)) {
741 pr_emerg(HW_ERR
"Transaction: %s, Type: %s, Cache Level: %s\n",
742 RRRR_MSG(ec
), TT_MSG(ec
), LL_MSG(ec
));
743 } else if (BUS_ERROR(ec
)) {
744 pr_emerg(HW_ERR
"Transaction: %s (%s), %s, Cache Level: %s, "
745 "Participating Processor: %s\n",
746 RRRR_MSG(ec
), II_MSG(ec
), TO_MSG(ec
), LL_MSG(ec
),
749 pr_emerg(HW_ERR
"Huh? Unknown MCE error 0x%x\n", ec
);
753 * Filter out unwanted MCE signatures here.
/*
 * amd_filter_mce - test whether an MCE record matches a filtered signature
 * @m: the MCE record; m->bank and m->status are read
 *
 * The only signature checked in the visible lines is bank 4 with xec 0x5
 * while report_gart_errors is false. amd_decode_mce() tests the result
 * before printing anything.
 * NOTE(review): the return statements are not visible here; presumably a
 * match returns true - confirm.
 */
755 static bool amd_filter_mce(struct mce
*m
)
/* fixed 5-bit mask, same as amd_decode_nb_mce() uses for its xec */
757 u8 xec
= (m
->status
>> 16) & 0x1f;
760 * NB GART TLB error reporting is disabled by default.
762 if (m
->bank
== 4 && xec
== 0x5 && !report_gart_errors
)
/*
 * amd_decode_mce - notifier callback that prints a decoded MCE
 * @nb:   notifier block; not referenced in the visible lines
 * @val:  notifier value; not referenced in the visible lines
 * @data: pointer to the struct mce to decode
 *
 * Checks the record against amd_filter_mce(), prints the generic
 * "MC<bank>_STATUS" summary (corrected/uncorrected, overflow, context
 * corrupt, ECC kind), calls one of the per-bank decoders and finishes with
 * amd_decode_err_code() on the low 16 status bits. Exported (GPL) and
 * installed as amd_mce_dec_nb.notifier_call.
 * NOTE(review): the bank-to-decoder mapping and the return values sit on
 * lines that are not visible in this view.
 */
768 int amd_decode_mce(struct notifier_block
*nb
, unsigned long val
, void *data
)
770 struct mce
*m
= (struct mce
*)data
;
773 if (amd_filter_mce(m
))
776 pr_emerg(HW_ERR
"MC%d_STATUS: ", m
->bank
);
778 pr_cont("%sorrected error, other errors lost: %s, "
779 "CPU context corrupt: %s",
780 ((m
->status
& MCI_STATUS_UC
) ? "Unc" : "C"),
781 ((m
->status
& MCI_STATUS_OVER
) ? "yes" : "no"),
782 ((m
->status
& MCI_STATUS_PCC
) ? "yes" : "no"));
784 /* status bits 46:45, i.e. bits [14:13] of the upper 32-bit half, taken together */
785 ecc
= (m
->status
>> 45) & 0x3;
/* value 2 prints "CECC Error", any other value reaching this line prints "UECC Error" */
787 pr_cont(", %sECC Error", ((ecc
== 2) ? "C" : "U"));
793 amd_decode_dc_mce(m
);
797 amd_decode_ic_mce(m
);
/* family 0x15 has a combined unit decoder; the bus unit decoder is the other visible call */
801 if (boot_cpu_data
.x86
== 0x15)
802 amd_decode_cu_mce(m
);
804 amd_decode_bu_mce(m
);
808 amd_decode_ls_mce(m
);
/* NB decode: node id derived from the reporting CPU, nbcfg passed as 0 */
812 node
= amd_get_nb_id(m
->extcpu
);
813 amd_decode_nb_mce(node
, m
, 0);
817 amd_decode_fr_mce(m
);
821 amd_decode_fp_mce(m
);
828 amd_decode_err_code(m
->status
& 0xffff);
832 EXPORT_SYMBOL_GPL(amd_decode_mce
);
834 static struct notifier_block amd_mce_dec_nb
= {
835 .notifier_call
= amd_decode_mce
,
/*
 * mce_amd_init - set up in-kernel MCE decoding
 *
 * Checks the CPU vendor and family, allocates fam_ops, selects the
 * ->dc_mce/->ic_mce/->nb_mce callbacks in a switch on boot_cpu_data.x86 and
 * registers amd_mce_dec_nb on x86_mce_decoder_chain. Hooked up through
 * early_initcall().
 * NOTE(review): case labels, return statements and any allocation-failure
 * check sit on lines that are not visible in this view.
 */
838 static int __init
mce_amd_init(void)
840 if (boot_cpu_data
.x86_vendor
!= X86_VENDOR_AMD
)
/*
 * NOTE(review): this condition is also true for family 0x15 (0x15 > 0x12 and
 * 0x15 != 0x14). If the statement it guards is an early return, the f15h
 * callbacks assigned further down are never reached - confirm intent.
 */
843 if ((boot_cpu_data
.x86
< 0xf || boot_cpu_data
.x86
> 0x12) &&
844 (boot_cpu_data
.x86
!= 0x14 || boot_cpu_data
.x86_model
> 0xf))
/* zero-initialised ops table */
847 fam_ops
= kzalloc(sizeof(struct amd_decoder_ops
), GFP_KERNEL
);
851 switch (boot_cpu_data
.x86
) {
/* all-K8 callback set */
853 fam_ops
->dc_mce
= k8_dc_mce
;
854 fam_ops
->ic_mce
= k8_ic_mce
;
855 fam_ops
->nb_mce
= k8_nb_mce
;
/* f10h DC and NB decoders, K8 IC decoder */
859 fam_ops
->dc_mce
= f10h_dc_mce
;
860 fam_ops
->ic_mce
= k8_ic_mce
;
861 fam_ops
->nb_mce
= f10h_nb_mce
;
/* K8 DC and IC decoders, f10h NB decoder */
865 fam_ops
->dc_mce
= k8_dc_mce
;
866 fam_ops
->ic_mce
= k8_ic_mce
;
867 fam_ops
->nb_mce
= f10h_nb_mce
;
/* f12h DC decoder, K8 IC decoder, no NB decoding */
871 fam_ops
->dc_mce
= f12h_dc_mce
;
872 fam_ops
->ic_mce
= k8_ic_mce
;
873 fam_ops
->nb_mce
= nb_noop_mce
;
/* f14h decoders, no NB decoding; the core mask shrinks to two bits */
877 nb_err_cpumask
= 0x3;
878 fam_ops
->dc_mce
= f14h_dc_mce
;
879 fam_ops
->ic_mce
= f14h_ic_mce
;
880 fam_ops
->nb_mce
= nb_noop_mce
;
/* f15h DC and IC decoders, f10h NB decoder */
885 fam_ops
->dc_mce
= f15h_dc_mce
;
886 fam_ops
->ic_mce
= f15h_ic_mce
;
887 fam_ops
->nb_mce
= f10h_nb_mce
;
/* unknown-family warning */
891 printk(KERN_WARNING
"Huh? What family is that: %d?!\n",
897 pr_info("MCE: In-kernel MCE decoding enabled.\n");
/* from here on amd_decode_mce() is called through the decoder chain */
899 atomic_notifier_chain_register(&x86_mce_decoder_chain
, &amd_mce_dec_nb
);
903 early_initcall(mce_amd_init
);
/*
 * mce_amd_exit - module exit hook
 *
 * Removes amd_mce_dec_nb from x86_mce_decoder_chain, undoing the
 * registration done in mce_amd_init().
 * NOTE(review): no release of fam_ops is visible in this view; confirm it is
 * freed on a line not shown here.
 */
906 static void __exit
mce_amd_exit(void)
908 atomic_notifier_chain_unregister(&x86_mce_decoder_chain
, &amd_mce_dec_nb
);
912 MODULE_DESCRIPTION("AMD MCE decoder");
913 MODULE_ALIAS("edac-mce-amd");
914 MODULE_LICENSE("GPL");
915 module_exit(mce_amd_exit
);