1 #include <linux/module.h>
2 #include <linux/slab.h>
6 static struct amd_decoder_ops
*fam_ops
;
8 static u8 nb_err_cpumask
= 0xf;
10 static bool report_gart_errors
;
11 static void (*nb_bus_decoder
)(int node_id
, struct mce
*m
, u32 nbcfg
);
13 void amd_report_gart_errors(bool v
)
15 report_gart_errors
= v
;
17 EXPORT_SYMBOL_GPL(amd_report_gart_errors
);
/*
 * amd_register_ecc_decoder - exported hook that takes a decoder callback @f
 * with the signature (int, struct mce *, u32).
 *
 * NOTE(review): the function body is not visible in this extract. Presumably
 * it stores @f in nb_bus_decoder (the unregister counterpart resets that
 * pointer) - confirm against the complete source.
 */
19 void amd_register_ecc_decoder(void (*f
)(int, struct mce
*, u32
))
23 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder
);
/*
 * amd_unregister_ecc_decoder - drop the registered NB bus decoder callback.
 * @f: the callback the caller believes is currently registered.
 *
 * Triggers WARN_ON() when @f differs from the pointer held in nb_bus_decoder
 * and resets nb_bus_decoder to NULL.
 *
 * NOTE(review): lines of the body are missing from this extract, so any
 * condition guarding the two visible statements cannot be seen here.
 */
25 void amd_unregister_ecc_decoder(void (*f
)(int, struct mce
*, u32
))
28 WARN_ON(nb_bus_decoder
!= f
);
30 nb_bus_decoder
= NULL
;
33 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder
);
/*
 * String representations of the different MCA-reported error types; see F3x48.
 */
/* Transaction type names (presumably what TT_MSG() indexes - confirm). */
const char *tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};
EXPORT_SYMBOL_GPL(tt_msgs);
/* Cache level names (presumably what LL_MSG() indexes - confirm). */
const char *ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};
EXPORT_SYMBOL_GPL(ll_msgs);
/* Memory transaction type names (presumably what RRRR_MSG() indexes - confirm). */
const char *rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
EXPORT_SYMBOL_GPL(rrrr_msgs);
/* Participating processor names (presumably what PP_MSG() indexes - confirm). */
const char *pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
/* Request timeout state (presumably what TO_MSG() indexes - confirm). */
const char *to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};
EXPORT_SYMBOL_GPL(to_msgs);
/* Memory-or-IO transaction class (presumably what II_MSG() indexes - confirm). */
const char *ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Descriptions of F10h northbridge errors; f10h_nb_mce() prints the entry at
 * index (xec - offset).
 *
 * NOTE(review): the embedded original numbering jumps right after the opening
 * line, so at least one leading entry may be missing from this extract, and
 * the closing of the initializer is not visible either. Do not derive entry
 * indices from what is shown here.
 */
66 static const char *f10h_nb_mce_desc
[] = {
68 "Protocol error (link, L3, probe filter, etc.)",
69 "Parity error in NB-internal arrays",
70 "Link Retry due to IO link transmission error",
71 "L3 ECC data cache error",
72 "ECC error in L3 cache tag",
73 "L3 LRU parity bits error",
74 "ECC Error in the Probe Filter directory"
/*
 * f12h_dc_mce - data cache MCE decoder; installed as a ->dc_mce callback in
 * mce_amd_init() and used as the final fallback by f10h_dc_mce().
 * @ec: error code (amd_decode_dc_mce() passes the low 16 status bits)
 *
 * The visible branches finish the log line either with
 * "during L1 linefill from L2." or with "Data/Tag <RRRR_MSG> error.".
 *
 * NOTE(review): the conditions selecting those branches and the returned
 * values are not visible in this extract.
 */
77 static bool f12h_dc_mce(u16 ec
)
86 pr_cont("during L1 linefill from L2.\n");
88 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec
));
/*
 * f10h_dc_mce - data cache MCE decoder layered on top of f12h_dc_mce().
 * @ec: error code
 *
 * r4 is taken from bits [7:4] of @ec. When r4 is R4_GEN and ll is LL_L1 the
 * line is finished with "during data scrub."; the visible tail of the
 * function delegates to f12h_dc_mce().
 *
 * NOTE(review): the declaration of ll and the end of the if-branch are
 * missing from this extract.
 */
95 static bool f10h_dc_mce(u16 ec
)
97 u8 r4
= (ec
>> 4) & 0xf;
100 if (r4
== R4_GEN
&& ll
== LL_L1
) {
101 pr_cont("during data scrub.\n");
104 return f12h_dc_mce(ec
);
/*
 * k8_dc_mce - data cache MCE decoder layered on top of f10h_dc_mce().
 * @ec: error code
 *
 * One branch finishes the log line with "during system linefill."; the
 * visible tail delegates to f10h_dc_mce().
 *
 * NOTE(review): the condition guarding the message is not visible in this
 * extract.
 */
107 static bool k8_dc_mce(u16 ec
)
110 pr_cont("during system linefill.\n");
114 return f10h_dc_mce(ec
);
/*
 * f14h_dc_mce - data cache MCE decoder, installed together with f14h_ic_mce()
 * in mce_amd_init().
 * @ec: error code
 *
 * r4 is bits [7:4] and tt is bits [3:2] of @ec. Two checks are visible:
 *  - one compares tt and ll against TT_DATA and LL_L1, followed by parity
 *    error messages (load/hw prefetch vs. store, copyback, snoop);
 *  - in the BUS_ERROR() branch, one compares ii against II_MEM and II_IO and
 *    ll against LL_LG, followed by "System read data error on a ..." output.
 *
 * NOTE(review): the switch/branch structure, the declarations of ll and ii
 * and all return statements are missing from this extract.
 */
117 static bool f14h_dc_mce(u16 ec
)
119 u8 r4
= (ec
>> 4) & 0xf;
121 u8 tt
= (ec
>> 2) & 0x3;
127 if (tt
!= TT_DATA
|| ll
!= LL_L1
)
133 pr_cont("Data/Tag parity error due to %s.\n",
134 (r4
== R4_DRD
? "load/hw prf" : "store"));
137 pr_cont("Copyback parity error on a tag miss.\n");
140 pr_cont("Tag parity error during snoop.\n");
145 } else if (BUS_ERROR(ec
)) {
147 if ((ii
!= II_MEM
&& ii
!= II_IO
) || ll
!= LL_LG
)
150 pr_cont("System read data error on a ");
154 pr_cont("TLB reload.\n");
/*
 * amd_decode_dc_mce - print a decoded data cache machine check.
 * @m: MCE record; only m->status is read in the visible code.
 *
 * ec is status bits [15:0], xec is status bits [19:16]. The log line opens
 * with "Data Cache Error: ". TLB errors are handled here for all families
 * (cache level via LL_MSG(), "multimatch" when xec is non-zero, otherwise
 * "parity error"). Other errors go to the per-family fam_ops->dc_mce()
 * callback. "Corrupted DC MCE info?" is the fallback message.
 *
 * NOTE(review): the conditions joining these pieces are partly missing from
 * this extract.
 */
172 static void amd_decode_dc_mce(struct mce
*m
)
174 u16 ec
= m
->status
& 0xffff;
175 u8 xec
= (m
->status
>> 16) & 0xf;
177 pr_emerg(HW_ERR
"Data Cache Error: ");
179 /* TLB error signatures are the same across families */
181 u8 tt
= (ec
>> 2) & 0x3;
184 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
185 (xec
? "multimatch" : "parity error"));
192 if (!fam_ops
->dc_mce(ec
))
198 pr_emerg(HW_ERR
"Corrupted DC MCE info?\n");
/*
 * k8_ic_mce - instruction cache MCE decoder; installed as ->ic_mce for every
 * family case in mce_amd_init() except the one that uses f14h_ic_mce().
 * @ec: error code
 *
 * r4 is bits [7:4] of @ec. Visible outputs: "during a linefill from L2.",
 * and - within the branch where ll equals 0x1 - messages for a parity error
 * during data load, a copyback parity/victim error and a tag snoop error.
 *
 * NOTE(review): the declaration of ll, the selector between the three
 * ll == 0x1 messages and the return statements are not visible here.
 */
201 static bool k8_ic_mce(u16 ec
)
204 u8 r4
= (ec
>> 4) & 0xf;
211 pr_cont("during a linefill from L2.\n");
212 else if (ll
== 0x1) {
215 pr_cont("Parity error during data load.\n");
219 pr_cont("Copyback Parity/Victim error.\n");
223 pr_cont("Tag Snoop error.\n");
/*
 * f14h_ic_mce - instruction cache MCE decoder, installed together with
 * f14h_dc_mce() in mce_amd_init().
 * @ec: error code
 *
 * tt is bits [3:2] and r4 is bits [7:4] of @ec. A check of tt against 0 and
 * ll against 1 is visible; the messages distinguish a data/tag array parity
 * error on a tag hit from a tag error during snoop/victimization
 * (r4 == R4_SNOOP).
 *
 * NOTE(review): the declaration of ll, the first branch condition and the
 * return statements are missing from this extract.
 */
236 static bool f14h_ic_mce(u16 ec
)
239 u8 tt
= (ec
>> 2) & 0x3;
240 u8 r4
= (ec
>> 4) & 0xf;
244 if (tt
!= 0 || ll
!= 1)
248 pr_cont("Data/tag array parity error for a tag hit.\n");
249 else if (r4
== R4_SNOOP
)
250 pr_cont("Tag error during snoop/victimization.\n");
/*
 * amd_decode_ic_mce - print a decoded instruction cache machine check.
 * @m: MCE record; only m->status is read in the visible code.
 *
 * ec is status bits [15:0], xec is status bits [19:16]. The log line opens
 * with "Instruction Cache Error: ". Visible handling:
 *  - a TLB message (level via LL_MSG(), "multimatch" when xec is non-zero,
 *    otherwise "parity error");
 *  - for BUS_ERROR(): "system linefill" when the CPU family is 0xf and status
 *    bit 58 is set, otherwise "NB data read";
 *  - otherwise the per-family fam_ops->ic_mce() callback;
 *  - "Corrupted IC MCE info?" as the fallback.
 *
 * NOTE(review): the first branch condition and some connecting lines are
 * missing from this extract.
 */
257 static void amd_decode_ic_mce(struct mce
*m
)
259 u16 ec
= m
->status
& 0xffff;
260 u8 xec
= (m
->status
>> 16) & 0xf;
262 pr_emerg(HW_ERR
"Instruction Cache Error: ");
265 pr_cont("%s TLB %s.\n", LL_MSG(ec
),
266 (xec
? "multimatch" : "parity error"));
267 else if (BUS_ERROR(ec
)) {
268 bool k8
= (boot_cpu_data
.x86
== 0xf && (m
->status
& BIT_64(58)));
270 pr_cont("during %s.\n", (k8
? "system linefill" : "NB data read"));
271 } else if (fam_ops
->ic_mce(ec
))
274 pr_emerg(HW_ERR
"Corrupted IC MCE info?\n");
/*
 * amd_decode_bu_mce - print a decoded bus unit machine check.
 * @m: MCE record; only m->status is read in the visible code.
 *
 * ec is status bits [15:0], xec is status bits [19:16]. The log line opens
 * with "Bus Unit Error" and is completed according to xec and the error
 * class:
 *  - write / victim data buffer messages (conditions not visible);
 *  - xec == 0x2 with MEM_ERROR(): error in the L2 cache tags;
 *  - xec == 0x0: Page Descriptor Cache / Guest TLB error, data read from NB
 *    for BUS_ERROR(), or - for MEM_ERROR() - a message chosen by rrrr
 *    (bits [7:4] of ec), e.g. rrrr <= 0x1 gives "data access from L2".
 * "Corrupted BU MCE info?" is the fallback message.
 *
 * NOTE(review): several conditions and at least one pr_cont() argument line
 * are missing from this extract.
 */
277 static void amd_decode_bu_mce(struct mce
*m
)
279 u32 ec
= m
->status
& 0xffff;
280 u32 xec
= (m
->status
>> 16) & 0xf;
282 pr_emerg(HW_ERR
"Bus Unit Error");
285 pr_cont(" in the write data buffers.\n");
287 pr_cont(" in the victim data buffers.\n");
288 else if (xec
== 0x2 && MEM_ERROR(ec
))
289 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec
));
290 else if (xec
== 0x0) {
292 pr_cont(": %s error in a Page Descriptor Cache or "
293 "Guest TLB.\n", TT_MSG(ec
));
294 else if (BUS_ERROR(ec
))
295 pr_cont(": %s/ECC error in data read from NB: %s.\n",
296 RRRR_MSG(ec
), PP_MSG(ec
));
297 else if (MEM_ERROR(ec
)) {
298 u8 rrrr
= (ec
>> 4) & 0xf;
301 pr_cont(": %s error during data copyback.\n",
303 else if (rrrr
<= 0x1)
304 pr_cont(": %s parity/ECC error during data "
305 "access from L2.\n", RRRR_MSG(ec
));
316 pr_emerg(HW_ERR
"Corrupted BU MCE info?\n");
/*
 * amd_decode_ls_mce - print a decoded load/store unit machine check.
 * @m: MCE record; only m->status is read in the visible code.
 *
 * ec is status bits [15:0], xec is status bits [19:16]. On CPU family 0x14
 * an LS MCE is not expected, and a message asking for a report on LKML is
 * printed. Otherwise the line opens with "Load Store Error". r4 is bits
 * [7:4] of ec; the visible check tests for anything that is not a bus error
 * with r4 equal to R4_DRD or R4_DWR, and the " during <RRRR_MSG>." message
 * follows it. "Corrupted LS MCE info?" is the fallback message.
 *
 * NOTE(review): the use of xec, the family-0x14 exit and the jump to the
 * fallback are not visible in this extract.
 */
319 static void amd_decode_ls_mce(struct mce
*m
)
321 u16 ec
= m
->status
& 0xffff;
322 u8 xec
= (m
->status
>> 16) & 0xf;
324 if (boot_cpu_data
.x86
== 0x14) {
325 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
326 " please report on LKML.\n");
330 pr_emerg(HW_ERR
"Load Store Error");
333 u8 r4
= (ec
>> 4) & 0xf;
335 if (!BUS_ERROR(ec
) || (r4
!= R4_DRD
&& r4
!= R4_DWR
))
338 pr_cont(" during %s.\n", RRRR_MSG(ec
));
345 pr_emerg(HW_ERR
"Corrupted LS MCE info?\n");
/*
 * k8_nb_mce - northbridge MCE decoder; installed as ->nb_mce in
 * mce_amd_init() and tried first by f10h_nb_mce().
 * @ec:  error code
 * @xec: extended error code
 *
 * Visible messages: HT link CRC error, invalid GART PTE during a table walk,
 * unsupported atomic RMW from an IO link, DRAM ECC error, and a parity error
 * on the DRAM addr/ctl signals. A check for CPU family 0x11 precedes the
 * DRAM ECC message.
 *
 * NOTE(review): the selector mapping @xec to these messages and the return
 * statements are missing from this extract.
 */
348 static bool k8_nb_mce(u16 ec
, u8 xec
)
354 pr_cont("CRC error detected on HT link.\n");
358 pr_cont("Invalid GART PTE entry during GART table walk.\n");
362 pr_cont("Unsupported atomic RMW received from an IO link.\n");
367 if (boot_cpu_data
.x86
== 0x11)
370 pr_cont("DRAM ECC error detected on the NB.\n");
374 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
/*
 * f10h_nb_mce - northbridge MCE decoder extending k8_nb_mce().
 * @ec:  error code
 * @xec: extended error code
 *
 * Tries k8_nb_mce() first. The remaining visible output is a GART table walk
 * data error, a DMA Exclusion Vector table walk error (for BUS_ERROR()), and
 * a generic description looked up as f10h_nb_mce_desc[xec - offset].
 *
 * NOTE(review): the declaration and computation of offset, the selector on
 * @xec and the return statements are missing from this extract, so the
 * bounds of the table lookup cannot be checked from here.
 */
385 static bool f10h_nb_mce(u16 ec
, u8 xec
)
390 if (k8_nb_mce(ec
, xec
))
404 pr_cont("GART Table Walk data error.\n");
405 else if (BUS_ERROR(ec
))
406 pr_cont("DMA Exclusion Vector Table Walk error.\n");
424 pr_cont("%s.\n", f10h_nb_mce_desc
[xec
- offset
]);
/*
 * nb_noop_mce - ->nb_mce callback installed in mce_amd_init() for the family
 * cases that use f12h_dc_mce() and f14h_dc_mce().
 * @ec:  error code
 * @xec: extended error code
 *
 * NOTE(review): the body is not visible in this extract. The name suggests it
 * decodes nothing, but the returned value must be confirmed in the full
 * source.
 */
430 static bool nb_noop_mce(u16 ec
, u8 xec
)
/*
 * amd_decode_nb_mce - print a decoded northbridge machine check (exported).
 * @node_id: node number printed in the message and passed to nb_bus_decoder
 * @m:       MCE record; only m->status is read in the visible code
 * @nbcfg:   forwarded unchanged to nb_bus_decoder
 *
 * xec is status bits [20:16] (five bits, unlike the other bank decoders), ec
 * is status bits [15:0] and nbsh is the upper 32 status bits. After the
 * "Northbridge Error, node N: " prefix the reporting core is printed:
 *  - family 0x10 with model > 7: nbsh & nb_err_cpumask, only when
 *    K8_NBSH_ERR_CPU_VAL is set in nbsh;
 *  - the other visible variant: fls(nbsh & nb_err_cpumask) - 1.
 * Several fixed messages follow (sync error, HT master/target abort, NB
 * watchdog timeout, SVM DMA exclusion vector error); anything else goes to
 * fam_ops->nb_mce(). On families 0xf and 0x10, xec 0x8 or 0x0 is also handed
 * to nb_bus_decoder when one is registered. "Corrupted NB MCE info?" is the
 * fallback message.
 *
 * NOTE(review): the selector on xec and several connecting lines are missing
 * from this extract.
 */
435 void amd_decode_nb_mce(int node_id
, struct mce
*m
, u32 nbcfg
)
437 u8 xec
= (m
->status
>> 16) & 0x1f;
438 u16 ec
= m
->status
& 0xffff;
439 u32 nbsh
= (u32
)(m
->status
>> 32);
441 pr_emerg(HW_ERR
"Northbridge Error, node %d: ", node_id
);
444 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
445 * value encoding has changed so interpret those differently
447 if ((boot_cpu_data
.x86
== 0x10) &&
448 (boot_cpu_data
.x86_model
> 7)) {
449 if (nbsh
& K8_NBSH_ERR_CPU_VAL
)
450 pr_cont(", core: %u", (u8
)(nbsh
& nb_err_cpumask
));
452 u8 assoc_cpus
= nbsh
& nb_err_cpumask
;
455 pr_cont(", core: %d", fls(assoc_cpus
) - 1);
460 pr_cont("Sync error (sync packets on HT link detected).\n");
464 pr_cont("HT Master abort.\n");
468 pr_cont("HT Target abort.\n");
472 pr_cont("NB Watchdog timeout.\n");
476 pr_cont("SVM DMA Exclusion Vector error.\n");
483 if (!fam_ops
->nb_mce(ec
, xec
))
486 if (boot_cpu_data
.x86
== 0xf || boot_cpu_data
.x86
== 0x10)
487 if ((xec
== 0x8 || xec
== 0x0) && nb_bus_decoder
)
488 nb_bus_decoder(node_id
, m
, nbcfg
);
493 pr_emerg(HW_ERR
"Corrupted NB MCE info?\n");
495 EXPORT_SYMBOL_GPL(amd_decode_nb_mce
);
/*
 * amd_decode_fr_mce - print a decoded FR machine check.
 * @m: MCE record; only m->status is read in the visible code.
 *
 * CPU families 0xf and 0x11 are tested first. The only known signature is
 * status bits [15:0] == 0x0f0f, reported as a CPU watchdog timer expiry.
 * "Corrupted FR MCE info?" is the fallback message.
 *
 * NOTE(review): what happens for families 0xf/0x11 and how control reaches
 * the fallback is not visible in this extract.
 */
497 static void amd_decode_fr_mce(struct mce
*m
)
499 if (boot_cpu_data
.x86
== 0xf ||
500 boot_cpu_data
.x86
== 0x11)
503 /* we have only one error signature so match all fields at once. */
504 if ((m
->status
& 0xffff) == 0x0f0f) {
505 pr_emerg(HW_ERR
"FR Error: CPU Watchdog timer expire.\n");
510 pr_emerg(HW_ERR
"Corrupted FR MCE info?\n");
/*
 * amd_decode_err_code - print the generic fields encoded in an error code.
 * @ec: error code (amd_decode_mce() passes the low 16 status bits)
 *
 * The fields printed depend on the error class:
 *  - first branch (condition not visible): transaction type, cache level;
 *  - MEM_ERROR(): memory transaction, transaction type, cache level;
 *  - BUS_ERROR(): memory transaction, II field, timeout state, cache level
 *    and participating processor;
 *  - anything else is reported as an unknown MCE error together with @ec.
 *
 * NOTE(review): the first branch condition and the final argument line of
 * the BUS_ERROR() message are missing from this extract.
 */
513 static inline void amd_decode_err_code(u16 ec
)
516 pr_emerg(HW_ERR
"Transaction: %s, Cache Level: %s\n",
517 TT_MSG(ec
), LL_MSG(ec
));
518 } else if (MEM_ERROR(ec
)) {
519 pr_emerg(HW_ERR
"Transaction: %s, Type: %s, Cache Level: %s\n",
520 RRRR_MSG(ec
), TT_MSG(ec
), LL_MSG(ec
));
521 } else if (BUS_ERROR(ec
)) {
522 pr_emerg(HW_ERR
"Transaction: %s (%s), %s, Cache Level: %s, "
523 "Participating Processor: %s\n",
524 RRRR_MSG(ec
), II_MSG(ec
), TO_MSG(ec
), LL_MSG(ec
),
527 pr_emerg(HW_ERR
"Huh? Unknown MCE error 0x%x\n", ec
);
/*
 * amd_filter_mce - test whether an MCE matches a signature to be filtered;
 * amd_decode_mce() calls it before decoding anything.
 * @m: MCE record; m->status and m->bank are read.
 *
 * xec is status bits [20:16]. The one visible signature is bank 4 with
 * xec 0x5 while report_gart_errors is false (NB GART TLB errors).
 *
 * NOTE(review): the return statements are missing from this extract;
 * presumably true means "drop this MCE" - confirm in the full source.
 */
531 * Filter out unwanted MCE signatures here.
533 static bool amd_filter_mce(struct mce
*m
)
535 u8 xec
= (m
->status
>> 16) & 0x1f;
538 * NB GART TLB error reporting is disabled by default.
540 if (m
->bank
== 4 && xec
== 0x5 && !report_gart_errors
)
/*
 * amd_decode_mce - notifier callback that prints a human-readable decode of
 * an MCE (exported; wired up through amd_mce_dec_nb).
 * @nb:   notifier block (not used in the visible code)
 * @val:  notifier value (not used in the visible code)
 * @data: pointer to the struct mce to decode
 *
 * Visible sequence:
 *  1. the record is passed through amd_filter_mce();
 *  2. "MC<bank>_STATUS: " is printed, then corrected/uncorrected state,
 *     overflow ("other errors lost") and PCC ("CPU context corrupt"), all
 *     taken from m->status;
 *  3. ecc is status bits [46:45] - the source comment calls them bits
 *     [14:13], i.e. counted within the upper 32-bit half; the value 2 is
 *     printed as "CECC Error", other printed values as "UECC Error";
 *  4. one of the dc/ic/bu/ls/nb/fr decoders runs (the NB decoder gets the
 *     node from amd_get_nb_id(m->extcpu) and an nbcfg of 0);
 *  5. amd_decode_err_code() prints the generic fields of status bits [15:0].
 *
 * NOTE(review): the local declarations, the selector choosing the decoder
 * (presumably a switch on m->bank), the guard around the ECC message and the
 * return values are missing from this extract.
 */
546 int amd_decode_mce(struct notifier_block
*nb
, unsigned long val
, void *data
)
548 struct mce
*m
= (struct mce
*)data
;
551 if (amd_filter_mce(m
))
554 pr_emerg(HW_ERR
"MC%d_STATUS: ", m
->bank
);
556 pr_cont("%sorrected error, other errors lost: %s, "
557 "CPU context corrupt: %s",
558 ((m
->status
& MCI_STATUS_UC
) ? "Unc" : "C"),
559 ((m
->status
& MCI_STATUS_OVER
) ? "yes" : "no"),
560 ((m
->status
& MCI_STATUS_PCC
) ? "yes" : "no"));
562 /* do the two bits[14:13] together */
563 ecc
= (m
->status
>> 45) & 0x3;
565 pr_cont(", %sECC Error", ((ecc
== 2) ? "C" : "U"));
571 amd_decode_dc_mce(m
);
575 amd_decode_ic_mce(m
);
579 amd_decode_bu_mce(m
);
583 amd_decode_ls_mce(m
);
587 node
= amd_get_nb_id(m
->extcpu
);
588 amd_decode_nb_mce(node
, m
, 0);
592 amd_decode_fr_mce(m
);
599 amd_decode_err_code(m
->status
& 0xffff);
603 EXPORT_SYMBOL_GPL(amd_decode_mce
);
/*
 * Notifier block hooking amd_decode_mce() into x86_mce_decoder_chain; it is
 * registered in mce_amd_init() and unregistered in mce_amd_exit().
 *
 * NOTE(review): the end of the initializer is not visible in this extract, so
 * further fields may be set.
 */
605 static struct notifier_block amd_mce_dec_nb
= {
606 .notifier_call
= amd_decode_mce
,
/*
 * mce_amd_init - set up in-kernel MCE decoding; run as an early_initcall.
 *
 * Visible sequence:
 *  1. test for a non-AMD CPU vendor;
 *  2. test for an unsupported family: outside 0xf..0x12 and not family 0x14
 *     with model <= 0xf;
 *  3. allocate fam_ops zero-filled with kzalloc();
 *  4. switch on the CPU family to install the ->dc_mce, ->ic_mce and ->nb_mce
 *     callbacks; the case using the f14h_* decoders also narrows
 *     nb_err_cpumask to 0x3, and the default case warns about an unknown
 *     family;
 *  5. log that decoding is enabled and register amd_mce_dec_nb on
 *     x86_mce_decoder_chain.
 *
 * NOTE(review): the case labels, the actions taken when checks 1 and 2 hit,
 * the allocation-failure handling and the return values are missing from this
 * extract - confirm them in the full source.
 */
609 static int __init
mce_amd_init(void)
611 if (boot_cpu_data
.x86_vendor
!= X86_VENDOR_AMD
)
614 if ((boot_cpu_data
.x86
< 0xf || boot_cpu_data
.x86
> 0x12) &&
615 (boot_cpu_data
.x86
!= 0x14 || boot_cpu_data
.x86_model
> 0xf))
618 fam_ops
= kzalloc(sizeof(struct amd_decoder_ops
), GFP_KERNEL
);
622 switch (boot_cpu_data
.x86
) {
624 fam_ops
->dc_mce
= k8_dc_mce
;
625 fam_ops
->ic_mce
= k8_ic_mce
;
626 fam_ops
->nb_mce
= k8_nb_mce
;
630 fam_ops
->dc_mce
= f10h_dc_mce
;
631 fam_ops
->ic_mce
= k8_ic_mce
;
632 fam_ops
->nb_mce
= f10h_nb_mce
;
636 fam_ops
->dc_mce
= k8_dc_mce
;
637 fam_ops
->ic_mce
= k8_ic_mce
;
638 fam_ops
->nb_mce
= f10h_nb_mce
;
642 fam_ops
->dc_mce
= f12h_dc_mce
;
643 fam_ops
->ic_mce
= k8_ic_mce
;
644 fam_ops
->nb_mce
= nb_noop_mce
;
648 nb_err_cpumask
= 0x3;
649 fam_ops
->dc_mce
= f14h_dc_mce
;
650 fam_ops
->ic_mce
= f14h_ic_mce
;
651 fam_ops
->nb_mce
= nb_noop_mce
;
655 printk(KERN_WARNING
"Huh? What family is that: %d?!\n",
661 pr_info("MCE: In-kernel MCE decoding enabled.\n");
663 atomic_notifier_chain_register(&x86_mce_decoder_chain
, &amd_mce_dec_nb
);
667 early_initcall(mce_amd_init
);
/*
 * mce_amd_exit - module exit handler; removes amd_mce_dec_nb from
 * x86_mce_decoder_chain.
 *
 * NOTE(review): lines after the unregister call are missing from this
 * extract; whether fam_ops is released here must be confirmed in the full
 * source.
 */
670 static void __exit
mce_amd_exit(void)
672 atomic_notifier_chain_unregister(&x86_mce_decoder_chain
, &amd_mce_dec_nb
);
676 MODULE_DESCRIPTION("AMD MCE decoder");
677 MODULE_ALIAS("edac-mce-amd");
678 MODULE_LICENSE("GPL");
679 module_exit(mce_amd_exit
);