2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
8 #include <linux/init.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/sched.h>
12 #include <linux/string.h>
13 #include <linux/rcupdate.h>
14 #include <linux/kallsyms.h>
15 #include <linux/sysdev.h>
16 #include <linux/miscdevice.h>
18 #include <linux/capability.h>
19 #include <linux/cpu.h>
20 #include <linux/percpu.h>
21 #include <linux/ctype.h>
22 #include <asm/processor.h>
25 #include <asm/kdebug.h>
26 #include <asm/uaccess.h>
28 #define MISC_MCELOG_MINOR 227
31 static int mce_dont_init
;
33 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
34 3: never panic or exit (for testing only) */
35 static int tolerant
= 1;
37 static unsigned long bank
[NR_BANKS
] = { [0 ... NR_BANKS
-1] = ~0UL };
38 static unsigned long console_logged
;
39 static int notify_user
;
41 static int mce_bootlog
= 1;
44 * Lockless MCE logging infrastructure.
45 * This avoids deadlocks on printk locks without having to break locks. Also
46 * separate MCEs from kernel messages to avoid bogus bug reports.
49 struct mce_log mcelog
= {
54 void mce_log(struct mce
*mce
)
60 entry
= rcu_dereference(mcelog
.next
);
61 /* The rmb forces the compiler to reload next in each
65 /* When the buffer fills up discard new entries. Assume
66 that the earlier errors are the more interesting. */
67 if (entry
>= MCE_LOG_LEN
) {
68 set_bit(MCE_OVERFLOW
, &mcelog
.flags
);
71 /* Old left over entry. Skip. */
72 if (mcelog
.entry
[entry
].finished
) {
80 if (cmpxchg(&mcelog
.next
, entry
, next
) == entry
)
83 memcpy(mcelog
.entry
+ entry
, mce
, sizeof(struct mce
));
85 mcelog
.entry
[entry
].finished
= 1;
88 if (!test_and_set_bit(0, &console_logged
))
92 static void print_mce(struct mce
*m
)
94 printk(KERN_EMERG
"\n"
96 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
97 m
->cpu
, m
->mcgstatus
, m
->bank
, m
->status
);
100 "RIP%s %02x:<%016Lx> ",
101 !(m
->mcgstatus
& MCG_STATUS_EIPV
) ? " !INEXACT!" : "",
103 if (m
->cs
== __KERNEL_CS
)
104 print_symbol("{%s}", m
->rip
);
107 printk(KERN_EMERG
"TSC %Lx ", m
->tsc
);
109 printk("ADDR %Lx ", m
->addr
);
111 printk("MISC %Lx ", m
->misc
);
115 static void mce_panic(char *msg
, struct mce
*backup
, unsigned long start
)
119 for (i
= 0; i
< MCE_LOG_LEN
; i
++) {
120 unsigned long tsc
= mcelog
.entry
[i
].tsc
;
121 if (time_before(tsc
, start
))
123 print_mce(&mcelog
.entry
[i
]);
124 if (backup
&& mcelog
.entry
[i
].tsc
== backup
->tsc
)
130 printk("Fake panic: %s\n", msg
);
135 static int mce_available(struct cpuinfo_x86
*c
)
137 return test_bit(X86_FEATURE_MCE
, &c
->x86_capability
) &&
138 test_bit(X86_FEATURE_MCA
, &c
->x86_capability
);
141 static inline void mce_get_rip(struct mce
*m
, struct pt_regs
*regs
)
143 if (regs
&& (m
->mcgstatus
& MCG_STATUS_RIPV
)) {
151 /* Assume the RIP in the MSR is exact. Is this true? */
152 m
->mcgstatus
|= MCG_STATUS_EIPV
;
153 rdmsrl(rip_msr
, m
->rip
);
159 * The actual machine check handler
162 void do_machine_check(struct pt_regs
* regs
, long error_code
)
164 struct mce m
, panicm
;
165 int nowayout
= (tolerant
< 1);
169 int panicm_found
= 0;
172 notify_die(DIE_NMI
, "machine check", regs
, error_code
, 255, SIGKILL
);
176 memset(&m
, 0, sizeof(struct mce
));
177 m
.cpu
= hard_smp_processor_id();
178 rdmsrl(MSR_IA32_MCG_STATUS
, m
.mcgstatus
);
179 if (!(m
.mcgstatus
& MCG_STATUS_RIPV
))
185 for (i
= 0; i
< banks
; i
++) {
194 rdmsrl(MSR_IA32_MC0_STATUS
+ i
*4, m
.status
);
195 if ((m
.status
& MCI_STATUS_VAL
) == 0)
198 if (m
.status
& MCI_STATUS_EN
) {
199 /* In theory _OVER could be a nowayout too, but
200 assume any overflowed errors were not fatal. */
201 nowayout
|= !!(m
.status
& MCI_STATUS_PCC
);
202 kill_it
|= !!(m
.status
& MCI_STATUS_UC
);
205 if (m
.status
& MCI_STATUS_MISCV
)
206 rdmsrl(MSR_IA32_MC0_MISC
+ i
*4, m
.misc
);
207 if (m
.status
& MCI_STATUS_ADDRV
)
208 rdmsrl(MSR_IA32_MC0_ADDR
+ i
*4, m
.addr
);
210 mce_get_rip(&m
, regs
);
213 wrmsrl(MSR_IA32_MC0_STATUS
+ i
*4, 0);
214 if (error_code
!= -2)
217 /* Did this bank cause the exception? */
218 /* Assume that the bank with uncorrectable errors did it,
219 and that there is only a single one. */
220 if ((m
.status
& MCI_STATUS_UC
) && (m
.status
& MCI_STATUS_EN
)) {
225 add_taint(TAINT_MACHINE_CHECK
);
228 /* Never do anything final in the polling timer */
232 /* If we didn't find an uncorrectable error, pick
233 the last one (shouldn't happen, just being safe). */
237 mce_panic("Machine check", &panicm
, mcestart
);
241 if (m
.mcgstatus
& MCG_STATUS_RIPV
)
242 user_space
= panicm
.rip
&& (panicm
.cs
& 3);
244 /* When the machine was in user space and the CPU didn't get
245 confused it's normally not necessary to panic, unless you
246 are paranoid (tolerant == 0)
248 RED-PEN could be more tolerant for MCEs in idle,
249 but most likely they occur at boot anyways, where
250 it is best to just halt the machine. */
251 if ((!user_space
&& (panic_on_oops
|| tolerant
< 2)) ||
252 (unsigned)current
->pid
<= 1)
253 mce_panic("Uncorrected machine check", &panicm
, mcestart
);
255 /* do_exit takes an awful lot of locks and has a
256 slight risk of deadlocking. If you don't want that
257 don't set tolerant >= 2 */
263 /* Last thing done in the machine check exception to clear state. */
264 wrmsrl(MSR_IA32_MCG_STATUS
, 0);
268 * Periodic polling timer for "silent" machine check errors.
271 static int check_interval
= 5 * 60; /* 5 minutes */
272 static void mcheck_timer(void *data
);
273 static DECLARE_WORK(mcheck_work
, mcheck_timer
, NULL
);
275 static void mcheck_check_cpu(void *info
)
277 if (mce_available(¤t_cpu_data
))
278 do_machine_check(NULL
, 0);
281 static void mcheck_timer(void *data
)
283 on_each_cpu(mcheck_check_cpu
, NULL
, 1, 1);
284 schedule_delayed_work(&mcheck_work
, check_interval
* HZ
);
287 * It's ok to read stale data here for notify_user and
288 * console_logged as we'll simply get the updated versions
289 * on the next mcheck_timer execution and atomic operations
290 * on console_logged act as synchronization for notify_user
293 if (notify_user
&& console_logged
) {
295 clear_bit(0, &console_logged
);
296 printk(KERN_INFO
"Machine check events logged\n");
301 static __init
int periodic_mcheck_init(void)
304 schedule_delayed_work(&mcheck_work
, check_interval
*HZ
);
307 __initcall(periodic_mcheck_init
);
311 * Initialize Machine Checks for a CPU.
313 static void mce_init(void *dummy
)
318 rdmsrl(MSR_IA32_MCG_CAP
, cap
);
320 if (banks
> NR_BANKS
) {
321 printk(KERN_INFO
"MCE: warning: using only %d banks\n", banks
);
324 /* Use accurate RIP reporting if available. */
325 if ((cap
& (1<<9)) && ((cap
>> 16) & 0xff) >= 9)
326 rip_msr
= MSR_IA32_MCG_EIP
;
328 /* Log the machine checks left over from the previous reset.
329 This also clears all registers */
330 do_machine_check(NULL
, mce_bootlog
? -1 : -2);
332 set_in_cr4(X86_CR4_MCE
);
335 wrmsr(MSR_IA32_MCG_CTL
, 0xffffffff, 0xffffffff);
337 for (i
= 0; i
< banks
; i
++) {
338 wrmsrl(MSR_IA32_MC0_CTL
+4*i
, bank
[i
]);
339 wrmsrl(MSR_IA32_MC0_STATUS
+4*i
, 0);
343 /* Add per CPU specific workarounds here */
344 static void __cpuinit
mce_cpu_quirks(struct cpuinfo_x86
*c
)
346 /* This should be disabled by the BIOS, but isn't always */
347 if (c
->x86_vendor
== X86_VENDOR_AMD
&& c
->x86
== 15) {
348 /* disable GART TBL walk error reporting, which trips off
349 incorrectly with the IOMMU & 3ware & Cerberus. */
350 clear_bit(10, &bank
[4]);
351 /* Lots of broken BIOS around that don't clear them
352 by default and leave crap in there. Don't log. */
358 static void __cpuinit
mce_cpu_features(struct cpuinfo_x86
*c
)
360 switch (c
->x86_vendor
) {
361 case X86_VENDOR_INTEL
:
362 mce_intel_feature_init(c
);
365 mce_amd_feature_init(c
);
373 * Called for each booted CPU to set up machine checks.
374 * Must be called with preempt off.
376 void __cpuinit
mcheck_init(struct cpuinfo_x86
*c
)
378 static cpumask_t mce_cpus __initdata
= CPU_MASK_NONE
;
383 cpu_test_and_set(smp_processor_id(), mce_cpus
) ||
392 * Character device to read and clear the MCE log.
/*
 * Store the current time stamp counter of the executing CPU.
 *
 * @data: array of unsigned long, indexed by CPU number; the slot
 *        belonging to smp_processor_id() is overwritten with the value
 *        read by rdtscll(). The caller owns the array.
 */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;

	rdtscll(tsc_by_cpu[smp_processor_id()]);
}
401 static ssize_t
mce_read(struct file
*filp
, char __user
*ubuf
, size_t usize
, loff_t
*off
)
403 unsigned long *cpu_tsc
;
404 static DECLARE_MUTEX(mce_read_sem
);
406 char __user
*buf
= ubuf
;
409 cpu_tsc
= kmalloc(NR_CPUS
* sizeof(long), GFP_KERNEL
);
414 next
= rcu_dereference(mcelog
.next
);
416 /* Only supports full reads right now */
417 if (*off
!= 0 || usize
< MCE_LOG_LEN
*sizeof(struct mce
)) {
424 for (i
= 0; i
< next
; i
++) {
425 unsigned long start
= jiffies
;
426 while (!mcelog
.entry
[i
].finished
) {
427 if (!time_before(jiffies
, start
+ 2)) {
428 memset(mcelog
.entry
+ i
,0, sizeof(struct mce
));
434 err
|= copy_to_user(buf
, mcelog
.entry
+ i
, sizeof(struct mce
));
435 buf
+= sizeof(struct mce
);
438 memset(mcelog
.entry
, 0, next
* sizeof(struct mce
));
443 /* Collect entries that were still getting written before the synchronize. */
445 on_each_cpu(collect_tscs
, cpu_tsc
, 1, 1);
446 for (i
= next
; i
< MCE_LOG_LEN
; i
++) {
447 if (mcelog
.entry
[i
].finished
&&
448 mcelog
.entry
[i
].tsc
< cpu_tsc
[mcelog
.entry
[i
].cpu
]) {
449 err
|= copy_to_user(buf
, mcelog
.entry
+i
, sizeof(struct mce
));
451 buf
+= sizeof(struct mce
);
452 memset(&mcelog
.entry
[i
], 0, sizeof(struct mce
));
457 return err
? -EFAULT
: buf
- ubuf
;
460 static int mce_ioctl(struct inode
*i
, struct file
*f
,unsigned int cmd
, unsigned long arg
)
462 int __user
*p
= (int __user
*)arg
;
463 if (!capable(CAP_SYS_ADMIN
))
466 case MCE_GET_RECORD_LEN
:
467 return put_user(sizeof(struct mce
), p
);
468 case MCE_GET_LOG_LEN
:
469 return put_user(MCE_LOG_LEN
, p
);
470 case MCE_GETCLEAR_FLAGS
: {
473 flags
= mcelog
.flags
;
474 } while (cmpxchg(&mcelog
.flags
, flags
, 0) != flags
);
475 return put_user(flags
, p
);
482 static struct file_operations mce_chrdev_ops
= {
487 static struct miscdevice mce_log_device
= {
494 * Old style boot options parsing. Only for compatibility.
497 static int __init
mcheck_disable(char *str
)
503 /* mce=off disables machine check. Note you can reenable it later
505 mce=TOLERANCELEVEL (number, see above)
506 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
507 mce=nobootlog Don't log MCEs from before booting. */
508 static int __init
mcheck_enable(char *str
)
512 if (!strcmp(str
, "off"))
514 else if (!strcmp(str
, "bootlog") || !strcmp(str
,"nobootlog"))
515 mce_bootlog
= str
[0] == 'b';
516 else if (isdigit(str
[0]))
517 get_option(&str
, &tolerant
);
519 printk("mce= argument %s ignored. Please use /sys", str
);
523 __setup("nomce", mcheck_disable
);
524 __setup("mce", mcheck_enable
);
530 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
531 Only one CPU is active at this time, the others get readded later using
533 static int mce_resume(struct sys_device
*dev
)
539 /* Reinit MCEs after user configuration changes */
540 static void mce_restart(void)
543 cancel_delayed_work(&mcheck_work
);
544 /* Timer race is harmless here */
545 on_each_cpu(mce_init
, NULL
, 1, 1);
547 schedule_delayed_work(&mcheck_work
, check_interval
*HZ
);
550 static struct sysdev_class mce_sysclass
= {
551 .resume
= mce_resume
,
552 set_kset_name("machinecheck"),
555 static DEFINE_PER_CPU(struct sys_device
, device_mce
);
557 /* Why are there no generic functions for this? */
558 #define ACCESSOR(name, var, start) \
559 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
560 return sprintf(buf, "%lx\n", (unsigned long)var); \
562 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
564 unsigned long new = simple_strtoul(buf, &end, 0); \
565 if (end == buf) return -EINVAL; \
570 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
572 ACCESSOR(bank0ctl
,bank
[0],mce_restart())
573 ACCESSOR(bank1ctl
,bank
[1],mce_restart())
574 ACCESSOR(bank2ctl
,bank
[2],mce_restart())
575 ACCESSOR(bank3ctl
,bank
[3],mce_restart())
576 ACCESSOR(bank4ctl
,bank
[4],mce_restart())
577 ACCESSOR(tolerant
,tolerant
,)
578 ACCESSOR(check_interval
,check_interval
,mce_restart())
580 /* Per cpu sysdev init. All of the cpus still share the same ctl bank */
581 static __cpuinit
int mce_create_device(unsigned int cpu
)
584 if (!mce_available(&cpu_data
[cpu
]))
587 per_cpu(device_mce
,cpu
).id
= cpu
;
588 per_cpu(device_mce
,cpu
).cls
= &mce_sysclass
;
590 err
= sysdev_register(&per_cpu(device_mce
,cpu
));
593 sysdev_create_file(&per_cpu(device_mce
,cpu
), &attr_bank0ctl
);
594 sysdev_create_file(&per_cpu(device_mce
,cpu
), &attr_bank1ctl
);
595 sysdev_create_file(&per_cpu(device_mce
,cpu
), &attr_bank2ctl
);
596 sysdev_create_file(&per_cpu(device_mce
,cpu
), &attr_bank3ctl
);
597 sysdev_create_file(&per_cpu(device_mce
,cpu
), &attr_bank4ctl
);
598 sysdev_create_file(&per_cpu(device_mce
,cpu
), &attr_tolerant
);
599 sysdev_create_file(&per_cpu(device_mce
,cpu
), &attr_check_interval
);
604 #ifdef CONFIG_HOTPLUG_CPU
605 static __cpuinit
void mce_remove_device(unsigned int cpu
)
607 sysdev_remove_file(&per_cpu(device_mce
,cpu
), &attr_bank0ctl
);
608 sysdev_remove_file(&per_cpu(device_mce
,cpu
), &attr_bank1ctl
);
609 sysdev_remove_file(&per_cpu(device_mce
,cpu
), &attr_bank2ctl
);
610 sysdev_remove_file(&per_cpu(device_mce
,cpu
), &attr_bank3ctl
);
611 sysdev_remove_file(&per_cpu(device_mce
,cpu
), &attr_bank4ctl
);
612 sysdev_remove_file(&per_cpu(device_mce
,cpu
), &attr_tolerant
);
613 sysdev_remove_file(&per_cpu(device_mce
,cpu
), &attr_check_interval
);
614 sysdev_unregister(&per_cpu(device_mce
,cpu
));
618 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
620 mce_cpu_callback(struct notifier_block
*nfb
, unsigned long action
, void *hcpu
)
622 unsigned int cpu
= (unsigned long)hcpu
;
626 mce_create_device(cpu
);
628 #ifdef CONFIG_HOTPLUG_CPU
630 mce_remove_device(cpu
);
637 static struct notifier_block mce_cpu_notifier
= {
638 .notifier_call
= mce_cpu_callback
,
641 static __init
int mce_init_device(void)
646 if (!mce_available(&boot_cpu_data
))
648 err
= sysdev_class_register(&mce_sysclass
);
650 for_each_online_cpu(i
) {
651 mce_create_device(i
);
654 register_cpu_notifier(&mce_cpu_notifier
);
655 misc_register(&mce_log_device
);
659 device_initcall(mce_init_device
);