RT-AC66 3.0.0.4.374.130 core
[tomato.git] / release / src-rt-6.x / linux / linux-2.6 / arch / x86_64 / kernel / mce.c
blobaa1d1599179433b1e116cd834dfebebe2ab3c67f
1 /*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
8 #include <linux/init.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/sched.h>
12 #include <linux/string.h>
13 #include <linux/rcupdate.h>
14 #include <linux/kallsyms.h>
15 #include <linux/sysdev.h>
16 #include <linux/miscdevice.h>
17 #include <linux/fs.h>
18 #include <linux/capability.h>
19 #include <linux/cpu.h>
20 #include <linux/percpu.h>
21 #include <linux/ctype.h>
22 #include <linux/kmod.h>
23 #include <linux/kdebug.h>
24 #include <asm/processor.h>
25 #include <asm/msr.h>
26 #include <asm/mce.h>
27 #include <asm/uaccess.h>
28 #include <asm/smp.h>
30 #define MISC_MCELOG_MINOR 227
31 #define NR_BANKS 6
33 atomic_t mce_entry;
35 static int mce_dont_init;
37 /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
38 3: never panic or exit (for testing only) */
39 static int tolerant = 1;
40 static int banks;
41 static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
42 static unsigned long console_logged;
43 static int notify_user;
44 static int rip_msr;
45 static int mce_bootlog = 1;
46 static atomic_t mce_events;
48 static char trigger[128];
49 static char *trigger_argv[2] = { trigger, NULL };
52 * Lockless MCE logging infrastructure.
53 * This avoids deadlocks on printk locks without having to break locks. Also
54 * separate MCEs from kernel messages to avoid bogus bug reports.
57 struct mce_log mcelog = {
58 MCE_LOG_SIGNATURE,
59 MCE_LOG_LEN,
60 };
62 void mce_log(struct mce *mce)
64 unsigned next, entry;
65 atomic_inc(&mce_events);
66 mce->finished = 0;
67 wmb();
68 for (;;) {
69 entry = rcu_dereference(mcelog.next);
70 /* The rmb forces the compiler to reload next in each
71 iteration */
72 rmb();
73 for (;;) {
74 /* When the buffer fills up discard new entries. Assume
75 that the earlier errors are the more interesting. */
76 if (entry >= MCE_LOG_LEN) {
77 set_bit(MCE_OVERFLOW, &mcelog.flags);
78 return;
80 /* Old left over entry. Skip. */
81 if (mcelog.entry[entry].finished) {
82 entry++;
83 continue;
85 break;
87 smp_rmb();
88 next = entry + 1;
89 if (cmpxchg(&mcelog.next, entry, next) == entry)
90 break;
92 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
93 wmb();
94 mcelog.entry[entry].finished = 1;
95 wmb();
97 if (!test_and_set_bit(0, &console_logged))
98 notify_user = 1;
101 static void print_mce(struct mce *m)
103 printk(KERN_EMERG "\n"
104 KERN_EMERG "HARDWARE ERROR\n"
105 KERN_EMERG
106 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
107 m->cpu, m->mcgstatus, m->bank, m->status);
108 if (m->rip) {
109 printk(KERN_EMERG
110 "RIP%s %02x:<%016Lx> ",
111 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
112 m->cs, m->rip);
113 if (m->cs == __KERNEL_CS)
114 print_symbol("{%s}", m->rip);
115 printk("\n");
117 printk(KERN_EMERG "TSC %Lx ", m->tsc);
118 if (m->addr)
119 printk("ADDR %Lx ", m->addr);
120 if (m->misc)
121 printk("MISC %Lx ", m->misc);
122 printk("\n");
123 printk(KERN_EMERG "This is not a software problem!\n");
124 printk(KERN_EMERG
125 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
128 static void mce_panic(char *msg, struct mce *backup, unsigned long start)
130 int i;
131 oops_begin();
132 for (i = 0; i < MCE_LOG_LEN; i++) {
133 unsigned long tsc = mcelog.entry[i].tsc;
134 if (time_before(tsc, start))
135 continue;
136 print_mce(&mcelog.entry[i]);
137 if (backup && mcelog.entry[i].tsc == backup->tsc)
138 backup = NULL;
140 if (backup)
141 print_mce(backup);
142 if (tolerant >= 3)
143 printk("Fake panic: %s\n", msg);
144 else
145 panic(msg);
148 static int mce_available(struct cpuinfo_x86 *c)
150 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
153 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
155 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
156 m->rip = regs->rip;
157 m->cs = regs->cs;
158 } else {
159 m->rip = 0;
160 m->cs = 0;
162 if (rip_msr) {
163 /* Assume the RIP in the MSR is exact. Is this true? */
164 m->mcgstatus |= MCG_STATUS_EIPV;
165 rdmsrl(rip_msr, m->rip);
166 m->cs = 0;
170 static void do_mce_trigger(void)
172 static atomic_t mce_logged;
173 int events = atomic_read(&mce_events);
174 if (events != atomic_read(&mce_logged) && trigger[0]) {
175 /* Small race window, but should be harmless. */
176 atomic_set(&mce_logged, events);
177 call_usermodehelper(trigger, trigger_argv, NULL, -1);
182 * The actual machine check handler
185 void do_machine_check(struct pt_regs * regs, long error_code)
187 struct mce m, panicm;
188 int nowayout = (tolerant < 1);
189 int kill_it = 0;
190 u64 mcestart = 0;
191 int i;
192 int panicm_found = 0;
194 atomic_inc(&mce_entry);
196 if (regs)
197 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
198 if (!banks)
199 goto out2;
201 memset(&m, 0, sizeof(struct mce));
202 m.cpu = smp_processor_id();
203 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
204 if (!(m.mcgstatus & MCG_STATUS_RIPV))
205 kill_it = 1;
207 rdtscll(mcestart);
208 barrier();
210 for (i = 0; i < banks; i++) {
211 if (!bank[i])
212 continue;
214 m.misc = 0;
215 m.addr = 0;
216 m.bank = i;
217 m.tsc = 0;
219 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
220 if ((m.status & MCI_STATUS_VAL) == 0)
221 continue;
223 if (m.status & MCI_STATUS_EN) {
224 /* In theory _OVER could be a nowayout too, but
225 assume any overflowed errors were no fatal. */
226 nowayout |= !!(m.status & MCI_STATUS_PCC);
227 kill_it |= !!(m.status & MCI_STATUS_UC);
230 if (m.status & MCI_STATUS_MISCV)
231 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
232 if (m.status & MCI_STATUS_ADDRV)
233 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
235 mce_get_rip(&m, regs);
236 if (error_code >= 0)
237 rdtscll(m.tsc);
238 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
239 if (error_code != -2)
240 mce_log(&m);
242 /* Did this bank cause the exception? */
243 /* Assume that the bank with uncorrectable errors did it,
244 and that there is only a single one. */
245 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
246 panicm = m;
247 panicm_found = 1;
250 add_taint(TAINT_MACHINE_CHECK);
253 /* Never do anything final in the polling timer */
254 if (!regs) {
255 /* Normal interrupt context here. Call trigger for any new
256 events. */
257 do_mce_trigger();
258 goto out;
261 /* If we didn't find an uncorrectable error, pick
262 the last one (shouldn't happen, just being safe). */
263 if (!panicm_found)
264 panicm = m;
265 if (nowayout)
266 mce_panic("Machine check", &panicm, mcestart);
267 if (kill_it) {
268 int user_space = 0;
270 if (m.mcgstatus & MCG_STATUS_RIPV)
271 user_space = panicm.rip && (panicm.cs & 3);
273 /* When the machine was in user space and the CPU didn't get
274 confused it's normally not necessary to panic, unless you
275 are paranoid (tolerant == 0)
277 RED-PEN could be more tolerant for MCEs in idle,
278 but most likely they occur at boot anyways, where
279 it is best to just halt the machine. */
280 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
281 (unsigned)current->pid <= 1)
282 mce_panic("Uncorrected machine check", &panicm, mcestart);
284 /* do_exit takes an awful lot of locks and has as
285 slight risk of deadlocking. If you don't want that
286 don't set tolerant >= 2 */
287 if (tolerant < 3)
288 do_exit(SIGBUS);
291 out:
292 /* Last thing done in the machine check exception to clear state. */
293 wrmsrl(MSR_IA32_MCG_STATUS, 0);
294 out2:
295 atomic_dec(&mce_entry);
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
	struct mce event;

	/* Start from an all-zero record, then fill in what is known. */
	memset(&event, 0, sizeof(event));
	event.status = status;
	event.bank = MCE_THERMAL_BANK;
	event.cpu = cpu;
	rdtscll(event.tsc);

	mce_log(&event);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

/* Upper bound of the polling period, in seconds (5 minutes). */
static int check_interval = 5 * 60;
/* Delay until the next poll, in jiffies. */
static int next_interval;

static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
336 static void mcheck_check_cpu(void *info)
338 if (mce_available(&current_cpu_data))
339 do_machine_check(NULL, 0);
342 static void mcheck_timer(struct work_struct *work)
344 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
347 * It's ok to read stale data here for notify_user and
348 * console_logged as we'll simply get the updated versions
349 * on the next mcheck_timer execution and atomic operations
350 * on console_logged act as synchronization for notify_user
351 * writes.
353 if (notify_user && console_logged) {
354 static unsigned long last_print;
355 unsigned long now = jiffies;
357 /* if we logged an MCE, reduce the polling interval */
358 next_interval = max(next_interval/2, HZ/100);
359 notify_user = 0;
360 clear_bit(0, &console_logged);
361 if (time_after_eq(now, last_print + (check_interval*HZ))) {
362 last_print = now;
363 printk(KERN_INFO "Machine check events logged\n");
365 } else {
366 next_interval = min(next_interval*2, check_interval*HZ);
369 schedule_delayed_work(&mcheck_work, next_interval);
373 static __init int periodic_mcheck_init(void)
375 next_interval = check_interval * HZ;
376 if (next_interval)
377 schedule_delayed_work(&mcheck_work, next_interval);
378 return 0;
380 __initcall(periodic_mcheck_init);
384 * Initialize Machine Checks for a CPU.
386 static void mce_init(void *dummy)
388 u64 cap;
389 int i;
391 rdmsrl(MSR_IA32_MCG_CAP, cap);
392 banks = cap & 0xff;
393 if (banks > NR_BANKS) {
394 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
395 banks = NR_BANKS;
397 /* Use accurate RIP reporting if available. */
398 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
399 rip_msr = MSR_IA32_MCG_EIP;
401 /* Log the machine checks left over from the previous reset.
402 This also clears all registers */
403 do_machine_check(NULL, mce_bootlog ? -1 : -2);
405 set_in_cr4(X86_CR4_MCE);
407 if (cap & MCG_CTL_P)
408 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
410 for (i = 0; i < banks; i++) {
411 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
412 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
416 /* Add per CPU specific workarounds here */
417 static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
419 /* This should be disabled by the BIOS, but isn't always */
420 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
421 /* disable GART TBL walk error reporting, which trips off
422 incorrectly with the IOMMU & 3ware & Cerberus. */
423 clear_bit(10, &bank[4]);
424 /* Lots of broken BIOS around that don't clear them
425 by default and leave crap in there. Don't log. */
426 mce_bootlog = 0;
431 static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
433 switch (c->x86_vendor) {
434 case X86_VENDOR_INTEL:
435 mce_intel_feature_init(c);
436 break;
437 case X86_VENDOR_AMD:
438 mce_amd_feature_init(c);
439 break;
440 default:
441 break;
446 * Called for each booted CPU to set up machine checks.
447 * Must be called with preempt off.
449 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
451 static cpumask_t mce_cpus = CPU_MASK_NONE;
453 mce_cpu_quirks(c);
455 if (mce_dont_init ||
456 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
457 !mce_available(c))
458 return;
460 mce_init(NULL);
461 mce_cpu_features(c);
/*
 * Character device to read and clear the MCE log.
 */

/*
 * collect_tscs - store the current CPU's TSC into a per-cpu indexed array.
 * @data: array of unsigned long, indexed by CPU number.
 */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;
	int this_cpu = smp_processor_id();

	rdtscll(tsc_by_cpu[this_cpu]);
}
474 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
476 unsigned long *cpu_tsc;
477 static DECLARE_MUTEX(mce_read_sem);
478 unsigned next;
479 char __user *buf = ubuf;
480 int i, err;
482 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
483 if (!cpu_tsc)
484 return -ENOMEM;
486 down(&mce_read_sem);
487 next = rcu_dereference(mcelog.next);
489 /* Only supports full reads right now */
490 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
491 up(&mce_read_sem);
492 kfree(cpu_tsc);
493 return -EINVAL;
496 err = 0;
497 for (i = 0; i < next; i++) {
498 unsigned long start = jiffies;
499 while (!mcelog.entry[i].finished) {
500 if (time_after_eq(jiffies, start + 2)) {
501 memset(mcelog.entry + i,0, sizeof(struct mce));
502 goto timeout;
504 cpu_relax();
506 smp_rmb();
507 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
508 buf += sizeof(struct mce);
509 timeout:
513 memset(mcelog.entry, 0, next * sizeof(struct mce));
514 mcelog.next = 0;
516 synchronize_sched();
518 /* Collect entries that were still getting written before the synchronize. */
520 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
521 for (i = next; i < MCE_LOG_LEN; i++) {
522 if (mcelog.entry[i].finished &&
523 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
524 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
525 smp_rmb();
526 buf += sizeof(struct mce);
527 memset(&mcelog.entry[i], 0, sizeof(struct mce));
530 up(&mce_read_sem);
531 kfree(cpu_tsc);
532 return err ? -EFAULT : buf - ubuf;
535 static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
537 int __user *p = (int __user *)arg;
538 if (!capable(CAP_SYS_ADMIN))
539 return -EPERM;
540 switch (cmd) {
541 case MCE_GET_RECORD_LEN:
542 return put_user(sizeof(struct mce), p);
543 case MCE_GET_LOG_LEN:
544 return put_user(MCE_LOG_LEN, p);
545 case MCE_GETCLEAR_FLAGS: {
546 unsigned flags;
547 do {
548 flags = mcelog.flags;
549 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
550 return put_user(flags, p);
552 default:
553 return -ENOTTY;
557 static const struct file_operations mce_chrdev_ops = {
558 .read = mce_read,
559 .ioctl = mce_ioctl,
562 static struct miscdevice mce_log_device = {
563 MISC_MCELOG_MINOR,
564 "mcelog",
565 &mce_chrdev_ops,
569 * Old style boot options parsing. Only for compatibility.
572 static int __init mcheck_disable(char *str)
574 mce_dont_init = 1;
575 return 1;
578 /* mce=off disables machine check. Note you can reenable it later
579 using sysfs.
580 mce=TOLERANCELEVEL (number, see above)
581 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
582 mce=nobootlog Don't log MCEs from before booting. */
583 static int __init mcheck_enable(char *str)
585 if (*str == '=')
586 str++;
587 if (!strcmp(str, "off"))
588 mce_dont_init = 1;
589 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
590 mce_bootlog = str[0] == 'b';
591 else if (isdigit(str[0]))
592 get_option(&str, &tolerant);
593 else
594 printk("mce= argument %s ignored. Please use /sys", str);
595 return 1;
598 __setup("nomce", mcheck_disable);
599 __setup("mce", mcheck_enable);
602 * Sysfs support
605 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
606 Only one CPU is active at this time, the others get readded later using
607 CPU hotplug. */
608 static int mce_resume(struct sys_device *dev)
610 mce_init(NULL);
611 return 0;
614 /* Reinit MCEs after user configuration changes */
615 static void mce_restart(void)
617 if (next_interval)
618 cancel_delayed_work(&mcheck_work);
619 /* Timer race is harmless here */
620 on_each_cpu(mce_init, NULL, 1, 1);
621 next_interval = check_interval * HZ;
622 if (next_interval)
623 schedule_delayed_work(&mcheck_work, next_interval);
626 static struct sysdev_class mce_sysclass = {
627 .resume = mce_resume,
628 set_kset_name("machinecheck"),
631 DEFINE_PER_CPU(struct sys_device, device_mce);
633 /* Why are there no generic functions for this? */
634 #define ACCESSOR(name, var, start) \
635 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
636 return sprintf(buf, "%lx\n", (unsigned long)var); \
638 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
639 char *end; \
640 unsigned long new = simple_strtoul(buf, &end, 0); \
641 if (end == buf) return -EINVAL; \
642 var = new; \
643 start; \
644 return end-buf; \
646 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
648 /* TBD should generate these dynamically based on number of available banks */
649 ACCESSOR(bank0ctl,bank[0],mce_restart())
650 ACCESSOR(bank1ctl,bank[1],mce_restart())
651 ACCESSOR(bank2ctl,bank[2],mce_restart())
652 ACCESSOR(bank3ctl,bank[3],mce_restart())
653 ACCESSOR(bank4ctl,bank[4],mce_restart())
654 ACCESSOR(bank5ctl,bank[5],mce_restart())
656 static ssize_t show_trigger(struct sys_device *s, char *buf)
658 strcpy(buf, trigger);
659 strcat(buf, "\n");
660 return strlen(trigger) + 1;
663 static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
665 char *p;
666 int len;
667 strncpy(trigger, buf, sizeof(trigger));
668 trigger[sizeof(trigger)-1] = 0;
669 len = strlen(trigger);
670 p = strchr(trigger, '\n');
671 if (*p) *p = 0;
672 return len;
675 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
676 ACCESSOR(tolerant,tolerant,)
677 ACCESSOR(check_interval,check_interval,mce_restart())
678 static struct sysdev_attribute *mce_attributes[] = {
679 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
680 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
681 &attr_tolerant, &attr_check_interval, &attr_trigger,
682 NULL
685 /* Per cpu sysdev init. All of the cpus still share the same ctl bank */
686 static __cpuinit int mce_create_device(unsigned int cpu)
688 int err;
689 int i;
690 if (!mce_available(&cpu_data[cpu]))
691 return -EIO;
693 per_cpu(device_mce,cpu).id = cpu;
694 per_cpu(device_mce,cpu).cls = &mce_sysclass;
696 err = sysdev_register(&per_cpu(device_mce,cpu));
698 if (!err) {
699 for (i = 0; mce_attributes[i]; i++)
700 sysdev_create_file(&per_cpu(device_mce,cpu),
701 mce_attributes[i]);
703 return err;
706 static void mce_remove_device(unsigned int cpu)
708 int i;
710 for (i = 0; mce_attributes[i]; i++)
711 sysdev_remove_file(&per_cpu(device_mce,cpu),
712 mce_attributes[i]);
713 sysdev_unregister(&per_cpu(device_mce,cpu));
714 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
717 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
718 static int
719 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
721 unsigned int cpu = (unsigned long)hcpu;
723 switch (action) {
724 case CPU_ONLINE:
725 case CPU_ONLINE_FROZEN:
726 mce_create_device(cpu);
727 break;
728 case CPU_DEAD:
729 case CPU_DEAD_FROZEN:
730 mce_remove_device(cpu);
731 break;
733 return NOTIFY_OK;
736 static struct notifier_block mce_cpu_notifier = {
737 .notifier_call = mce_cpu_callback,
740 static __init int mce_init_device(void)
742 int err;
743 int i = 0;
745 if (!mce_available(&boot_cpu_data))
746 return -EIO;
747 err = sysdev_class_register(&mce_sysclass);
749 for_each_online_cpu(i) {
750 mce_create_device(i);
753 register_hotcpu_notifier(&mce_cpu_notifier);
754 misc_register(&mce_log_device);
755 return err;
758 device_initcall(mce_init_device);