x86, mce, cmci: export MAX_NR_BANKS
arch/x86/kernel/cpu/mcheck/mce_64.c
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227
atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
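
/*
 * Append a record to the lockless log: reserve a slot by advancing
 * mcelog.next with cmpxchg, copy the record in, then set ->finished
 * so readers know the entry is complete.
 */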
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
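
/*
 * Dump a single machine check record to the console. The output is
 * meant to be fed through mcelog --ascii for decoding.
 */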
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}
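
/*
 * Machine check handling needs both the MCE and MCA CPUID features and
 * must not have been disabled (on the command line or by a failed init).
 */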
static int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
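
/*
 * Fill in the faulting IP: use the trapping pt_regs when MCG_STATUS.RIPV
 * marks them valid, otherwise fall back to the vendor rIP MSR if the CPU
 * provides one.
 */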
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */

		mce_log(&m);
		add_taint(TAINT_MACHINE_CHECK);

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP);

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	if (mce_notify_user()) {
		next_interval = max(next_interval/2, HZ/100);
	} else {
		next_interval = min(next_interval * 2,
				(int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + next_interval;
	add_timer(t);
}
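
/*
 * Work queue handler: run the user configured trigger program.
 * Deferred to process context because the usermode helper cannot be
 * started from the interrupt or machine check paths that log events.
 */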
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		static unsigned long last_print;
		unsigned long now = jiffies;

		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}

		return 1;
	}
	return 0;
}
/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
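
/*
 * Program the hardware on this CPU: log anything left over from before,
 * enable MCE in CR4, open up MCG_CTL and every per bank control register,
 * and clear stale status on the way.
 */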
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	machine_check_poll(MCP_UC);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}
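
/* Hook up vendor specific machine check extensions. */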
static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}
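
/* Arm the per-CPU polling timer; polling is disabled when check_interval is 0. */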
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);

	/* data race harmless because everyone sets to the same value */
	if (!next_interval)
		next_interval = check_interval * HZ;
	if (!next_interval)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies_relative(jiffies + next_interval);
	add_timer(t);
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}
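
/*
 * Helper for on_each_cpu(): record each CPU's current TSC so the reader
 * can tell which log entries were written before the snapshot.
 */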
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}
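
/*
 * Read handler for /dev/mcelog. Only whole-buffer reads are supported:
 * finished entries are copied to user space, cleared, and mcelog.next is
 * reset; a second pass picks up entries that were still being written
 * while the first pass ran.
 */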
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
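
/* poll() support: /dev/mcelog is readable whenever unread records exist. */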
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}
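
/*
 * ioctl interface: report the record and log sizes and atomically
 * fetch-and-clear the overflow flags.
 */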
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}
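
/*
 * Runs on each CPU via on_each_cpu(): stop the polling timer, reprogram
 * the banks and restart the timer with the current settings.
 */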
static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	next_interval = check_interval * HZ;
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
static struct sysdev_attribute *bank_attrs;
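
/*
 * Per bank sysfs control files (bank0..bankN). The value written becomes
 * the MCi_CTL mask programmed into the bank on the next reinit.
 */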
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];
	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);
	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}
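
/*
 * The 'trigger' sysfs file holds the path of the program started when new
 * machine check events are logged.
 */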
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	if (p)		/* strchr() may return NULL; check the pointer itself */
		*p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};
static cpumask_t mce_device_initialized = CPU_MASK_NONE;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpu_set(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	}
error:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpu_isset(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpu_clear(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies_relative(jiffies + next_interval);
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};
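
/* Allocate and name the sysfs attributes for the per bank control files. */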
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];
		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;
	return -ENOMEM;
}
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);