x86, mce: remove invalid __cpuinit/__cpuexit annotations
[linux-2.6/mini2440.git] / arch/x86/kernel/cpu/mcheck/mce_64.c
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227

/*
 * To support more than 128 would need to escape the predefined
 * Linux defined extended banks first.
 */
#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
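
/*
 * Note on the logging protocol above: writers reserve a slot by advancing
 * mcelog.next with cmpxchg(), copy the record in, and only then set
 * ->finished behind a wmb().  Consumers such as mce_read() below are
 * expected to test ->finished and pair it with smp_rmb() before trusting
 * the entry contents.
 */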

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		mce_log(&m);
		add_taint(TAINT_MACHINE_CHECK);

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP);

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	if (mce_notify_user()) {
		next_interval = max(next_interval/2, HZ/100);
	} else {
		next_interval = min(next_interval * 2,
				(int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + next_interval;
	add_timer(t);
}
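
/*
 * The polling cadence above is adaptive: it starts at check_interval*HZ
 * (five minutes by default), is halved down to a floor of HZ/100 each time
 * mce_notify_user() reports new events, and is doubled back up to the
 * configured maximum whenever a poll finds nothing.
 */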

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		static unsigned long last_print;
		unsigned long now = jiffies;

		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}

		return 1;
	}
	return 0;
}

/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
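
/*
 * MCG_CAP decoding used above (commentary): bits 7:0 report the number of
 * banks, bit 9 advertises the extended register block and bits 23:16 its
 * size; the separate EIP register is only trusted when at least 9 extended
 * registers are present.
 */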

static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	machine_check_poll(MCP_UC);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);

	/* data race harmless because everyone sets to the same value */
	if (!next_interval)
		next_interval = check_interval * HZ;
	if (!next_interval)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies_relative(jiffies + next_interval);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
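
/*
 * Illustrative userspace consumer (a sketch, not part of the kernel):
 * mcelog-style readers query the record and log sizes first and then issue
 * one full-sized read, because shorter reads fail with -EINVAL in
 * mce_read() above:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recl, logl;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recl);
 *	ioctl(fd, MCE_GET_LOG_LEN, &logl);
 *	void *buf = malloc(recl * logl);
 *	ssize_t n = read(fd, buf, recl * logl);
 *
 * The return value divided by recl gives the number of records read;
 * MCE_GETCLEAR_FLAGS can then be used to check for and clear the
 * MCE_OVERFLOW condition set in mce_log().
 */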

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);
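
/*
 * Examples of the boot parameters handled above (illustrative command
 * lines, not from this file):
 *
 *	mce=off		disable machine check handling entirely
 *	mce=2		set the tolerance level to 2
 *	mce=nobootlog	don't log MCEs left over from before boot
 *	nomce		shorthand with the same effect as mce=off
 */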

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	next_interval = check_interval * HZ;
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, \
				     struct sysdev_attribute *attr, \
				     char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, \
				    struct sysdev_attribute *attr, \
				    const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];
	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);
	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}

static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;
	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	if (*p) *p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};
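
/*
 * These attributes show up per CPU under the "machinecheck" sysdev class.
 * Illustrative usage (paths and trigger script name assumed from the class
 * name above, not taken from this file):
 *
 *	echo 2 > /sys/devices/system/machinecheck/machinecheck0/tolerant
 *	echo 300 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 *	echo /sbin/mce-handler > /sys/devices/system/machinecheck/machinecheck0/trigger
 *
 * Writing check_interval or a bankN control re-runs mce_restart() on all
 * CPUs; tolerant is simply read the next time an event is handled.
 */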

static cpumask_t mce_device_initialized = CPU_MASK_NONE;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpu_set(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	}
error:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpu_isset(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpu_clear(cpu, mce_device_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies_relative(jiffies + next_interval);
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];
		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;
	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);