cpumask: alloc zeroed cpumask for static cpumask_var_ts
[linux-2.6/mini2440.git] / arch/x86/kernel/cpu/mcheck/mce_64.c
blob 09dd1d414fc36bf40eb8218619298e76bf1ac443
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227

atomic_t mce_entry;
static int mce_dont_init;

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
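/*
 * For illustration: tolerant can be changed at runtime in two ways handled
 * in this file -- the "mce=" boot parameter parsed by mcheck_enable() below
 * (for example booting with "mce=2"), and the per-CPU sysfs attribute
 * registered via SYSDEV_INT_ATTR(tolerant, ...), expected to show up as
 * /sys/devices/system/machinecheck/machinecheck<N>/tolerant (path inferred
 * from the "machinecheck" sysdev class name).
 */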
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
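/*
 * A rough sketch of the protocol implemented above and consumed by
 * mce_read() below: each logger claims a slot by advancing mcelog.next with
 * cmpxchg(), fills the entry, and only then sets ->finished after a wmb();
 * the /dev/mcelog reader spins briefly on ->finished, copies the entry out,
 * and finally swaps mcelog.next back to 0 with cmpxchg() so the buffer can
 * be reused. No spinlock is taken on the logging side, which is what makes
 * it usable from machine check context.
 */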
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}
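/*
 * Within this file machine_check_poll() is driven from two places: the
 * periodic mcheck_timer() below, which passes MCP_TIMESTAMP and the per-CPU
 * mce_poll_banks mask, and mce_init(), which passes MCP_UC (plus MCP_DONTLOG
 * when bootlog is disabled) to sweep up events left over from before the
 * reset. On Intel CPUs the CMCI code in mce_intel_64.c presumably calls it
 * as well, but that path lives outside this file.
 */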
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll.
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble. If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS. Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
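/*
 * In short, the handler above makes two decisions per event: no_way_out is
 * set when the machine state cannot be trusted (RIPV clear, PCC set, or an
 * uncorrected error with OVER set at tolerant < 1) and leads to mce_panic()
 * unless tolerant >= 3; kill_it is set for enabled uncorrected errors and
 * leads to a SIGBUS when the fault can be pinned to user space, otherwise
 * to a panic when panic_on_oops is set or tolerant < 2.
 */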
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user()) {
		*n = max(*n/2, HZ/100);
	} else {
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + *n;
	add_timer(t);
}
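/*
 * A worked example of the adaptive interval, assuming HZ=1000 and the
 * default check_interval of 300 seconds: the timer starts at 300*HZ jiffies
 * (see mce_init_timer()), drops to 150s, 75s, ... each time mce_notify_user()
 * reports a logged event, bottoming out at HZ/100 = 10ms, and doubles back
 * toward the 300s ceiling on every quiet run.
 */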
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
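/*
 * For reference, the MCG_CAP fields consumed above: bits 7:0 give the bank
 * count, bit 9 (MCG_EXT_P in the SDM, if memory serves) announces the
 * extended state registers, and bits 23:16 give how many of them exist --
 * the EIP register used via rip_msr is only trusted when at least 9 are
 * reported. Bit 8 (MCG_CTL_P) is checked separately in mce_init() before
 * writing MCG_CTL.
 */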
static void mce_init(void *dummy)
{
	u64 cap;
	int i;
	mce_banks_t all_banks;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}
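/*
 * The clear_bit(10, &bank[4]) quirk above works by masking one error source
 * in the default MC4_CTL value that mce_init() later writes for bank 4 on
 * K8 (family 0xf) parts; the GART TLB walk errors it suppresses are, per
 * the comment, false positives seen with the IOMMU and certain 3ware and
 * Cerberus setups.
 */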
static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}
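/*
 * The ordering in mcheck_init() matters: capabilities and the bank[] table
 * are sized first (mce_cap_init), quirks then edit the bank defaults before
 * mce_init() writes them into the MCi_CTL MSRs, and only after the vendor
 * hooks run is the per-CPU polling timer armed.
 */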
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
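/*
 * Note on the userspace contract implied above: a reader of /dev/mcelog
 * (normally the mcelog daemon) must pass a buffer of at least
 * MCE_LOG_LEN * sizeof(struct mce) bytes, gets back whole struct mce
 * records, and the act of reading clears the kernel-side buffer; partial
 * reads are rejected with -EINVAL.
 */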
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
		       str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);
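/*
 * Example command lines for the legacy parameters handled above (kept for
 * compatibility; sysfs is the preferred interface): "nomce" or "mce=off"
 * disables machine checks entirely, "mce=2" sets tolerant=2, and
 * "mce=bootlog" / "mce=nobootlog" force logging of pre-boot errors on or
 * off (AMD systems default to off via mce_cpu_quirks()).
 */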
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}
static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};
DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, \
				     struct sysdev_attribute *attr, \
				     char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, \
				    struct sysdev_attribute *attr, \
				    const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
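/*
 * For readability, ACCESSOR(check_interval, check_interval, mce_restart())
 * below expands to a show_check_interval()/set_check_interval() pair plus a
 * 0644 sysdev attribute: reads print the value in hex, writes parse an
 * unsigned long with simple_strtoul() and then call mce_restart() so every
 * CPU re-arms its polling timer with the new interval.
 */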
static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];
	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);
	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	/* Strip a trailing newline; guard against strchr() returning NULL. */
	if (p)
		*p = 0;
	return len;
}
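/*
 * Typical use of the trigger attribute, assuming the usual sysdev layout
 * for the "machinecheck" class:
 *
 *   echo /usr/sbin/mcelog-notify > \
 *       /sys/devices/system/machinecheck/machinecheck0/trigger
 *
 * After that, mce_notify_user() schedules mce_trigger_work, which runs the
 * named program via call_usermodehelper() whenever new events are logged.
 * The helper path here is only an example, not something this file installs.
 */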
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};
static cpumask_var_t mce_device_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	}
error:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpumask_clear_cpu(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}
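/*
 * Summarizing the hotplug flow above: sysfs devices are created on ONLINE
 * and torn down on DEAD, the polling timer is stopped and machine checks
 * (plus CMCI) are disabled on the target CPU at DOWN_PREPARE, everything is
 * re-armed if DOWN_FAILED, and CMCI ownership is redistributed to the
 * surviving CPUs at POST_DEAD via cmci_rediscover().
 */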
static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);