/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
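/*
 * Note: "tolerant" can be changed at runtime; see mcheck_enable() below
 * (boot parameter "mce=<level>") and the sysfs attribute registered via
 * SYSDEV_INT_ATTR(tolerant, ...) further down.
 */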
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
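/*
 * Summary of the protocol above: a writer claims a free slot by advancing
 * mcelog.next with cmpxchg(), fills the entry, and only then sets
 * ->finished. Readers (mce_read() below) wait briefly on ->finished, so a
 * half-written entry is never copied out.
 */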
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}
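/*
 * mce_poll_banks starts out with all banks set. On CPUs with CMCI, the
 * Intel-specific code (mce_intel_64.c) clears the bits for banks it
 * delivers by interrupt instead, so the poller and CMCI never handle the
 * same bank twice.
 */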
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	int panicm_found = 0;
	u64 mcestart = 0;
	int i;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll.
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble. If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS. Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
out2:
	atomic_dec(&mce_entry);
}
EXPORT_SYMBOL_GPL(do_machine_check);
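/*
 * Decision summary for the handler above, as a function of "tolerant":
 *	no_way_out && tolerant < 3	-> panic
 *	kill_it, error in user space	-> SIGBUS to current
 *	kill_it, kernel context		-> panic if panic_on_oops || tolerant < 2
 *	tolerant == 3			-> log only, keep running
 */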
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
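/*
 * Illustrative caller (sketch; the real call site is the Intel thermal
 * interrupt handler): after reading MSR_IA32_THERM_STATUS and deciding
 * the event is worth logging, it does
 *	mce_log_therm_throt_event(msr_val);
 */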
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user()) {
		*n = max(*n/2, HZ/100);
	} else {
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + *n;
	add_timer(t);
}
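/*
 * Worked example for the adaptive interval (assuming HZ=1000 and the
 * default check_interval of 300s): each round that logs an event halves
 * the interval down to HZ/100 = 10ms; each quiet round doubles it back
 * up to the 300s ceiling.
 */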
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
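/*
 * MCG_CAP layout used above: bits 0-7 are the bank count, bit 9
 * (MCG_EXT_P) advertises extended registers and bits 16-23 their count;
 * the EIP MSR is trusted only when at least 9 extended registers exist.
 */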
static void mce_init(void *dummy)
{
	u64 cap;
	int i;
	mce_banks_t all_banks;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOSes that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}
static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;		/* #times opened */
static int open_exclu;		/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}
static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
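/*
 * Illustrative userspace consumer (sketch, not part of this file):
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	struct mce records[MCE_LOG_LEN];
 *	ssize_t n = read(fd, records, sizeof(records));
 *
 * The buffer must cover all MCE_LOG_LEN records or the read fails with
 * -EINVAL, per the "full reads only" check in mce_read() above.
 */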
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
		       str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);
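/*
 * Example command lines for the options above (illustrative):
 *	mce=off		disable machine checks entirely
 *	mce=2		set tolerant to 2
 *	mce=bootlog	force logging of pre-boot MCEs, even on AMD
 */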
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get re-added later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}
static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}
static struct sysdev_class mce_sysclass = {
	.suspend	= mce_suspend,
	.shutdown	= mce_shutdown,
	.resume		= mce_resume,
	.name		= "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf)						\
			return -EINVAL;					\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}
static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);

	if (end == buf)
		return -EINVAL;

	bank[attr - bank_attrs] = new;
	mce_restart();

	return end - buf;
}
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}
static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	if (p)
		*p = 0;
	return len;
}
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};

static cpumask_var_t mce_device_initialized;
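/*
 * Runtime tuning example (illustrative; the sysfs path is an assumption
 * based on the "machinecheck" sysdev class registered above):
 *	echo 30 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 * Writing check_interval or a bank%d control calls mce_restart() to
 * reprogram every CPU.
 */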
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);

	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpumask_clear_cpu(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}
static void mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;
	return -ENOMEM;
}
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);