[PATCH] i386 prepare nmi watchdog for dynticks
arch/i386/kernel/nmi.c
/*
 *  linux/arch/i386/nmi.c
 *
 *  NMI watchdog support on APIC systems
 *
 *  Started by Ingo Molnar <mingo@redhat.com>
 *
 *  Fixes:
 *  Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
 *  Mikael Pettersson : Power Management for local APIC NMI watchdog.
 *  Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
 *  Pavel Machek and
 *  Mikael Pettersson : PM converted to driver model. Disable/enable API.
 */
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
#include <linux/dmi.h>
#include <linux/kprobes.h>
#include <linux/cpumask.h>
#include <linux/kernel_stat.h>

#include <asm/smp.h>
#include <asm/nmi.h>
#include <asm/kdebug.h>
#include <asm/intel_arch_perfmon.h>

#include "mach_traps.h"
int unknown_nmi_panic;
int nmi_watchdog_enabled;
/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
 * evntsel_nmi_owner tracks the ownership of the event selection registers.
 * Different performance counters/event selections may be reserved by
 * different subsystems; this reservation system just tries to coordinate
 * things a little.
 */
static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);

static cpumask_t backtrace_mask = CPU_MASK_NONE;
/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
 */
#define NMI_MAX_COUNTER_BITS 66
/* nmi_active:
 * >0: the lapic NMI watchdog is active, but can be disabled
 * <0: the lapic NMI watchdog has not been set up, and cannot
 *     be enabled
 *  0: the lapic NMI watchdog is disabled, but can be enabled
 */
atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */

unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ;
struct nmi_watchdog_ctlblk {
        int enabled;
        u64 check_bit;
        unsigned int cccr_msr;
        unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
        unsigned int evntsel_msr;  /* the MSR to select the events to handle */
};
static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
/* local prototypes */
static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);

extern void show_registers(struct pt_regs *regs);
extern int unknown_nmi_panic;
/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
        /* returns the bit offset of the performance counter register */
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return (msr - MSR_K7_PERFCTR0);
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return (msr - MSR_ARCH_PERFMON_PERFCTR0);

                switch (boot_cpu_data.x86) {
                case 6:
                        return (msr - MSR_P6_PERFCTR0);
                case 15:
                        return (msr - MSR_P4_BPU_PERFCTR0);
                }
        }
        return 0;
}
/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
        /* returns the bit offset of the event selection register */
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return (msr - MSR_K7_EVNTSEL0);
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return (msr - MSR_ARCH_PERFMON_EVENTSEL0);

                switch (boot_cpu_data.x86) {
                case 6:
                        return (msr - MSR_P6_EVNTSEL0);
                case 15:
                        return (msr - MSR_P4_BSU_ESCR0);
                }
        }
        return 0;
}
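/*
 * Example of the mapping above (AMD K7/K8 case, purely illustrative):
 * the perfctr MSRs are numbered consecutively from MSR_K7_PERFCTR0, so
 * MSR_K7_PERFCTR2 maps to reservation bit 2; the event-select MSRs are
 * likewise consecutive from MSR_K7_EVNTSEL0.
 */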
/* checks for a bit availability (hack for oprofile) */
int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
{
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}
/* checks an msr for availability */
int avail_to_resrv_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
}
int reserve_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
                return 1;
        return 0;
}

void release_perfctr_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_perfctr_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
}
int reserve_evntsel_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_evntsel_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
                return 1;
        return 0;
}

void release_evntsel_nmi(unsigned int msr)
{
        unsigned int counter;

        counter = nmi_evntsel_msr_to_bit(msr);
        BUG_ON(counter > NMI_MAX_COUNTER_BITS);

        clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
}
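/*
 * Illustrative usage (not from this file): a subsystem such as oprofile
 * that wants to drive a counter itself is expected to pair these calls,
 * treating a zero return from reserve_*() as "already owned":
 *
 *      if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0))
 *              return -EBUSY;
 *      if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) {
 *              release_perfctr_nmi(MSR_K7_PERFCTR0);
 *              return -EBUSY;
 *      }
 *      ... program and use the counter ...
 *      release_evntsel_nmi(MSR_K7_EVNTSEL0);
 *      release_perfctr_nmi(MSR_K7_PERFCTR0);
 *
 * The reservation only coordinates ownership of the MSRs between
 * subsystems; it does not touch the hardware itself.
 */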
static __cpuinit inline int nmi_known_cpu(void)
{
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_AMD:
                return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
                        || (boot_cpu_data.x86 == 16));
        case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                        return 1;
                else
                        return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
        }
        return 0;
}
static int endflag __initdata = 0;

#ifdef CONFIG_SMP
/* The performance counters used by NMI_LOCAL_APIC don't trigger when
 * the CPU is idle. To make sure the NMI watchdog really ticks on all
 * CPUs during the test, make them busy.
 */
static __init void nmi_cpu_busy(void *data)
{
        local_irq_enable_in_hardirq();
        /* Intentionally don't use cpu_relax here. This is
           to make sure that the performance counter really ticks,
           even if there is a simulator or similar that catches the
           pause instruction. On a real HT machine this is fine because
           all other CPUs are busy with "useless" delay loops and don't
           care if they get somewhat fewer cycles. */
        while (endflag == 0)
                mb();
}
#endif
static unsigned int adjust_for_32bit_ctr(unsigned int hz)
{
        u64 counter_val;
        unsigned int retval = hz;

        /*
         * On Intel CPUs with P6/ARCH_PERFMON only 32 bits of the counter
         * are writable, with the higher bits sign-extended from bit 31.
         * So we can only program the counter with 31-bit values; bit 31
         * must be 1 so that the upper bits sign-extend to 1 as well.
         * Find the appropriate nmi_hz.
         */
        counter_val = (u64)cpu_khz * 1000;
        do_div(counter_val, retval);
        if (counter_val > 0x7fffffffULL) {
                u64 count = (u64)cpu_khz * 1000;
                do_div(count, 0x7fffffffUL);
                retval = count + 1;
        }
        return retval;
}
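/*
 * Worked example (assuming a hypothetical 3 GHz CPU, i.e. cpu_khz ==
 * 3000000, and hz == 1): counter_val = 3000000 * 1000 / 1 = 3e9, which
 * exceeds 0x7fffffff (about 2.147e9), so the rate is raised:
 * retval = 3e9 / 0x7fffffff + 1 = 2, i.e. the watchdog must fire at
 * least twice per second to stay within the 31-bit programmable range.
 */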
static int __init check_nmi_watchdog(void)
{
        unsigned int *prev_nmi_count;
        int cpu;

        /* Enable NMI watchdog for newer systems.
           Probably safe on most older systems too, but let's be careful.
           IBM ThinkPads use INT10 inside SMM and that allows early NMI inside SMM
           which hangs the system. Disable watchdog for all thinkpads */
        if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 &&
            !dmi_name_in_vendors("ThinkPad"))
                nmi_watchdog = NMI_LOCAL_APIC;

        if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
                return 0;

        if (!atomic_read(&nmi_active))
                return 0;

        prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
        if (!prev_nmi_count)
                return -1;

        printk(KERN_INFO "Testing NMI watchdog ... ");

        if (nmi_watchdog == NMI_LOCAL_APIC)
                smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);

        for_each_possible_cpu(cpu)
                prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
        local_irq_enable();
        mdelay((10*1000)/nmi_hz); // wait 10 ticks

        for_each_possible_cpu(cpu) {
#ifdef CONFIG_SMP
                /* Check cpu_callin_map here because that is set
                   after the timer is started. */
                if (!cpu_isset(cpu, cpu_callin_map))
                        continue;
#endif
                if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
                        continue;
                if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
                        printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
                                cpu,
                                prev_nmi_count[cpu],
                                nmi_count(cpu));
                        per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
                        atomic_dec(&nmi_active);
                }
        }
        if (!atomic_read(&nmi_active)) {
                kfree(prev_nmi_count);
                atomic_set(&nmi_active, -1);
                return -1;
        }
        endflag = 1;
        printk("OK.\n");

        /* now that we know it works we can reduce NMI frequency to
           something more reasonable; makes a difference in some configs */
        if (nmi_watchdog == NMI_LOCAL_APIC) {
                struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

                nmi_hz = 1;

                if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
                    wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
                        nmi_hz = adjust_for_32bit_ctr(nmi_hz);
                }
        }

        kfree(prev_nmi_count);
        return 0;
}
/* This needs to happen later in boot so counters are working */
late_initcall(check_nmi_watchdog);
static int __init setup_nmi_watchdog(char *str)
{
        int nmi;

        get_option(&str, &nmi);

        if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
                return 0;

        nmi_watchdog = nmi;
        return 1;
}

__setup("nmi_watchdog=", setup_nmi_watchdog);
static void disable_lapic_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

        if (atomic_read(&nmi_active) <= 0)
                return;

        on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

        BUG_ON(atomic_read(&nmi_active) != 0);
}

static void enable_lapic_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

        /* are we already enabled */
        if (atomic_read(&nmi_active) != 0)
                return;

        /* are we lapic aware */
        if (nmi_known_cpu() <= 0)
                return;

        on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
        touch_nmi_watchdog();
}
void disable_timer_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_IO_APIC);

        if (atomic_read(&nmi_active) <= 0)
                return;

        disable_irq(0);
        on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);

        BUG_ON(atomic_read(&nmi_active) != 0);
}

void enable_timer_nmi_watchdog(void)
{
        BUG_ON(nmi_watchdog != NMI_IO_APIC);

        if (atomic_read(&nmi_active) == 0) {
                touch_nmi_watchdog();
                on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
                enable_irq(0);
        }
}
static void __acpi_nmi_disable(void *__unused)
{
        apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
}

/*
 * Disable timer based NMIs on all CPUs:
 */
void acpi_nmi_disable(void)
{
        if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
                on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
}

static void __acpi_nmi_enable(void *__unused)
{
        apic_write_around(APIC_LVT0, APIC_DM_NMI);
}

/*
 * Enable timer based NMIs on all CPUs:
 */
void acpi_nmi_enable(void)
{
        if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
                on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
}
#ifdef CONFIG_PM

static int nmi_pm_active; /* nmi_active before suspend */

static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
        /* only CPU0 goes here, other CPUs should be offline */
        nmi_pm_active = atomic_read(&nmi_active);
        stop_apic_nmi_watchdog(NULL);
        BUG_ON(atomic_read(&nmi_active) != 0);
        return 0;
}

static int lapic_nmi_resume(struct sys_device *dev)
{
        /* only CPU0 goes here, other CPUs should be offline */
        if (nmi_pm_active > 0) {
                setup_apic_nmi_watchdog(NULL);
                touch_nmi_watchdog();
        }
        return 0;
}

static struct sysdev_class nmi_sysclass = {
        set_kset_name("lapic_nmi"),
        .resume         = lapic_nmi_resume,
        .suspend        = lapic_nmi_suspend,
};

static struct sys_device device_lapic_nmi = {
        .id     = 0,
        .cls    = &nmi_sysclass,
};

static int __init init_lapic_nmi_sysfs(void)
{
        int error;

        /* should really be a BUG_ON but b/c this is an
         * init call, it just doesn't work.  -dcz
         */
        if (nmi_watchdog != NMI_LOCAL_APIC)
                return 0;

        if (atomic_read(&nmi_active) < 0)
                return 0;

        error = sysdev_class_register(&nmi_sysclass);
        if (!error)
                error = sysdev_register(&device_lapic_nmi);
        return error;
}
/* must come after the local APIC's device_initcall() */
late_initcall(init_lapic_nmi_sysfs);

#endif  /* CONFIG_PM */
/*
 * Activate the NMI watchdog via the local APIC.
 * Original code written by Keith Owens.
 */

static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
{
        u64 count = (u64)cpu_khz * 1000;

        do_div(count, nmi_hz);
        if (descr)
                Dprintk("setting %s to -0x%08Lx\n", descr, count);
        wrmsrl(perfctr_msr, 0 - count);
}

static void write_watchdog_counter32(unsigned int perfctr_msr,
                const char *descr)
{
        u64 count = (u64)cpu_khz * 1000;

        do_div(count, nmi_hz);
        if (descr)
                Dprintk("setting %s to -0x%08Lx\n", descr, count);
        wrmsr(perfctr_msr, (u32)(-count), 0);
}
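/*
 * Worked example (hypothetical 2 GHz CPU, cpu_khz == 2000000, nmi_hz == 1):
 * count = 2000000 * 1000 / 1 = 2e9, and the counter is programmed to
 * -count, so it overflows (and raises the watchdog NMI) after roughly
 * 2e9 counted events, i.e. about once per second of unhalted cycles.
 */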
/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define K7_EVNTSEL_ENABLE       (1 << 22)
#define K7_EVNTSEL_INT          (1 << 20)
#define K7_EVNTSEL_OS           (1 << 17)
#define K7_EVNTSEL_USR          (1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING    0x76
#define K7_NMI_EVENT            K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING

static int setup_k7_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        perfctr_msr = MSR_K7_PERFCTR0;
        evntsel_msr = MSR_K7_EVNTSEL0;
        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = K7_EVNTSEL_INT
                | K7_EVNTSEL_OS
                | K7_EVNTSEL_USR
                | K7_NMI_EVENT;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= K7_EVNTSEL_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  //unused
        wd->check_bit = 1ULL<<63;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}
static void stop_k7_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}
#define P6_EVNTSEL0_ENABLE      (1 << 22)
#define P6_EVNTSEL_INT          (1 << 20)
#define P6_EVNTSEL_OS           (1 << 17)
#define P6_EVNTSEL_USR          (1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED  0x79
#define P6_NMI_EVENT            P6_EVENT_CPU_CLOCKS_NOT_HALTED

static int setup_p6_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        perfctr_msr = MSR_P6_PERFCTR0;
        evntsel_msr = MSR_P6_EVNTSEL0;
        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = P6_EVNTSEL_INT
                | P6_EVNTSEL_OS
                | P6_EVNTSEL_USR
                | P6_NMI_EVENT;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        nmi_hz = adjust_for_32bit_ctr(nmi_hz);
        write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= P6_EVNTSEL0_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  //unused
        wd->check_bit = 1ULL<<39;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}
static void stop_p6_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}
/* Note that these events don't tick when the CPU idles. This means
   the frequency varies with CPU load. */

#define MSR_P4_MISC_ENABLE_PERF_AVAIL   (1<<7)
#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
#define P4_ESCR_OS              (1<<3)
#define P4_ESCR_USR             (1<<2)
#define P4_CCCR_OVF_PMI0        (1<<26)
#define P4_CCCR_OVF_PMI1        (1<<27)
#define P4_CCCR_THRESHOLD(N)    ((N)<<20)
#define P4_CCCR_COMPLEMENT      (1<<19)
#define P4_CCCR_COMPARE         (1<<18)
#define P4_CCCR_REQUIRED        (3<<16)
#define P4_CCCR_ESCR_SELECT(N)  ((N)<<13)
#define P4_CCCR_ENABLE          (1<<12)
#define P4_CCCR_OVF             (1<<31)

/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
   CRU_ESCR0 (with any non-null event selector) through a complemented
   max threshold. [IA32-Vol3, Section 14.9.9] */
static int setup_p4_watchdog(void)
{
        unsigned int perfctr_msr, evntsel_msr, cccr_msr;
        unsigned int evntsel, cccr_val;
        unsigned int misc_enable, dummy;
        unsigned int ht_num;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
        if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
                return 0;

#ifdef CONFIG_SMP
        /* detect which hyperthread we are on */
        if (smp_num_siblings == 2) {
                unsigned int ebx, apicid;

                ebx = cpuid_ebx(1);
                apicid = (ebx >> 24) & 0xff;
                ht_num = apicid & 1;
        } else
#endif
                ht_num = 0;

        /* performance counters are shared resources
         * assign each hyperthread its own set
         * (re-use the ESCR0 register, seems safe
         * and keeps the cccr_val the same)
         */
        if (!ht_num) {
                /* logical cpu 0 */
                perfctr_msr = MSR_P4_IQ_PERFCTR0;
                evntsel_msr = MSR_P4_CRU_ESCR0;
                cccr_msr = MSR_P4_IQ_CCCR0;
                cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
        } else {
                /* logical cpu 1 */
                perfctr_msr = MSR_P4_IQ_PERFCTR1;
                evntsel_msr = MSR_P4_CRU_ESCR0;
                cccr_msr = MSR_P4_IQ_CCCR1;
                cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
        }

        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        evntsel = P4_ESCR_EVENT_SELECT(0x3F)
                | P4_ESCR_OS
                | P4_ESCR_USR;

        cccr_val |= P4_CCCR_THRESHOLD(15)
                | P4_CCCR_COMPLEMENT
                | P4_CCCR_COMPARE
                | P4_CCCR_REQUIRED;

        wrmsr(evntsel_msr, evntsel, 0);
        wrmsr(cccr_msr, cccr_val, 0);
        write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        cccr_val |= P4_CCCR_ENABLE;
        wrmsr(cccr_msr, cccr_val, 0);
        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = cccr_msr;
        wd->check_bit = 1ULL<<39;
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}
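/*
 * Why this behaves like a clock (sketch, based on the CCCR filtering
 * description referenced above): with P4_CCCR_COMPARE and
 * P4_CCCR_COMPLEMENT set and a threshold of 15, IQ_COUNTER0 increments
 * on every cycle in which the ESCR event count is at most 15, which in
 * practice holds on every cycle, so the counter advances once per cycle
 * and overflows after the period programmed by write_watchdog_counter().
 */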
static void stop_p4_watchdog(void)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        wrmsr(wd->cccr_msr, 0, 0);
        wrmsr(wd->evntsel_msr, 0, 0);

        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}
#define ARCH_PERFMON_NMI_EVENT_SEL      ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
#define ARCH_PERFMON_NMI_EVENT_UMASK    ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK

static int setup_intel_arch_watchdog(void)
{
        unsigned int ebx;
        union cpuid10_eax eax;
        unsigned int unused;
        unsigned int perfctr_msr, evntsel_msr;
        unsigned int evntsel;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /*
         * Check whether the Architectural PerfMon supports
         * Unhalted Core Cycles Event or not.
         * NOTE: Corresponding bit = 0 in ebx indicates event present.
         */
        cpuid(10, &(eax.full), &ebx, &unused, &unused);
        if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
            (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
                goto fail;

        perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
        evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;

        if (!reserve_perfctr_nmi(perfctr_msr))
                goto fail;

        if (!reserve_evntsel_nmi(evntsel_msr))
                goto fail1;

        wrmsrl(perfctr_msr, 0UL);

        evntsel = ARCH_PERFMON_EVENTSEL_INT
                | ARCH_PERFMON_EVENTSEL_OS
                | ARCH_PERFMON_EVENTSEL_USR
                | ARCH_PERFMON_NMI_EVENT_SEL
                | ARCH_PERFMON_NMI_EVENT_UMASK;

        /* setup the timer */
        wrmsr(evntsel_msr, evntsel, 0);
        nmi_hz = adjust_for_32bit_ctr(nmi_hz);
        write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
        apic_write(APIC_LVTPC, APIC_DM_NMI);
        evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
        wrmsr(evntsel_msr, evntsel, 0);

        wd->perfctr_msr = perfctr_msr;
        wd->evntsel_msr = evntsel_msr;
        wd->cccr_msr = 0;  //unused
        wd->check_bit = 1ULL << (eax.split.bit_width - 1);
        return 1;
fail1:
        release_perfctr_nmi(perfctr_msr);
fail:
        return 0;
}
static void stop_intel_arch_watchdog(void)
{
        unsigned int ebx;
        union cpuid10_eax eax;
        unsigned int unused;
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /*
         * Check whether the Architectural PerfMon supports
         * Unhalted Core Cycles Event or not.
         * NOTE: Corresponding bit = 0 in ebx indicates event present.
         */
        cpuid(10, &(eax.full), &ebx, &unused, &unused);
        if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
            (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
                return;

        wrmsr(wd->evntsel_msr, 0, 0);
        release_evntsel_nmi(wd->evntsel_msr);
        release_perfctr_nmi(wd->perfctr_msr);
}
void setup_apic_nmi_watchdog(void *unused)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /* only support LOCAL and IO APICs for now */
        if ((nmi_watchdog != NMI_LOCAL_APIC) &&
            (nmi_watchdog != NMI_IO_APIC))
                return;

        if (wd->enabled == 1)
                return;

        /* cheap hack to support suspend/resume */
        /* if cpu0 is not active neither should the other cpus */
        if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
                return;

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                switch (boot_cpu_data.x86_vendor) {
                case X86_VENDOR_AMD:
                        if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
                            boot_cpu_data.x86 != 16)
                                return;
                        if (!setup_k7_watchdog())
                                return;
                        break;
                case X86_VENDOR_INTEL:
                        if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
                                if (!setup_intel_arch_watchdog())
                                        return;
                                break;
                        }
                        switch (boot_cpu_data.x86) {
                        case 6:
                                if (boot_cpu_data.x86_model > 0xd)
                                        return;

                                if (!setup_p6_watchdog())
                                        return;
                                break;
                        case 15:
                                if (boot_cpu_data.x86_model > 0x4)
                                        return;

                                if (!setup_p4_watchdog())
                                        return;
                                break;
                        default:
                                return;
                        }
                        break;
                default:
                        return;
                }
        }
        wd->enabled = 1;
        atomic_inc(&nmi_active);
}
void stop_apic_nmi_watchdog(void *unused)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        /* only support LOCAL and IO APICs for now */
        if ((nmi_watchdog != NMI_LOCAL_APIC) &&
            (nmi_watchdog != NMI_IO_APIC))
                return;

        if (wd->enabled == 0)
                return;

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                switch (boot_cpu_data.x86_vendor) {
                case X86_VENDOR_AMD:
                        stop_k7_watchdog();
                        break;
                case X86_VENDOR_INTEL:
                        if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
                                stop_intel_arch_watchdog();
                                break;
                        }
                        switch (boot_cpu_data.x86) {
                        case 6:
                                if (boot_cpu_data.x86_model > 0xd)
                                        break;
                                stop_p6_watchdog();
                                break;
                        case 15:
                                if (boot_cpu_data.x86_model > 0x4)
                                        break;
                                stop_p4_watchdog();
                                break;
                        }
                        break;
                default:
                        return;
                }
        }
        wd->enabled = 0;
        atomic_dec(&nmi_active);
}
/*
 * The best way to detect whether a CPU has a 'hard lockup' problem
 * is to check its local APIC timer IRQ counts. If they are not
 * changing then that CPU has some problem.
 *
 * As these watchdog NMI IRQs are generated on every CPU, we only
 * have to check the current processor.
 *
 * Since NMIs don't listen to _any_ locks, we have to be extremely
 * careful not to rely on unsafe variables. The printk might lock
 * up though, so we have to break up any console locks first ...
 * [when there will be more tty-related locks, break them up
 *  here too!]
 */

static unsigned int
        last_irq_sums[NR_CPUS],
        alert_counter[NR_CPUS];
void touch_nmi_watchdog(void)
{
        if (nmi_watchdog > 0) {
                unsigned cpu;

                /*
                 * Just reset the alert counters (other CPUs might be
                 * spinning on locks we hold):
                 */
                for_each_present_cpu(cpu)
                        alert_counter[cpu] = 0;
        }

        /*
         * Tickle the softlockup detector too:
         */
        touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
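/*
 * Illustrative caller (not from this file): code that legitimately runs
 * for a long time with interrupts off is expected to poke the watchdog
 * from inside its loop so alert_counter never reaches 5*nmi_hz, e.g.
 *
 *      while (!controller_ready()) {
 *              touch_nmi_watchdog();
 *              cpu_relax();
 *      }
 *
 * where controller_ready() stands in for whatever condition the caller
 * is actually waiting on.
 */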
extern void die_nmi(struct pt_regs *, const char *msg);

__kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
{
        /*
         * Since current_thread_info()-> is always on the stack, and we
         * always switch the stack NMI-atomically, it's safe to use
         * smp_processor_id().
         */
        unsigned int sum;
        int touched = 0;
        int cpu = smp_processor_id();
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
        u64 dummy;
        int rc = 0;

        /* check for other users first */
        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
                        == NOTIFY_STOP) {
                rc = 1;
                touched = 1;
        }

        if (cpu_isset(cpu, backtrace_mask)) {
                static DEFINE_SPINLOCK(lock);   /* Serialise the printks */

                spin_lock(&lock);
                printk("NMI backtrace for cpu %d\n", cpu);
                dump_stack();
                spin_unlock(&lock);
                cpu_clear(cpu, backtrace_mask);
        }

        /*
         * Take the local apic timer and PIT/HPET into account. We don't
         * know which one is active when highres/dynticks is on.
         */
        sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);

        /* if none of the timers is firing, this cpu isn't doing much */
        if (!touched && last_irq_sums[cpu] == sum) {
                /*
                 * Ayiee, looks like this CPU is stuck ...
                 * wait a few IRQs (5 seconds) before doing the oops ...
                 */
                alert_counter[cpu]++;
                if (alert_counter[cpu] == 5*nmi_hz)
                        /*
                         * die_nmi will return ONLY if NOTIFY_STOP happens..
                         */
                        die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
        } else {
                last_irq_sums[cpu] = sum;
                alert_counter[cpu] = 0;
        }
        /* see if the nmi watchdog went off */
        if (wd->enabled) {
                if (nmi_watchdog == NMI_LOCAL_APIC) {
                        rdmsrl(wd->perfctr_msr, dummy);
                        if (dummy & wd->check_bit) {
                                /* this wasn't a watchdog timer interrupt */
                                goto done;
                        }

                        /* only Intel P4 uses the cccr msr */
                        if (wd->cccr_msr != 0) {
                                /*
                                 * P4 quirks:
                                 * - An overflown perfctr will assert its interrupt
                                 *   until the OVF flag in its CCCR is cleared.
                                 * - LVTPC is masked on interrupt and must be
                                 *   unmasked by the LVTPC handler.
                                 */
                                rdmsrl(wd->cccr_msr, dummy);
                                dummy &= ~P4_CCCR_OVF;
                                wrmsrl(wd->cccr_msr, dummy);
                                apic_write(APIC_LVTPC, APIC_DM_NMI);
                                /* start the cycle over again */
                                write_watchdog_counter(wd->perfctr_msr, NULL);
                        } else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
                                   wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
                                /* P6-based Pentium M needs to re-unmask
                                 * the apic vector, but it doesn't hurt
                                 * other P6 variants.
                                 * ArchPerfmon/Core Duo also needs this */
                                apic_write(APIC_LVTPC, APIC_DM_NMI);
                                /* P6/ARCH_PERFMON has 32 bit counter write */
                                write_watchdog_counter32(wd->perfctr_msr, NULL);
                        } else {
                                /* start the cycle over again */
                                write_watchdog_counter(wd->perfctr_msr, NULL);
                        }
                        rc = 1;
                } else if (nmi_watchdog == NMI_IO_APIC) {
                        /* don't know how to accurately check for this.
                         * just assume it was a watchdog timer interrupt.
                         * This matches the old behaviour.
                         */
                        rc = 1;
                }
        }
done:
        return rc;
}
int do_nmi_callback(struct pt_regs *regs, int cpu)
{
#ifdef CONFIG_SYSCTL
        if (unknown_nmi_panic)
                return unknown_nmi_panic_callback(regs, cpu);
#endif
        return 0;
}
#ifdef CONFIG_SYSCTL

static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
{
        unsigned char reason = get_nmi_reason();
        char buf[64];

        sprintf(buf, "NMI received for unknown reason %02x\n", reason);
        die_nmi(regs, buf);
        return 0;
}

/*
 * proc handler for /proc/sys/kernel/nmi
 */
int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
                        void __user *buffer, size_t *length, loff_t *ppos)
{
        int old_state;

        nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
        old_state = nmi_watchdog_enabled;
        proc_dointvec(table, write, file, buffer, length, ppos);
        if (!!old_state == !!nmi_watchdog_enabled)
                return 0;

        if (atomic_read(&nmi_active) < 0) {
                printk(KERN_WARNING "NMI watchdog is permanently disabled\n");
                return -EIO;
        }

        if (nmi_watchdog == NMI_DEFAULT) {
                if (nmi_known_cpu() > 0)
                        nmi_watchdog = NMI_LOCAL_APIC;
                else
                        nmi_watchdog = NMI_IO_APIC;
        }

        if (nmi_watchdog == NMI_LOCAL_APIC) {
                if (nmi_watchdog_enabled)
                        enable_lapic_nmi_watchdog();
                else
                        disable_lapic_nmi_watchdog();
        } else {
                printk(KERN_WARNING
                        "NMI watchdog doesn't know what hardware to touch\n");
                return -EIO;
        }
        return 0;
}

#endif
void __trigger_all_cpu_backtrace(void)
{
        int i;

        backtrace_mask = cpu_online_map;
        /* Wait for up to 10 seconds for all CPUs to do the backtrace */
        for (i = 0; i < 10 * 1000; i++) {
                if (cpus_empty(backtrace_mask))
                        break;
                mdelay(1);
        }
}
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
EXPORT_SYMBOL(reserve_perfctr_nmi);
EXPORT_SYMBOL(release_perfctr_nmi);
EXPORT_SYMBOL(reserve_evntsel_nmi);
EXPORT_SYMBOL(release_evntsel_nmi);
EXPORT_SYMBOL(disable_timer_nmi_watchdog);
EXPORT_SYMBOL(enable_timer_nmi_watchdog);