arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10 #include <linux/thread_info.h>
  11 #include <linux/capability.h>
  12 #include <linux/miscdevice.h>
  13 #include <linux/ratelimit.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/rcupdate.h>
  16 #include <linux/kobject.h>
  17 #include <linux/uaccess.h>
  18 #include <linux/kdebug.h>
  19 #include <linux/kernel.h>
  20 #include <linux/percpu.h>
  21 #include <linux/string.h>
  22 #include <linux/sysdev.h>
  23 #include <linux/ctype.h>
  24 #include <linux/sched.h>
  25 #include <linux/sysfs.h>
  26 #include <linux/types.h>
  27 #include <linux/init.h>
  28 #include <linux/kmod.h>
  29 #include <linux/poll.h>
  30 #include <linux/cpu.h>
  31 #include <linux/smp.h>
  32 #include <linux/fs.h>
  33
  34 #include <asm/processor.h>
  35 #include <asm/idle.h>
  36 #include <asm/mce.h>
  37 #include <asm/msr.h>
  38
  39 #include "mce.h"
  40
  41 /* Handle unconfigured int18 (should never happen) */
  42 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
  43 {
  44         printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
  45                smp_processor_id());
  46 }
  47
   48 /*
       * Pointer to the machine check handler installed for this CPU setup.
       * It starts out as unexpected_machine_check; mcheck_init() points it at
       * do_machine_check once machine checks have been configured.
       */
   49 void (*machine_check_vector)(struct pt_regs *, long error_code) =
   50                                                 unexpected_machine_check;
  51
   52 int                             mce_disabled;
      /*
       * ^ When non-zero, mce_available() reports no support and mcheck_init()
       *   returns without doing anything.
       */
   53
   54 #ifdef CONFIG_X86_NEW_MCE
   55
   56 #define MISC_MCELOG_MINOR       227
   57
      /* Incremented on entry to do_machine_check() and decremented on exit. */
   58 atomic_t mce_entry;
   59
      /* Per-CPU count of do_machine_check() invocations. */
   60 DEFINE_PER_CPU(unsigned, mce_exception_count);
   61
   62 /*
   63  * Tolerant levels:
   64  *   0: always panic on uncorrected errors, log corrected errors
   65  *   1: panic or SIGBUS on uncorrected errors, log corrected errors
   66  *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
   67  *   3: never panic or SIGBUS, log all errors (for testing only)
   68  */
   69 static int                      tolerant = 1;
      /* Number of banks in use; set by mce_cap_init(), at most MAX_NR_BANKS. */
   70 static int                      banks;
      /* Per-bank control values; mce_init() writes bank[i] to bank i's CTL MSR. */
   71 static u64                      *bank;
      /* Bit 0 is set by mce_log() and consumed by mce_notify_user(). */
   72 static unsigned long            notify_user;
      /* MSR that holds an accurate RIP, or 0 when mce_cap_init() found none. */
   73 static int                      rip_msr;
      /*
       * 0 makes mce_init() clear events left over from before the reset without
       * logging them. Negative means "not decided yet"; mce_cpu_quirks() may
       * turn that into 0.
       */
   74 static int                      mce_bootlog = -1;
   75
      /* Path of the user mode helper run by mce_do_trigger(); empty = none. */
   76 static char                     trigger[128];
   77 static char                     *trigger_argv[2] = { trigger, NULL };
   78
      /* Bitmask of banks that mce_init() must not program (see skip_bank_init()). */
   79 static unsigned long            dont_init_banks;
   80
      /* Woken by mce_notify_user(); mce_poll() waits on it. */
   81 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
   82
   83 /* MCA banks polled by the periodic polling timer for corrected events */
   84 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
   85         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
   86 };
  87
  88 static inline int skip_bank_init(int i)
  89 {
  90         return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
  91 }
  92
  93 /* Do initial initialization of a struct mce */
  94 void mce_setup(struct mce *m)
  95 {
  96         memset(m, 0, sizeof(struct mce));
  97         m->cpu = smp_processor_id();
  98         rdtscll(m->tsc);
  99 }
 100
  101 DEFINE_PER_CPU(struct mce, injectm);
  102 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
      /*
       * injectm is the per-CPU record used for software error injection: while
       * its 'finished' field is set, mce_rdmsrl()/mce_wrmsrl() redirect MSR
       * accesses to the fields of this record.
       */
  103
  104 /*
  105  * Lockless MCE logging infrastructure.
  106  * This avoids deadlocks on printk locks without having to break locks. Also
  107  * separate MCEs from kernel messages to avoid bogus bug reports.
  108  */
  109
      /* The log buffer itself; written by mce_log(), drained by mce_read(). */
  110 static struct mce_log mcelog = {
  111         MCE_LOG_SIGNATURE,
  112         MCE_LOG_LEN,
  113 };
 114
  115 void mce_log(struct mce *mce)
  116 {
              /*
               * Append a copy of @mce to mcelog without taking any lock.
               *
               * A slot is reserved by advancing mcelog.next with cmpxchg; the
               * slot's 'finished' flag is set only after the record has been
               * copied in. If no slot is free, MCE_OVERFLOW is set in
               * mcelog.flags and the record is dropped. On success bit 0 of
               * notify_user is set.
               *
               * Note that @mce->finished is cleared in the caller's record.
               */
  117         unsigned next, entry;
  118
  119         mce->finished = 0;
  120         wmb();
  121         for (;;) {
  122                 entry = rcu_dereference(mcelog.next);
  123                 for (;;) {
  124                         /*
  125                          * When the buffer fills up discard new entries.
  126                          * Assume that the earlier errors are the more
  127                          * interesting ones:
  128                          */
  129                         if (entry >= MCE_LOG_LEN) {
  130                                 set_bit(MCE_OVERFLOW,
  131                                         (unsigned long *)&mcelog.flags);
  132                                 return;
  133                         }
  134                         /* Old left over entry. Skip: */
  135                         if (mcelog.entry[entry].finished) {
  136                                 entry++;
  137                                 continue;
  138                         }
  139                         break;
  140                 }
  141                 smp_rmb();
  142                 next = entry + 1;
                      /* Lost the race for this slot if next moved; start over. */
  143                 if (cmpxchg(&mcelog.next, entry, next) == entry)
  144                         break;
  145         }
  146         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
  147         wmb();
              /* Mark the record complete only after its contents are written. */
  148         mcelog.entry[entry].finished = 1;
  149         wmb();
  150
  151         set_bit(0, &notify_user);
  152 }
 153
 154 static void print_mce(struct mce *m)
 155 {
 156         printk(KERN_EMERG "\n"
 157                KERN_EMERG "HARDWARE ERROR\n"
 158                KERN_EMERG
 159                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 160                m->cpu, m->mcgstatus, m->bank, m->status);
 161         if (m->ip) {
 162                 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
 163                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 164                        m->cs, m->ip);
 165                 if (m->cs == __KERNEL_CS)
 166                         print_symbol("{%s}", m->ip);
 167                 printk("\n");
 168         }
 169         printk(KERN_EMERG "TSC %llx ", m->tsc);
 170         if (m->addr)
 171                 printk("ADDR %llx ", m->addr);
 172         if (m->misc)
 173                 printk("MISC %llx ", m->misc);
 174         printk("\n");
 175         printk(KERN_EMERG "This is not a software problem!\n");
 176         printk(KERN_EMERG "Run through mcelog --ascii to decode "
 177                "and contact your hardware vendor\n");
 178 }
 179
 180 static void mce_panic(char *msg, struct mce *backup, u64 start)
 181 {
 182         int i;
 183
 184         bust_spinlocks(1);
 185         console_verbose();
 186         for (i = 0; i < MCE_LOG_LEN; i++) {
 187                 u64 tsc = mcelog.entry[i].tsc;
 188
 189                 if ((s64)(tsc - start) < 0)
 190                         continue;
 191                 print_mce(&mcelog.entry[i]);
 192                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 193                         backup = NULL;
 194         }
 195         if (backup)
 196                 print_mce(backup);
 197         panic(msg);
 198 }
 199
 200 /* Support code for software error injection */
 201
 202 static int msr_to_offset(u32 msr)
 203 {
 204         unsigned bank = __get_cpu_var(injectm.bank);
 205         if (msr == rip_msr)
 206                 return offsetof(struct mce, ip);
 207         if (msr == MSR_IA32_MC0_STATUS + bank*4)
 208                 return offsetof(struct mce, status);
 209         if (msr == MSR_IA32_MC0_ADDR + bank*4)
 210                 return offsetof(struct mce, addr);
 211         if (msr == MSR_IA32_MC0_MISC + bank*4)
 212                 return offsetof(struct mce, misc);
 213         if (msr == MSR_IA32_MCG_STATUS)
 214                 return offsetof(struct mce, mcgstatus);
 215         return -1;
 216 }
 217
 218 /* MSR access wrappers used for error injection */
 219 static u64 mce_rdmsrl(u32 msr)
 220 {
 221         u64 v;
 222         if (__get_cpu_var(injectm).finished) {
 223                 int offset = msr_to_offset(msr);
 224                 if (offset < 0)
 225                         return 0;
 226                 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
 227         }
 228         rdmsrl(msr, v);
 229         return v;
 230 }
 231
 232 static void mce_wrmsrl(u32 msr, u64 v)
 233 {
 234         if (__get_cpu_var(injectm).finished) {
 235                 int offset = msr_to_offset(msr);
 236                 if (offset >= 0)
 237                         *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
 238                 return;
 239         }
 240         wrmsrl(msr, v);
 241 }
 242
 243 int mce_available(struct cpuinfo_x86 *c)
 244 {
 245         if (mce_disabled)
 246                 return 0;
 247         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 248 }
 249
 250 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 251 {
 252         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 253                 m->ip = regs->ip;
 254                 m->cs = regs->cs;
 255         } else {
 256                 m->ip = 0;
 257                 m->cs = 0;
 258         }
 259         if (rip_msr) {
 260                 /* Assume the RIP in the MSR is exact. Is this true? */
 261                 m->mcgstatus |= MCG_STATUS_EIPV;
 262                 m->ip = mce_rdmsrl(rip_msr);
 263                 m->cs = 0;
 264         }
 265 }
 266
 267 DEFINE_PER_CPU(unsigned, mce_poll_count);
 268
 269 /*
 270  * Poll for corrected events or events that happened before reset.
 271  * Those are just logged through /dev/mcelog.
 272  *
 273  * This is executed in standard interrupt context.
 274  */
 275 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 276 {
 277         struct mce m;
 278         int i;
 279
 280         __get_cpu_var(mce_poll_count)++;
 281
 282         mce_setup(&m);
 283
 284         m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 285         for (i = 0; i < banks; i++) {
 286                 if (!bank[i] || !test_bit(i, *b))
 287                         continue;
 288
 289                 m.misc = 0;
 290                 m.addr = 0;
 291                 m.bank = i;
 292                 m.tsc = 0;
 293
 294                 barrier();
 295                 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
 296                 if (!(m.status & MCI_STATUS_VAL))
 297                         continue;
 298
 299                 /*
 300                  * Uncorrected events are handled by the exception handler
 301                  * when it is enabled. But when the exception is disabled log
 302                  * everything.
 303                  *
 304                  * TBD do the same check for MCI_STATUS_EN here?
 305                  */
 306                 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
 307                         continue;
 308
 309                 if (m.status & MCI_STATUS_MISCV)
 310                         m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
 311                 if (m.status & MCI_STATUS_ADDRV)
 312                         m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
 313
 314                 if (!(flags & MCP_TIMESTAMP))
 315                         m.tsc = 0;
 316                 /*
 317                  * Don't get the IP here because it's unlikely to
 318                  * have anything to do with the actual error location.
 319                  */
 320                 if (!(flags & MCP_DONTLOG)) {
 321                         mce_log(&m);
 322                         add_taint(TAINT_MACHINE_CHECK);
 323                 }
 324
 325                 /*
 326                  * Clear state for this bank.
 327                  */
 328                 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 329         }
 330
 331         /*
 332          * Don't clear MCG_STATUS here because it's only defined for
 333          * exceptions.
 334          */
 335
 336         sync_core();
 337 }
 338 EXPORT_SYMBOL_GPL(machine_check_poll);
 339
  340 /*
  341  * The actual machine check handler. This only handles real
  342  * exceptions when something got corrupted coming in through int 18.
  343  *
  344  * This is executed in NMI context not subject to normal locking rules. This
  345  * implies that most kernel services cannot be safely used. Don't even
  346  * think about putting a printk in there!
       *
       * @regs:       register state; handed to notify_die() and mce_get_rip().
       * @error_code: handed to notify_die().
       *
       * Walks all banks, logs every enabled uncorrected error, then either
       * panics, sends SIGBUS to the current task, or just returns, depending
       * on what was found and on the 'tolerant' level. The status of every
       * bank that held an uncorrected error is cleared on the way out.
  347  */
  348 void do_machine_check(struct pt_regs *regs, long error_code)
  349 {
  350         struct mce m, panicm;
  351         int panicm_found = 0;
  352         u64 mcestart = 0;
  353         int i;
  354         /*
  355          * If no_way_out gets set, there is no safe way to recover from this
  356          * MCE.  If tolerant is cranked up, we'll try anyway.
  357          */
  358         int no_way_out = 0;
  359         /*
  360          * If kill_it gets set, there might be a way to recover from this
  361          * error.
  362          */
  363         int kill_it = 0;
              /* Banks whose status gets cleared at the end of the handler. */
  364         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
  365
              /* Balanced by the atomic_dec() at the out: label. */
  366         atomic_inc(&mce_entry);
  367
  368         __get_cpu_var(mce_exception_count)++;
  369
  370         if (notify_die(DIE_NMI, "machine check", regs, error_code,
  371                            18, SIGKILL) == NOTIFY_STOP)
  372                 goto out;
  373         if (!banks)
  374                 goto out;
  375
  376         mce_setup(&m);
  377
  378         m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
  379
  380         /* if the restart IP is not valid, we're done for */
  381         if (!(m.mcgstatus & MCG_STATUS_RIPV))
  382                 no_way_out = 1;
  383
              /* mce_panic() skips log entries with a TSC older than mcestart. */
  384         rdtscll(mcestart);
  385         barrier();
  386
  387         for (i = 0; i < banks; i++) {
                      /* toclear is not initialized; reset each bit before use. */
  388                 __clear_bit(i, toclear);
  389                 if (!bank[i])
  390                         continue;
  391
  392                 m.misc = 0;
  393                 m.addr = 0;
  394                 m.bank = i;
  395
  396                 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4);
  397                 if ((m.status & MCI_STATUS_VAL) == 0)
  398                         continue;
  399
  400                 /*
  401                  * Non uncorrected errors are handled by machine_check_poll
  402                  * Leave them alone.
  403                  */
  404                 if ((m.status & MCI_STATUS_UC) == 0)
  405                         continue;
  406
  407                 /*
  408                  * Set taint even when machine check was not enabled.
  409                  */
  410                 add_taint(TAINT_MACHINE_CHECK);
  411
  412                 __set_bit(i, toclear);
  413
  414                 if (m.status & MCI_STATUS_EN) {
  415                         /* if PCC was set, there's no way out */
  416                         no_way_out |= !!(m.status & MCI_STATUS_PCC);
  417                         /*
  418                          * If this error was uncorrectable and there was
  419                          * an overflow, we're in trouble.  If no overflow,
  420                          * we might get away with just killing a task.
  421                          */
  422                         if (m.status & MCI_STATUS_UC) {
  423                                 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
  424                                         no_way_out = 1;
  425                                 kill_it = 1;
  426                         }
  427                 } else {
  428                         /*
  429                          * Machine check event was not enabled. Clear, but
  430                          * ignore.
  431                          */
  432                         continue;
  433                 }
  434
  435                 if (m.status & MCI_STATUS_MISCV)
  436                         m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4);
  437                 if (m.status & MCI_STATUS_ADDRV)
  438                         m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4);
  439
  440                 mce_get_rip(&m, regs);
  441                 mce_log(&m);
  442
  443                 /*
  444                  * Did this bank cause the exception?
  445                  *
  446                  * Assume that the bank with uncorrectable errors did it,
  447                  * and that there is only a single one:
  448                  */
  449                 if ((m.status & MCI_STATUS_UC) &&
  450                                         (m.status & MCI_STATUS_EN)) {
  451                         panicm = m;
  452                         panicm_found = 1;
  453                 }
  454         }
  455
  456         /*
  457          * If we didn't find an uncorrectable error, pick
  458          * the last one (shouldn't happen, just being safe).
  459          */
  460         if (!panicm_found)
  461                 panicm = m;
  462
  463         /*
  464          * If we have decided that we just CAN'T continue, and the user
  465          * has not set tolerant to an insane level, give up and die.
  466          */
  467         if (no_way_out && tolerant < 3)
  468                 mce_panic("Machine check", &panicm, mcestart);
  469
  470         /*
  471          * If the error seems to be unrecoverable, something should be
  472          * done.  Try to kill as little as possible.  If we can kill just
  473          * one task, do that.  If the user has set the tolerance very
  474          * high, don't try to do anything at all.
  475          */
  476         if (kill_it && tolerant < 3) {
  477                 int user_space = 0;
  478
  479                 /*
  480                  * If the EIPV bit is set, it means the saved IP is the
  481                  * instruction which caused the MCE.
  482                  */
  483                 if (m.mcgstatus & MCG_STATUS_EIPV)
  484                         user_space = panicm.ip && (panicm.cs & 3);
  485
  486                 /*
  487                  * If we know that the error was in user space, send a
  488                  * SIGBUS.  Otherwise, panic if tolerance is low.
  489                  *
  490                  * force_sig() takes an awful lot of locks and has a slight
  491                  * risk of deadlocking.
  492                  */
  493                 if (user_space) {
  494                         force_sig(SIGBUS, current);
  495                 } else if (panic_on_oops || tolerant < 2) {
  496                         mce_panic("Uncorrected machine check",
  497                                 &panicm, mcestart);
  498                 }
  499         }
  500
  501         /* notify userspace ASAP */
  502         set_thread_flag(TIF_MCE_NOTIFY);
  503
  504         /* the last thing we do is clear state */
  505         for (i = 0; i < banks; i++) {
  506                 if (test_bit(i, toclear))
  507                         mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
  508         }
  509         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
  510 out:
  511         atomic_dec(&mce_entry);
  512         sync_core();
  513 }
  514 EXPORT_SYMBOL_GPL(do_machine_check);
 515
 516 #ifdef CONFIG_X86_MCE_INTEL
 517 /***
 518  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 519  * @cpu: The CPU on which the event occurred.
 520  * @status: Event status information
 521  *
 522  * This function should be called by the thermal interrupt after the
 523  * event has been processed and the decision was made to log the event
 524  * further.
 525  *
 526  * The status parameter will be saved to the 'status' field of 'struct mce'
 527  * and historically has been the register value of the
 528  * MSR_IA32_THERMAL_STATUS (Intel) msr.
 529  */
 530 void mce_log_therm_throt_event(__u64 status)
 531 {
 532         struct mce m;
 533
 534         mce_setup(&m);
 535         m.bank = MCE_THERMAL_BANK;
 536         m.status = status;
 537         mce_log(&m);
 538 }
 539 #endif /* CONFIG_X86_MCE_INTEL */
 540
  541 /*
  542  * Periodic polling timer for "silent" machine check errors.  If the
  543  * poller finds an MCE, poll 2x faster.  When the poller finds no more
  544  * errors, poll 2x slower (up to check_interval seconds).
       *
       * mcheck_timer() never lets the interval drop below HZ/100 jiffies, and
       * mce_init_timer() does not arm the timer when check_interval is 0.
  545  */
  546 static int check_interval = 5 * 60; /* 5 minutes */
  547
      /* Current polling interval of each CPU, adjusted by mcheck_timer(). */
  548 static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
      /* The per-CPU polling timer itself; its callback is mcheck_timer(). */
  549 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 550
 551 static void mcheck_timer(unsigned long data)
 552 {
 553         struct timer_list *t = &per_cpu(mce_timer, data);
 554         int *n;
 555
 556         WARN_ON(smp_processor_id() != data);
 557
 558         if (mce_available(&current_cpu_data)) {
 559                 machine_check_poll(MCP_TIMESTAMP,
 560                                 &__get_cpu_var(mce_poll_banks));
 561         }
 562
 563         /*
 564          * Alert userspace if needed.  If we logged an MCE, reduce the
 565          * polling interval, otherwise increase the polling interval.
 566          */
 567         n = &__get_cpu_var(next_interval);
 568         if (mce_notify_user())
 569                 *n = max(*n/2, HZ/100);
 570         else
 571                 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
 572
 573         t->expires = jiffies + *n;
 574         add_timer(t);
 575 }
 576
 577 static void mce_do_trigger(struct work_struct *work)
 578 {
 579         call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 580 }
 581
 582 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
 583
 584 /*
 585  * Notify the user(s) about new machine check events.
 586  * Can be called from interrupt context, but not from machine check/NMI
 587  * context.
 588  */
 589 int mce_notify_user(void)
 590 {
 591         /* Not more than two messages every minute */
 592         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 593
 594         clear_thread_flag(TIF_MCE_NOTIFY);
 595
 596         if (test_and_clear_bit(0, &notify_user)) {
 597                 wake_up_interruptible(&mce_wait);
 598
 599                 /*
 600                  * There is no risk of missing notifications because
 601                  * work_pending is always cleared before the function is
 602                  * executed.
 603                  */
 604                 if (trigger[0] && !work_pending(&mce_trigger_work))
 605                         schedule_work(&mce_trigger_work);
 606
 607                 if (__ratelimit(&ratelimit))
 608                         printk(KERN_INFO "Machine check events logged\n");
 609
 610                 return 1;
 611         }
 612         return 0;
 613 }
 614 EXPORT_SYMBOL_GPL(mce_notify_user);
 615
 616 /*
 617  * Initialize Machine Checks for a CPU.
 618  */
 619 static int mce_cap_init(void)
 620 {
 621         unsigned b;
 622         u64 cap;
 623
 624         rdmsrl(MSR_IA32_MCG_CAP, cap);
 625
 626         b = cap & MCG_BANKCNT_MASK;
 627         printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
 628
 629         if (b > MAX_NR_BANKS) {
 630                 printk(KERN_WARNING
 631                        "MCE: Using only %u machine check banks out of %u\n",
 632                         MAX_NR_BANKS, b);
 633                 b = MAX_NR_BANKS;
 634         }
 635
 636         /* Don't support asymmetric configurations today */
 637         WARN_ON(banks != 0 && b != banks);
 638         banks = b;
 639         if (!bank) {
 640                 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
 641                 if (!bank)
 642                         return -ENOMEM;
 643                 memset(bank, 0xff, banks * sizeof(u64));
 644         }
 645
 646         /* Use accurate RIP reporting if available. */
 647         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
 648                 rip_msr = MSR_IA32_MCG_EIP;
 649
 650         return 0;
 651 }
 652
 653 static void mce_init(void)
 654 {
 655         mce_banks_t all_banks;
 656         u64 cap;
 657         int i;
 658
 659         /*
 660          * Log the machine checks left over from the previous reset.
 661          */
 662         bitmap_fill(all_banks, MAX_NR_BANKS);
 663         machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 664
 665         set_in_cr4(X86_CR4_MCE);
 666
 667         rdmsrl(MSR_IA32_MCG_CAP, cap);
 668         if (cap & MCG_CTL_P)
 669                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 670
 671         for (i = 0; i < banks; i++) {
 672                 if (skip_bank_init(i))
 673                         continue;
 674                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 675                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 676         }
 677 }
 678
 679 /* Add per CPU specific workarounds here */
 680 static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 681 {
 682         /* This should be disabled by the BIOS, but isn't always */
 683         if (c->x86_vendor == X86_VENDOR_AMD) {
 684                 if (c->x86 == 15 && banks > 4) {
 685                         /*
 686                          * disable GART TBL walk error reporting, which
 687                          * trips off incorrectly with the IOMMU & 3ware
 688                          * & Cerberus:
 689                          */
 690                         clear_bit(10, (unsigned long *)&bank[4]);
 691                 }
 692                 if (c->x86 <= 17 && mce_bootlog < 0) {
 693                         /*
 694                          * Lots of broken BIOS around that don't clear them
 695                          * by default and leave crap in there. Don't log:
 696                          */
 697                         mce_bootlog = 0;
 698                 }
 699                 /*
 700                  * Various K7s with broken bank 0 around. Always disable
 701                  * by default.
 702                  */
 703                  if (c->x86 == 6)
 704                         bank[0] = 0;
 705         }
 706
 707         if (c->x86_vendor == X86_VENDOR_INTEL) {
 708                 /*
 709                  * SDM documents that on family 6 bank 0 should not be written
 710                  * because it aliases to another special BIOS controlled
 711                  * register.
 712                  * But it's not aliased anymore on model 0x1a+
 713                  * Don't ignore bank 0 completely because there could be a
 714                  * valid event later, merely don't write CTL0.
 715                  */
 716
 717                 if (c->x86 == 6 && c->x86_model < 0x1A)
 718                         __set_bit(0, &dont_init_banks);
 719         }
 720 }
 721
 722 static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
 723 {
 724         if (c->x86 != 5)
 725                 return;
 726         switch (c->x86_vendor) {
 727         case X86_VENDOR_INTEL:
 728                 if (mce_p5_enabled())
 729                         intel_p5_mcheck_init(c);
 730                 break;
 731         case X86_VENDOR_CENTAUR:
 732                 winchip_mcheck_init(c);
 733                 break;
 734         }
 735 }
 736
 737 static void mce_cpu_features(struct cpuinfo_x86 *c)
 738 {
 739         switch (c->x86_vendor) {
 740         case X86_VENDOR_INTEL:
 741                 mce_intel_feature_init(c);
 742                 break;
 743         case X86_VENDOR_AMD:
 744                 mce_amd_feature_init(c);
 745                 break;
 746         default:
 747                 break;
 748         }
 749 }
 750
 751 static void mce_init_timer(void)
 752 {
 753         struct timer_list *t = &__get_cpu_var(mce_timer);
 754         int *n = &__get_cpu_var(next_interval);
 755
 756         *n = check_interval * HZ;
 757         if (!*n)
 758                 return;
 759         setup_timer(t, mcheck_timer, smp_processor_id());
 760         t->expires = round_jiffies(jiffies + *n);
 761         add_timer(t);
 762 }
 763
 764 /*
 765  * Called for each booted CPU to set up machine checks.
 766  * Must be called with preempt off:
 767  */
 768 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 769 {
 770         if (mce_disabled)
 771                 return;
 772
 773         mce_ancient_init(c);
 774
 775         if (!mce_available(c))
 776                 return;
 777
 778         if (mce_cap_init() < 0) {
 779                 mce_disabled = 1;
 780                 return;
 781         }
 782         mce_cpu_quirks(c);
 783
 784         machine_check_vector = do_machine_check;
 785
 786         mce_init();
 787         mce_cpu_features(c);
 788         mce_init_timer();
 789 }
 790
  791 /*
  792  * Character device to read and clear the MCE log.
  793  */
  794
      /* Taken by mce_open() and mce_release() around the two counters below. */
  795 static DEFINE_SPINLOCK(mce_state_lock);
  796 static int              open_count;             /* #times opened */
  797 static int              open_exclu;             /* already open exclusive? */
 798
 799 static int mce_open(struct inode *inode, struct file *file)
 800 {
 801         spin_lock(&mce_state_lock);
 802
 803         if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 804                 spin_unlock(&mce_state_lock);
 805
 806                 return -EBUSY;
 807         }
 808
 809         if (file->f_flags & O_EXCL)
 810                 open_exclu = 1;
 811         open_count++;
 812
 813         spin_unlock(&mce_state_lock);
 814
 815         return nonseekable_open(inode, file);
 816 }
 817
 818 static int mce_release(struct inode *inode, struct file *file)
 819 {
 820         spin_lock(&mce_state_lock);
 821
 822         open_count--;
 823         open_exclu = 0;
 824
 825         spin_unlock(&mce_state_lock);
 826
 827         return 0;
 828 }
 829
 830 static void collect_tscs(void *data)
 831 {
 832         unsigned long *cpu_tsc = (unsigned long *)data;
 833
 834         rdtscll(cpu_tsc[smp_processor_id()]);
 835 }
 836
  837 static DEFINE_MUTEX(mce_read_mutex);
  838
      /*
       * read() handler for the MCE log device: copy the finished log entries to
       * @ubuf and clear them from the log. Readers are serialized by
       * mce_read_mutex.
       *
       * Only full reads are supported: *off must be 0 and @usize must be large
       * enough for MCE_LOG_LEN records, otherwise -EINVAL is returned.
       *
       * Returns the number of bytes copied, -ENOMEM if the per-CPU TSC scratch
       * array cannot be allocated, or -EFAULT if any copy_to_user() failed.
       */
  839 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
  840                         loff_t *off)
  841 {
  842         char __user *buf = ubuf;
  843         unsigned long *cpu_tsc;
  844         unsigned prev, next;
  845         int i, err;
  846
  847         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
  848         if (!cpu_tsc)
  849                 return -ENOMEM;
  850
  851         mutex_lock(&mce_read_mutex);
  852         next = rcu_dereference(mcelog.next);
  853
  854         /* Only supports full reads right now */
  855         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
  856                 mutex_unlock(&mce_read_mutex);
  857                 kfree(cpu_tsc);
  858
  859                 return -EINVAL;
  860         }
  861
  862         err = 0;
  863         prev = 0;
  864         do {
  865                 for (i = prev; i < next; i++) {
  866                         unsigned long start = jiffies;
  867
                              /*
                               * The slot is reserved but not finished yet.
                               * Wait up to two jiffies for it; after that wipe
                               * the slot and skip it.
                               */
  868                         while (!mcelog.entry[i].finished) {
  869                                 if (time_after_eq(jiffies, start + 2)) {
  870                                         memset(mcelog.entry + i, 0,
  871                                                sizeof(struct mce));
  872                                         goto timeout;
  873                                 }
  874                                 cpu_relax();
  875                         }
  876                         smp_rmb();
  877                         err |= copy_to_user(buf, mcelog.entry + i,
  878                                             sizeof(struct mce));
  879                         buf += sizeof(struct mce);
  880 timeout:
  881                         ;
  882                 }
  883
  884                 memset(mcelog.entry + prev, 0,
  885                        (next - prev) * sizeof(struct mce));
  886                 prev = next;
                      /*
                       * Reset mcelog.next to 0 unless it moved in the meantime;
                       * if it did, go round again for the new entries.
                       */
  887                 next = cmpxchg(&mcelog.next, prev, 0);
  888         } while (next != prev);
  889
  890         synchronize_sched();
  891
  892         /*
  893          * Collect entries that were still getting written before the
  894          * synchronize.
  895          */
  896         on_each_cpu(collect_tscs, cpu_tsc, 1);
  897
              /*
               * Pick up finished entries at or beyond 'next' whose TSC is older
               * than the TSC just sampled on the CPU that logged them.
               */
  898         for (i = next; i < MCE_LOG_LEN; i++) {
  899                 if (mcelog.entry[i].finished &&
  900                     mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
  901                         err |= copy_to_user(buf, mcelog.entry+i,
  902                                             sizeof(struct mce));
  903                         smp_rmb();
  904                         buf += sizeof(struct mce);
  905                         memset(&mcelog.entry[i], 0, sizeof(struct mce));
  906                 }
  907         }
  908         mutex_unlock(&mce_read_mutex);
  909         kfree(cpu_tsc);
  910
  911         return err ? -EFAULT : buf - ubuf;
  912 }
 913
 914 static unsigned int mce_poll(struct file *file, poll_table *wait)
 915 {
 916         poll_wait(file, &mce_wait, wait);
 917         if (rcu_dereference(mcelog.next))
 918                 return POLLIN | POLLRDNORM;
 919         return 0;
 920 }
 921
 922 static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 923 {
 924         int __user *p = (int __user *)arg;
 925
 926         if (!capable(CAP_SYS_ADMIN))
 927                 return -EPERM;
 928
 929         switch (cmd) {
 930         case MCE_GET_RECORD_LEN:
 931                 return put_user(sizeof(struct mce), p);
 932         case MCE_GET_LOG_LEN:
 933                 return put_user(MCE_LOG_LEN, p);
 934         case MCE_GETCLEAR_FLAGS: {
 935                 unsigned flags;
 936
 937                 do {
 938                         flags = mcelog.flags;
 939                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 940
 941                 return put_user(flags, p);
 942         }
 943         default:
 944                 return -ENOTTY;
 945         }
 946 }
 947
 948 /* Modified in mce-inject.c, so not static or const */
 949 struct file_operations mce_chrdev_ops = {
 950         .open                   = mce_open,
 951         .release                = mce_release,
 952         .read                   = mce_read,
 953         .poll                   = mce_poll,
 954         .unlocked_ioctl         = mce_ioctl,
 955 };
 956 EXPORT_SYMBOL_GPL(mce_chrdev_ops);
 957
 958 static struct miscdevice mce_log_device = {
 959         MISC_MCELOG_MINOR,
 960         "mcelog",
 961         &mce_chrdev_ops,
 962 };
 963
 964 /*
 965  * mce=off disables machine check
 966  * mce=TOLERANCELEVEL (number, see above)
 967  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 968  * mce=nobootlog Don't log MCEs from before booting.
 969  */
 970 static int __init mcheck_enable(char *str)
 971 {
 972         if (*str == 0)
 973                 enable_p5_mce();
 974         if (*str == '=')
 975                 str++;
 976         if (!strcmp(str, "off"))
 977                 mce_disabled = 1;
 978         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
 979                 mce_bootlog = (str[0] == 'b');
 980         else if (isdigit(str[0]))
 981                 get_option(&str, &tolerant);
 982         else {
 983                 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
 984                        str);
 985                 return 0;
 986         }
 987         return 1;
 988 }
 989 __setup("mce", mcheck_enable);
 990
 991 /*
 992  * Sysfs support
 993  */
 994
 995 /*
 996  * Disable machine checks on suspend and shutdown. We can't really handle
 997  * them later.
 998  */
 999 static int mce_disable(void)
1000 {
1001         int i;
1002
1003         for (i = 0; i < banks; i++) {
1004                 if (!skip_bank_init(i))
1005                         wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1006         }
1007         return 0;
1008 }
1009
1010 static int mce_suspend(struct sys_device *dev, pm_message_t state)
1011 {
1012         return mce_disable();
1013 }
1014
/* sysdev shutdown hook: switch machine checks off; @dev is unused. */
static int mce_shutdown(struct sys_device *dev)
{
        int rc = mce_disable();

        return rc;
}
1019
1020 /*
1021  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1022  * Only one CPU is active at this time, the others get re-added later using
1023  * CPU hotplug:
1024  */
1025 static int mce_resume(struct sys_device *dev)
1026 {
1027         mce_init();
1028         mce_cpu_features(&current_cpu_data);
1029
1030         return 0;
1031 }
1032
1033 static void mce_cpu_restart(void *data)
1034 {
1035         del_timer_sync(&__get_cpu_var(mce_timer));
1036         if (mce_available(&current_cpu_data))
1037                 mce_init();
1038         mce_init_timer();
1039 }
1040
1041 /* Reinit MCEs after user configuration changes */
1042 static void mce_restart(void)
1043 {
1044         on_each_cpu(mce_cpu_restart, NULL, 1);
1045 }
1046
1047 static struct sysdev_class mce_sysclass = {
1048         .suspend        = mce_suspend,
1049         .shutdown       = mce_shutdown,
1050         .resume         = mce_resume,
1051         .name           = "machinecheck",
1052 };
1053
1054 DEFINE_PER_CPU(struct sys_device, mce_dev);
1055
1056 __cpuinitdata
1057 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1058
1059 static struct sysdev_attribute *bank_attrs;
1060
1061 static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1062                          char *buf)
1063 {
1064         u64 b = bank[attr - bank_attrs];
1065
1066         return sprintf(buf, "%llx\n", b);
1067 }
1068
1069 static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1070                         const char *buf, size_t size)
1071 {
1072         u64 new;
1073
1074         if (strict_strtoull(buf, 0, &new) < 0)
1075                 return -EINVAL;
1076
1077         bank[attr - bank_attrs] = new;
1078         mce_restart();
1079
1080         return size;
1081 }
1082
1083 static ssize_t
1084 show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1085 {
1086         strcpy(buf, trigger);
1087         strcat(buf, "\n");
1088         return strlen(trigger) + 1;
1089 }
1090
1091 static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1092                                 const char *buf, size_t siz)
1093 {
1094         char *p;
1095         int len;
1096
1097         strncpy(trigger, buf, sizeof(trigger));
1098         trigger[sizeof(trigger)-1] = 0;
1099         len = strlen(trigger);
1100         p = strchr(trigger, '\n');
1101
1102         if (*p)
1103                 *p = 0;
1104
1105         return len;
1106 }
1107
1108 static ssize_t store_int_with_restart(struct sys_device *s,
1109                                       struct sysdev_attribute *attr,
1110                                       const char *buf, size_t size)
1111 {
1112         ssize_t ret = sysdev_store_int(s, attr, buf, size);
1113         mce_restart();
1114         return ret;
1115 }
1116
1117 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1118 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1119
1120 static struct sysdev_ext_attribute attr_check_interval = {
1121         _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1122                      store_int_with_restart),
1123         &check_interval
1124 };
1125
1126 static struct sysdev_attribute *mce_attrs[] = {
1127         &attr_tolerant.attr, &attr_check_interval.attr, &attr_trigger,
1128         NULL
1129 };
1130
1131 static cpumask_var_t mce_dev_initialized;
1132
1133 /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1134 static __cpuinit int mce_create_device(unsigned int cpu)
1135 {
1136         int err;
1137         int i;
1138
1139         if (!mce_available(&boot_cpu_data))
1140                 return -EIO;
1141
1142         memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1143         per_cpu(mce_dev, cpu).id        = cpu;
1144         per_cpu(mce_dev, cpu).cls       = &mce_sysclass;
1145
1146         err = sysdev_register(&per_cpu(mce_dev, cpu));
1147         if (err)
1148                 return err;
1149
1150         for (i = 0; mce_attrs[i]; i++) {
1151                 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1152                 if (err)
1153                         goto error;
1154         }
1155         for (i = 0; i < banks; i++) {
1156                 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1157                                         &bank_attrs[i]);
1158                 if (err)
1159                         goto error2;
1160         }
1161         cpumask_set_cpu(cpu, mce_dev_initialized);
1162
1163         return 0;
1164 error2:
1165         while (--i >= 0)
1166                 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1167 error:
1168         while (--i >= 0)
1169                 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1170
1171         sysdev_unregister(&per_cpu(mce_dev, cpu));
1172
1173         return err;
1174 }
1175
1176 static __cpuinit void mce_remove_device(unsigned int cpu)
1177 {
1178         int i;
1179
1180         if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1181                 return;
1182
1183         for (i = 0; mce_attrs[i]; i++)
1184                 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1185
1186         for (i = 0; i < banks; i++)
1187                 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1188
1189         sysdev_unregister(&per_cpu(mce_dev, cpu));
1190         cpumask_clear_cpu(cpu, mce_dev_initialized);
1191 }
1192
1193 /* Make sure there are no machine checks on offlined CPUs. */
1194 static void mce_disable_cpu(void *h)
1195 {
1196         unsigned long action = *(unsigned long *)h;
1197         int i;
1198
1199         if (!mce_available(&current_cpu_data))
1200                 return;
1201         if (!(action & CPU_TASKS_FROZEN))
1202                 cmci_clear();
1203         for (i = 0; i < banks; i++) {
1204                 if (!skip_bank_init(i))
1205                         wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1206         }
1207 }
1208
1209 static void mce_reenable_cpu(void *h)
1210 {
1211         unsigned long action = *(unsigned long *)h;
1212         int i;
1213
1214         if (!mce_available(&current_cpu_data))
1215                 return;
1216
1217         if (!(action & CPU_TASKS_FROZEN))
1218                 cmci_reenable();
1219         for (i = 0; i < banks; i++) {
1220                 if (!skip_bank_init(i))
1221                         wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1222         }
1223 }
1224
1225 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
1226 static int __cpuinit
1227 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1228 {
1229         unsigned int cpu = (unsigned long)hcpu;
1230         struct timer_list *t = &per_cpu(mce_timer, cpu);
1231
1232         switch (action) {
1233         case CPU_ONLINE:
1234         case CPU_ONLINE_FROZEN:
1235                 mce_create_device(cpu);
1236                 if (threshold_cpu_callback)
1237                         threshold_cpu_callback(action, cpu);
1238                 break;
1239         case CPU_DEAD:
1240         case CPU_DEAD_FROZEN:
1241                 if (threshold_cpu_callback)
1242                         threshold_cpu_callback(action, cpu);
1243                 mce_remove_device(cpu);
1244                 break;
1245         case CPU_DOWN_PREPARE:
1246         case CPU_DOWN_PREPARE_FROZEN:
1247                 del_timer_sync(t);
1248                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1249                 break;
1250         case CPU_DOWN_FAILED:
1251         case CPU_DOWN_FAILED_FROZEN:
1252                 t->expires = round_jiffies(jiffies +
1253                                                 __get_cpu_var(next_interval));
1254                 add_timer_on(t, cpu);
1255                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1256                 break;
1257         case CPU_POST_DEAD:
1258                 /* intentionally ignoring frozen here */
1259                 cmci_rediscover(cpu);
1260                 break;
1261         }
1262         return NOTIFY_OK;
1263 }
1264
1265 static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1266         .notifier_call = mce_cpu_callback,
1267 };
1268
1269 static __init int mce_init_banks(void)
1270 {
1271         int i;
1272
1273         bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1274                                 GFP_KERNEL);
1275         if (!bank_attrs)
1276                 return -ENOMEM;
1277
1278         for (i = 0; i < banks; i++) {
1279                 struct sysdev_attribute *a = &bank_attrs[i];
1280
1281                 a->attr.name    = kasprintf(GFP_KERNEL, "bank%d", i);
1282                 if (!a->attr.name)
1283                         goto nomem;
1284
1285                 a->attr.mode    = 0644;
1286                 a->show         = show_bank;
1287                 a->store        = set_bank;
1288         }
1289         return 0;
1290
1291 nomem:
1292         while (--i >= 0)
1293                 kfree(bank_attrs[i].attr.name);
1294         kfree(bank_attrs);
1295         bank_attrs = NULL;
1296
1297         return -ENOMEM;
1298 }
1299
1300 static __init int mce_init_device(void)
1301 {
1302         int err;
1303         int i = 0;
1304
1305         if (!mce_available(&boot_cpu_data))
1306                 return -EIO;
1307
1308         alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1309
1310         err = mce_init_banks();
1311         if (err)
1312                 return err;
1313
1314         err = sysdev_class_register(&mce_sysclass);
1315         if (err)
1316                 return err;
1317
1318         for_each_online_cpu(i) {
1319                 err = mce_create_device(i);
1320                 if (err)
1321                         return err;
1322         }
1323
1324         register_hotcpu_notifier(&mce_cpu_notifier);
1325         misc_register(&mce_log_device);
1326
1327         return err;
1328 }
1329
1330 device_initcall(mce_init_device);
1331
1332 #else /* CONFIG_X86_OLD_MCE: */
1333
/* Bank count reported by mcheck_init(); exported for non-fatal.o. */
int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);
1336
1337 /* This has to be run for each processor */
1338 void mcheck_init(struct cpuinfo_x86 *c)
1339 {
1340         if (mce_disabled == 1)
1341                 return;
1342
1343         switch (c->x86_vendor) {
1344         case X86_VENDOR_AMD:
1345                 amd_mcheck_init(c);
1346                 break;
1347
1348         case X86_VENDOR_INTEL:
1349                 if (c->x86 == 5)
1350                         intel_p5_mcheck_init(c);
1351                 if (c->x86 == 6)
1352                         intel_p6_mcheck_init(c);
1353                 if (c->x86 == 15)
1354                         intel_p4_mcheck_init(c);
1355                 break;
1356
1357         case X86_VENDOR_CENTAUR:
1358                 if (c->x86 == 5)
1359                         winchip_mcheck_init(c);
1360                 break;
1361
1362         default:
1363                 break;
1364         }
1365         printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1366 }
1367
1368 static int __init mcheck_enable(char *str)
1369 {
1370         mce_disabled = -1;
1371         return 1;
1372 }
1373
1374 __setup("mce", mcheck_enable);
1375
1376 #endif /* CONFIG_X86_OLD_MCE */
1377
1378 /*
1379  * Old style boot options parsing. Only for compatibility.
1380  */
1381 static int __init mcheck_disable(char *str)
1382 {
1383         mce_disabled = 1;
1384         return 1;
1385 }
1386 __setup("nomce", mcheck_disable);