arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10 #include <linux/thread_info.h>
  11 #include <linux/capability.h>
  12 #include <linux/miscdevice.h>
  13 #include <linux/ratelimit.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/rcupdate.h>
  16 #include <linux/smp_lock.h>
  17 #include <linux/kobject.h>
  18 #include <linux/kdebug.h>
  19 #include <linux/kernel.h>
  20 #include <linux/percpu.h>
  21 #include <linux/string.h>
  22 #include <linux/sysdev.h>
  23 #include <linux/ctype.h>
  24 #include <linux/sched.h>
  25 #include <linux/sysfs.h>
  26 #include <linux/types.h>
  27 #include <linux/init.h>
  28 #include <linux/kmod.h>
  29 #include <linux/poll.h>
  30 #include <linux/cpu.h>
  31 #include <linux/fs.h>
  32
  33 #include <asm/processor.h>
  34 #include <asm/uaccess.h>
  35 #include <asm/idle.h>
  36 #include <asm/mce.h>
  37 #include <asm/msr.h>
  38 #include <asm/smp.h>
  39
  40 #include "mce.h"
  41
  42 /* Handle unconfigured int18 (should never happen) */
  43 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
  44 {
  45         printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
  46                smp_processor_id());
  47 }
  48
  49 /* Call the installed machine check handler for this CPU setup. */
  50 void (*machine_check_vector)(struct pt_regs *, long error_code) =
  51                                                 unexpected_machine_check;
  52
  53 int                             mce_disabled;
  54
  55 #ifdef CONFIG_X86_64
  56
  57 #define MISC_MCELOG_MINOR       227
  58
  59 atomic_t mce_entry;
  60
  61 /*
  62  * Tolerant levels:
  63  *   0: always panic on uncorrected errors, log corrected errors
  64  *   1: panic or SIGBUS on uncorrected errors, log corrected errors
  65  *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
  66  *   3: never panic or SIGBUS, log all errors (for testing only)
  67  */
  68 static int                      tolerant = 1;
  69 static int                      banks;
  70 static u64                      *bank;
  71 static unsigned long            notify_user;
  72 static int                      rip_msr;
  73 static int                      mce_bootlog = -1;
  74 static atomic_t                 mce_events;
  75
  76 static char                     trigger[128];
  77 static char                     *trigger_argv[2] = { trigger, NULL };
  78
  79 static unsigned long            dont_init_banks;
  80
  81 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  82
  83 /* MCA banks polled by the period polling timer for corrected events */
  84 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  85         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  86 };
  87
  88 static inline int skip_bank_init(int i)
  89 {
  90         return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
  91 }
  92
  93 /* Do initial initialization of a struct mce */
  94 void mce_setup(struct mce *m)
  95 {
  96         memset(m, 0, sizeof(struct mce));
  97         m->cpu = smp_processor_id();
  98         rdtscll(m->tsc);
  99 }
 100
 101 /*
 102  * Lockless MCE logging infrastructure.
 103  * This avoids deadlocks on printk locks without having to break locks. Also
 104  * separate MCEs from kernel messages to avoid bogus bug reports.
 105  */
 106
 107 static struct mce_log mcelog = {
 108         MCE_LOG_SIGNATURE,
 109         MCE_LOG_LEN,
 110 };
 111
 112 void mce_log(struct mce *mce)
 113 {
 114         unsigned next, entry;
 115
 116         atomic_inc(&mce_events);
 117         mce->finished = 0;
 118         wmb();
 119         for (;;) {
 120                 entry = rcu_dereference(mcelog.next);
 121                 for (;;) {
 122                         /*
 123                          * When the buffer fills up discard new entries.
 124                          * Assume that the earlier errors are the more
 125                          * interesting ones:
 126                          */
 127                         if (entry >= MCE_LOG_LEN) {
 128                                 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
 129                                 return;
 130                         }
 131                         /* Old left over entry. Skip: */
 132                         if (mcelog.entry[entry].finished) {
 133                                 entry++;
 134                                 continue;
 135                         }
 136                         break;
 137                 }
 138                 smp_rmb();
 139                 next = entry + 1;
 140                 if (cmpxchg(&mcelog.next, entry, next) == entry)
 141                         break;
 142         }
 143         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 144         wmb();
 145         mcelog.entry[entry].finished = 1;
 146         wmb();
 147
 148         set_bit(0, &notify_user);
 149 }
 150
 151 static void print_mce(struct mce *m)
 152 {
 153         printk(KERN_EMERG "\n"
 154                KERN_EMERG "HARDWARE ERROR\n"
 155                KERN_EMERG
 156                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 157                m->cpu, m->mcgstatus, m->bank, m->status);
 158         if (m->ip) {
 159                 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
 160                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 161                        m->cs, m->ip);
 162                 if (m->cs == __KERNEL_CS)
 163                         print_symbol("{%s}", m->ip);
 164                 printk("\n");
 165         }
 166         printk(KERN_EMERG "TSC %llx ", m->tsc);
 167         if (m->addr)
 168                 printk("ADDR %llx ", m->addr);
 169         if (m->misc)
 170                 printk("MISC %llx ", m->misc);
 171         printk("\n");
 172         printk(KERN_EMERG "This is not a software problem!\n");
 173         printk(KERN_EMERG "Run through mcelog --ascii to decode "
 174                "and contact your hardware vendor\n");
 175 }
 176
 177 static void mce_panic(char *msg, struct mce *backup, u64 start)
 178 {
 179         int i;
 180
 181         oops_begin();
 182         for (i = 0; i < MCE_LOG_LEN; i++) {
 183                 u64 tsc = mcelog.entry[i].tsc;
 184
 185                 if ((s64)(tsc - start) < 0)
 186                         continue;
 187                 print_mce(&mcelog.entry[i]);
 188                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 189                         backup = NULL;
 190         }
 191         if (backup)
 192                 print_mce(backup);
 193         panic(msg);
 194 }
 195
 196 int mce_available(struct cpuinfo_x86 *c)
 197 {
 198         if (mce_disabled)
 199                 return 0;
 200         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 201 }
 202
 203 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 204 {
 205         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 206                 m->ip = regs->ip;
 207                 m->cs = regs->cs;
 208         } else {
 209                 m->ip = 0;
 210                 m->cs = 0;
 211         }
 212         if (rip_msr) {
 213                 /* Assume the RIP in the MSR is exact. Is this true? */
 214                 m->mcgstatus |= MCG_STATUS_EIPV;
 215                 rdmsrl(rip_msr, m->ip);
 216                 m->cs = 0;
 217         }
 218 }
 219
 220 /*
 221  * Poll for corrected events or events that happened before reset.
 222  * Those are just logged through /dev/mcelog.
 223  *
 224  * This is executed in standard interrupt context.
 225  */
 226 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 227 {
 228         struct mce m;
 229         int i;
 230
 231         mce_setup(&m);
 232
 233         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 234         for (i = 0; i < banks; i++) {
 235                 if (!bank[i] || !test_bit(i, *b))
 236                         continue;
 237
 238                 m.misc = 0;
 239                 m.addr = 0;
 240                 m.bank = i;
 241                 m.tsc = 0;
 242
 243                 barrier();
 244                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 245                 if (!(m.status & MCI_STATUS_VAL))
 246                         continue;
 247
 248                 /*
 249                  * Uncorrected events are handled by the exception handler
 250                  * when it is enabled. But when the exception is disabled log
 251                  * everything.
 252                  *
 253                  * TBD do the same check for MCI_STATUS_EN here?
 254                  */
 255                 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
 256                         continue;
 257
 258                 if (m.status & MCI_STATUS_MISCV)
 259                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 260                 if (m.status & MCI_STATUS_ADDRV)
 261                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 262
 263                 if (!(flags & MCP_TIMESTAMP))
 264                         m.tsc = 0;
 265                 /*
 266                  * Don't get the IP here because it's unlikely to
 267                  * have anything to do with the actual error location.
 268                  */
 269                 if (!(flags & MCP_DONTLOG)) {
 270                         mce_log(&m);
 271                         add_taint(TAINT_MACHINE_CHECK);
 272                 }
 273
 274                 /*
 275                  * Clear state for this bank.
 276                  */
 277                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 278         }
 279
 280         /*
 281          * Don't clear MCG_STATUS here because it's only defined for
 282          * exceptions.
 283          */
 284 }
 285
 286 /*
 287  * The actual machine check handler. This only handles real
 288  * exceptions when something got corrupted coming in through int 18.
 289  *
 290  * This is executed in NMI context not subject to normal locking rules. This
 291  * implies that most kernel services cannot be safely used. Don't even
 292  * think about putting a printk in there!
 293  */
 294 void do_machine_check(struct pt_regs *regs, long error_code)
 295 {
 296         struct mce m, panicm;
 297         int panicm_found = 0;
 298         u64 mcestart = 0;
 299         int i;
 300         /*
 301          * If no_way_out gets set, there is no safe way to recover from this
 302          * MCE.  If tolerant is cranked up, we'll try anyway.
 303          */
 304         int no_way_out = 0;
 305         /*
 306          * If kill_it gets set, there might be a way to recover from this
 307          * error.
 308          */
 309         int kill_it = 0;
 310         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 311
 312         atomic_inc(&mce_entry);
 313
 314         if (notify_die(DIE_NMI, "machine check", regs, error_code,
 315                            18, SIGKILL) == NOTIFY_STOP)
 316                 goto out2;
 317         if (!banks)
 318                 goto out2;
 319
 320         mce_setup(&m);
 321
 322         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 323
 324         /* if the restart IP is not valid, we're done for */
 325         if (!(m.mcgstatus & MCG_STATUS_RIPV))
 326                 no_way_out = 1;
 327
 328         rdtscll(mcestart);
 329         barrier();
 330
 331         for (i = 0; i < banks; i++) {
 332                 __clear_bit(i, toclear);
 333                 if (!bank[i])
 334                         continue;
 335
 336                 m.misc = 0;
 337                 m.addr = 0;
 338                 m.bank = i;
 339
 340                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 341                 if ((m.status & MCI_STATUS_VAL) == 0)
 342                         continue;
 343
 344                 /*
 345                  * Non uncorrected errors are handled by machine_check_poll
 346                  * Leave them alone.
 347                  */
 348                 if ((m.status & MCI_STATUS_UC) == 0)
 349                         continue;
 350
 351                 /*
 352                  * Set taint even when machine check was not enabled.
 353                  */
 354                 add_taint(TAINT_MACHINE_CHECK);
 355
 356                 __set_bit(i, toclear);
 357
 358                 if (m.status & MCI_STATUS_EN) {
 359                         /* if PCC was set, there's no way out */
 360                         no_way_out |= !!(m.status & MCI_STATUS_PCC);
 361                         /*
 362                          * If this error was uncorrectable and there was
 363                          * an overflow, we're in trouble.  If no overflow,
 364                          * we might get away with just killing a task.
 365                          */
 366                         if (m.status & MCI_STATUS_UC) {
 367                                 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
 368                                         no_way_out = 1;
 369                                 kill_it = 1;
 370                         }
 371                 } else {
 372                         /*
 373                          * Machine check event was not enabled. Clear, but
 374                          * ignore.
 375                          */
 376                         continue;
 377                 }
 378
 379                 if (m.status & MCI_STATUS_MISCV)
 380                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 381                 if (m.status & MCI_STATUS_ADDRV)
 382                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 383
 384                 mce_get_rip(&m, regs);
 385                 mce_log(&m);
 386
 387                 /*
 388                  * Did this bank cause the exception?
 389                  *
 390                  * Assume that the bank with uncorrectable errors did it,
 391                  * and that there is only a single one:
 392                  */
 393                 if ((m.status & MCI_STATUS_UC) &&
 394                                         (m.status & MCI_STATUS_EN)) {
 395                         panicm = m;
 396                         panicm_found = 1;
 397                 }
 398         }
 399
 400         /*
 401          * If we didn't find an uncorrectable error, pick
 402          * the last one (shouldn't happen, just being safe).
 403          */
 404         if (!panicm_found)
 405                 panicm = m;
 406
 407         /*
 408          * If we have decided that we just CAN'T continue, and the user
 409          * has not set tolerant to an insane level, give up and die.
 410          */
 411         if (no_way_out && tolerant < 3)
 412                 mce_panic("Machine check", &panicm, mcestart);
 413
 414         /*
 415          * If the error seems to be unrecoverable, something should be
 416          * done.  Try to kill as little as possible.  If we can kill just
 417          * one task, do that.  If the user has set the tolerance very
 418          * high, don't try to do anything at all.
 419          */
 420         if (kill_it && tolerant < 3) {
 421                 int user_space = 0;
 422
 423                 /*
 424                  * If the EIPV bit is set, it means the saved IP is the
 425                  * instruction which caused the MCE.
 426                  */
 427                 if (m.mcgstatus & MCG_STATUS_EIPV)
 428                         user_space = panicm.ip && (panicm.cs & 3);
 429
 430                 /*
 431                  * If we know that the error was in user space, send a
 432                  * SIGBUS.  Otherwise, panic if tolerance is low.
 433                  *
 434                  * force_sig() takes an awful lot of locks and has a slight
 435                  * risk of deadlocking.
 436                  */
 437                 if (user_space) {
 438                         force_sig(SIGBUS, current);
 439                 } else if (panic_on_oops || tolerant < 2) {
 440                         mce_panic("Uncorrected machine check",
 441                                 &panicm, mcestart);
 442                 }
 443         }
 444
 445         /* notify userspace ASAP */
 446         set_thread_flag(TIF_MCE_NOTIFY);
 447
 448         /* the last thing we do is clear state */
 449         for (i = 0; i < banks; i++) {
 450                 if (test_bit(i, toclear))
 451                         wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 452         }
 453         wrmsrl(MSR_IA32_MCG_STATUS, 0);
 454  out2:
 455         atomic_dec(&mce_entry);
 456 }
 457
 458 #ifdef CONFIG_X86_MCE_INTEL
 459 /***
 460  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 461  * @cpu: The CPU on which the event occurred.
 462  * @status: Event status information
 463  *
 464  * This function should be called by the thermal interrupt after the
 465  * event has been processed and the decision was made to log the event
 466  * further.
 467  *
 468  * The status parameter will be saved to the 'status' field of 'struct mce'
 469  * and historically has been the register value of the
 470  * MSR_IA32_THERMAL_STATUS (Intel) msr.
 471  */
 472 void mce_log_therm_throt_event(__u64 status)
 473 {
 474         struct mce m;
 475
 476         mce_setup(&m);
 477         m.bank = MCE_THERMAL_BANK;
 478         m.status = status;
 479         mce_log(&m);
 480 }
 481 #endif /* CONFIG_X86_MCE_INTEL */
 482
 483 /*
 484  * Periodic polling timer for "silent" machine check errors.  If the
 485  * poller finds an MCE, poll 2x faster.  When the poller finds no more
 486  * errors, poll 2x slower (up to check_interval seconds).
 487  */
 488 static int check_interval = 5 * 60; /* 5 minutes */
 489
 490 static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 491 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 492
 493 static void mcheck_timer(unsigned long data)
 494 {
 495         struct timer_list *t = &per_cpu(mce_timer, data);
 496         int *n;
 497
 498         WARN_ON(smp_processor_id() != data);
 499
 500         if (mce_available(&current_cpu_data)) {
 501                 machine_check_poll(MCP_TIMESTAMP,
 502                                 &__get_cpu_var(mce_poll_banks));
 503         }
 504
 505         /*
 506          * Alert userspace if needed.  If we logged an MCE, reduce the
 507          * polling interval, otherwise increase the polling interval.
 508          */
 509         n = &__get_cpu_var(next_interval);
 510         if (mce_notify_user()) {
 511                 *n = max(*n/2, HZ/100);
 512         } else {
 513                 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
 514         }
 515
 516         t->expires = jiffies + *n;
 517         add_timer(t);
 518 }
 519
 520 static void mce_do_trigger(struct work_struct *work)
 521 {
 522         call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 523 }
 524
 525 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
 526
 527 /*
 528  * Notify the user(s) about new machine check events.
 529  * Can be called from interrupt context, but not from machine check/NMI
 530  * context.
 531  */
 532 int mce_notify_user(void)
 533 {
 534         /* Not more than two messages every minute */
 535         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 536
 537         clear_thread_flag(TIF_MCE_NOTIFY);
 538
 539         if (test_and_clear_bit(0, &notify_user)) {
 540                 wake_up_interruptible(&mce_wait);
 541
 542                 /*
 543                  * There is no risk of missing notifications because
 544                  * work_pending is always cleared before the function is
 545                  * executed.
 546                  */
 547                 if (trigger[0] && !work_pending(&mce_trigger_work))
 548                         schedule_work(&mce_trigger_work);
 549
 550                 if (__ratelimit(&ratelimit))
 551                         printk(KERN_INFO "Machine check events logged\n");
 552
 553                 return 1;
 554         }
 555         return 0;
 556 }
 557
 558 /*
 559  * Initialize Machine Checks for a CPU.
 560  */
 561 static int mce_cap_init(void)
 562 {
 563         unsigned b;
 564         u64 cap;
 565
 566         rdmsrl(MSR_IA32_MCG_CAP, cap);
 567
 568         b = cap & MCG_BANKCNT_MASK;
 569         printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
 570
 571         if (b > MAX_NR_BANKS) {
 572                 printk(KERN_WARNING
 573                        "MCE: Using only %u machine check banks out of %u\n",
 574                         MAX_NR_BANKS, b);
 575                 b = MAX_NR_BANKS;
 576         }
 577
 578         /* Don't support asymmetric configurations today */
 579         WARN_ON(banks != 0 && b != banks);
 580         banks = b;
 581         if (!bank) {
 582                 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
 583                 if (!bank)
 584                         return -ENOMEM;
 585                 memset(bank, 0xff, banks * sizeof(u64));
 586         }
 587
 588         /* Use accurate RIP reporting if available. */
 589         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
 590                 rip_msr = MSR_IA32_MCG_EIP;
 591
 592         return 0;
 593 }
 594
 595 static void mce_init(void *dummy)
 596 {
 597         mce_banks_t all_banks;
 598         u64 cap;
 599         int i;
 600
 601         /*
 602          * Log the machine checks left over from the previous reset.
 603          */
 604         bitmap_fill(all_banks, MAX_NR_BANKS);
 605         machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 606
 607         set_in_cr4(X86_CR4_MCE);
 608
 609         rdmsrl(MSR_IA32_MCG_CAP, cap);
 610         if (cap & MCG_CTL_P)
 611                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 612
 613         for (i = 0; i < banks; i++) {
 614                 if (skip_bank_init(i))
 615                         continue;
 616                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 617                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 618         }
 619 }
 620
 621 /* Add per CPU specific workarounds here */
 622 static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 623 {
 624         /* This should be disabled by the BIOS, but isn't always */
 625         if (c->x86_vendor == X86_VENDOR_AMD) {
 626                 if (c->x86 == 15 && banks > 4) {
 627                         /*
 628                          * disable GART TBL walk error reporting, which
 629                          * trips off incorrectly with the IOMMU & 3ware
 630                          * & Cerberus:
 631                          */
 632                         clear_bit(10, (unsigned long *)&bank[4]);
 633                 }
 634                 if (c->x86 <= 17 && mce_bootlog < 0) {
 635                         /*
 636                          * Lots of broken BIOS around that don't clear them
 637                          * by default and leave crap in there. Don't log:
 638                          */
 639                         mce_bootlog = 0;
 640                 }
 641                 /*
 642                  * Various K7s with broken bank 0 around. Always disable
 643                  * by default.
 644                  */
 645                  if (c->x86 == 6)
 646                         bank[0] = 0;
 647         }
 648
 649         if (c->x86_vendor == X86_VENDOR_INTEL) {
 650                 /*
 651                  * SDM documents that on family 6 bank 0 should not be written
 652                  * because it aliases to another special BIOS controlled
 653                  * register.
 654                  * But it's not aliased anymore on model 0x1a+
 655                  * Don't ignore bank 0 completely because there could be a
 656                  * valid event later, merely don't write CTL0.
 657                  */
 658
 659                 if (c->x86 == 6 && c->x86_model < 0x1A)
 660                         __set_bit(0, &dont_init_banks);
 661         }
 662 }
 663
 664 static void mce_cpu_features(struct cpuinfo_x86 *c)
 665 {
 666         switch (c->x86_vendor) {
 667         case X86_VENDOR_INTEL:
 668                 mce_intel_feature_init(c);
 669                 break;
 670         case X86_VENDOR_AMD:
 671                 mce_amd_feature_init(c);
 672                 break;
 673         default:
 674                 break;
 675         }
 676 }
 677
 678 static void mce_init_timer(void)
 679 {
 680         struct timer_list *t = &__get_cpu_var(mce_timer);
 681         int *n = &__get_cpu_var(next_interval);
 682
 683         *n = check_interval * HZ;
 684         if (!*n)
 685                 return;
 686         setup_timer(t, mcheck_timer, smp_processor_id());
 687         t->expires = round_jiffies(jiffies + *n);
 688         add_timer(t);
 689 }
 690
 691 /*
 692  * Called for each booted CPU to set up machine checks.
 693  * Must be called with preempt off:
 694  */
 695 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 696 {
 697         if (!mce_available(c))
 698                 return;
 699
 700         if (mce_cap_init() < 0) {
 701                 mce_disabled = 1;
 702                 return;
 703         }
 704         mce_cpu_quirks(c);
 705
 706         machine_check_vector = do_machine_check;
 707
 708         mce_init(NULL);
 709         mce_cpu_features(c);
 710         mce_init_timer();
 711 }
 712
 713 /*
 714  * Character device to read and clear the MCE log.
 715  */
 716
 717 static DEFINE_SPINLOCK(mce_state_lock);
 718 static int              open_count;             /* #times opened */
 719 static int              open_exclu;             /* already open exclusive? */
 720
 721 static int mce_open(struct inode *inode, struct file *file)
 722 {
 723         lock_kernel();
 724         spin_lock(&mce_state_lock);
 725
 726         if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 727                 spin_unlock(&mce_state_lock);
 728                 unlock_kernel();
 729
 730                 return -EBUSY;
 731         }
 732
 733         if (file->f_flags & O_EXCL)
 734                 open_exclu = 1;
 735         open_count++;
 736
 737         spin_unlock(&mce_state_lock);
 738         unlock_kernel();
 739
 740         return nonseekable_open(inode, file);
 741 }
 742
 743 static int mce_release(struct inode *inode, struct file *file)
 744 {
 745         spin_lock(&mce_state_lock);
 746
 747         open_count--;
 748         open_exclu = 0;
 749
 750         spin_unlock(&mce_state_lock);
 751
 752         return 0;
 753 }
 754
 755 static void collect_tscs(void *data)
 756 {
 757         unsigned long *cpu_tsc = (unsigned long *)data;
 758
 759         rdtscll(cpu_tsc[smp_processor_id()]);
 760 }
 761
 762 static DEFINE_MUTEX(mce_read_mutex);
 763
 764 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 765                         loff_t *off)
 766 {
 767         char __user *buf = ubuf;
 768         unsigned long *cpu_tsc;
 769         unsigned prev, next;
 770         int i, err;
 771
 772         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
 773         if (!cpu_tsc)
 774                 return -ENOMEM;
 775
 776         mutex_lock(&mce_read_mutex);
 777         next = rcu_dereference(mcelog.next);
 778
 779         /* Only supports full reads right now */
 780         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
 781                 mutex_unlock(&mce_read_mutex);
 782                 kfree(cpu_tsc);
 783
 784                 return -EINVAL;
 785         }
 786
 787         err = 0;
 788         prev = 0;
 789         do {
 790                 for (i = prev; i < next; i++) {
 791                         unsigned long start = jiffies;
 792
 793                         while (!mcelog.entry[i].finished) {
 794                                 if (time_after_eq(jiffies, start + 2)) {
 795                                         memset(mcelog.entry + i, 0,
 796                                                sizeof(struct mce));
 797                                         goto timeout;
 798                                 }
 799                                 cpu_relax();
 800                         }
 801                         smp_rmb();
 802                         err |= copy_to_user(buf, mcelog.entry + i,
 803                                             sizeof(struct mce));
 804                         buf += sizeof(struct mce);
 805 timeout:
 806                         ;
 807                 }
 808
 809                 memset(mcelog.entry + prev, 0,
 810                        (next - prev) * sizeof(struct mce));
 811                 prev = next;
 812                 next = cmpxchg(&mcelog.next, prev, 0);
 813         } while (next != prev);
 814
 815         synchronize_sched();
 816
 817         /*
 818          * Collect entries that were still getting written before the
 819          * synchronize.
 820          */
 821         on_each_cpu(collect_tscs, cpu_tsc, 1);
 822
 823         for (i = next; i < MCE_LOG_LEN; i++) {
 824                 if (mcelog.entry[i].finished &&
 825                     mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
 826                         err |= copy_to_user(buf, mcelog.entry+i,
 827                                             sizeof(struct mce));
 828                         smp_rmb();
 829                         buf += sizeof(struct mce);
 830                         memset(&mcelog.entry[i], 0, sizeof(struct mce));
 831                 }
 832         }
 833         mutex_unlock(&mce_read_mutex);
 834         kfree(cpu_tsc);
 835
 836         return err ? -EFAULT : buf - ubuf;
 837 }
 838
 839 static unsigned int mce_poll(struct file *file, poll_table *wait)
 840 {
 841         poll_wait(file, &mce_wait, wait);
 842         if (rcu_dereference(mcelog.next))
 843                 return POLLIN | POLLRDNORM;
 844         return 0;
 845 }
 846
 847 static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 848 {
 849         int __user *p = (int __user *)arg;
 850
 851         if (!capable(CAP_SYS_ADMIN))
 852                 return -EPERM;
 853
 854         switch (cmd) {
 855         case MCE_GET_RECORD_LEN:
 856                 return put_user(sizeof(struct mce), p);
 857         case MCE_GET_LOG_LEN:
 858                 return put_user(MCE_LOG_LEN, p);
 859         case MCE_GETCLEAR_FLAGS: {
 860                 unsigned flags;
 861
 862                 do {
 863                         flags = mcelog.flags;
 864                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 865
 866                 return put_user(flags, p);
 867         }
 868         default:
 869                 return -ENOTTY;
 870         }
 871 }
 872
 873 static const struct file_operations mce_chrdev_ops = {
 874         .open                   = mce_open,
 875         .release                = mce_release,
 876         .read                   = mce_read,
 877         .poll                   = mce_poll,
 878         .unlocked_ioctl         = mce_ioctl,
 879 };
 880
 881 static struct miscdevice mce_log_device = {
 882         MISC_MCELOG_MINOR,
 883         "mcelog",
 884         &mce_chrdev_ops,
 885 };
 886
 887 /*
 888  * mce=off disables machine check
 889  * mce=TOLERANCELEVEL (number, see above)
 890  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 891  * mce=nobootlog Don't log MCEs from before booting.
 892  */
 893 static int __init mcheck_enable(char *str)
 894 {
 895         if (!strcmp(str, "off"))
 896                 mce_disabled = 1;
 897         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
 898                 mce_bootlog = (str[0] == 'b');
 899         else if (isdigit(str[0]))
 900                 get_option(&str, &tolerant);
 901         else {
 902                 printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
 903                        str);
 904                 return 0;
 905         }
 906         return 1;
 907 }
 908 __setup("mce=", mcheck_enable);
 909
 910 /*
 911  * Sysfs support
 912  */
 913
 914 /*
 915  * Disable machine checks on suspend and shutdown. We can't really handle
 916  * them later.
 917  */
 918 static int mce_disable(void)
 919 {
 920         int i;
 921
 922         for (i = 0; i < banks; i++) {
 923                 if (!skip_bank_init(i))
 924                         wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
 925         }
 926         return 0;
 927 }
 928
 929 static int mce_suspend(struct sys_device *dev, pm_message_t state)
 930 {
 931         return mce_disable();
 932 }
 933
 934 static int mce_shutdown(struct sys_device *dev)
 935 {
 936         return mce_disable();
 937 }
 938
 939 /*
 940  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 941  * Only one CPU is active at this time, the others get re-added later using
 942  * CPU hotplug:
 943  */
 944 static int mce_resume(struct sys_device *dev)
 945 {
 946         mce_init(NULL);
 947         mce_cpu_features(&current_cpu_data);
 948
 949         return 0;
 950 }
 951
 952 static void mce_cpu_restart(void *data)
 953 {
 954         del_timer_sync(&__get_cpu_var(mce_timer));
 955         if (mce_available(&current_cpu_data))
 956                 mce_init(NULL);
 957         mce_init_timer();
 958 }
 959
 960 /* Reinit MCEs after user configuration changes */
 961 static void mce_restart(void)
 962 {
 963         on_each_cpu(mce_cpu_restart, NULL, 1);
 964 }
 965
 966 static struct sysdev_class mce_sysclass = {
 967         .suspend        = mce_suspend,
 968         .shutdown       = mce_shutdown,
 969         .resume         = mce_resume,
 970         .name           = "machinecheck",
 971 };
 972
 973 DEFINE_PER_CPU(struct sys_device, mce_dev);
 974
 975 __cpuinitdata
 976 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 977
 978 /* Why are there no generic functions for this? */
 979 #define ACCESSOR(name, var, start) \
 980         static ssize_t show_ ## name(struct sys_device *s,              \
 981                                      struct sysdev_attribute *attr,     \
 982                                      char *buf) {                       \
 983                 return sprintf(buf, "%Lx\n", (u64)var);                 \
 984         }                                                               \
 985         static ssize_t set_ ## name(struct sys_device *s,               \
 986                                     struct sysdev_attribute *attr,      \
 987                                     const char *buf, size_t siz) {      \
 988                 char *end;                                              \
 989                 u64 new = simple_strtoull(buf, &end, 0);                \
 990                                                                         \
 991                 if (end == buf)                                         \
 992                         return -EINVAL;                                 \
 993                 var = new;                                              \
 994                 start;                                                  \
 995                                                                         \
 996                 return end-buf;                                         \
 997         }                                                               \
 998         static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 999
1000 static struct sysdev_attribute *bank_attrs;
1001
1002 static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1003                          char *buf)
1004 {
1005         u64 b = bank[attr - bank_attrs];
1006
1007         return sprintf(buf, "%llx\n", b);
1008 }
1009
1010 static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1011                         const char *buf, size_t siz)
1012 {
1013         char *end;
1014         u64 new = simple_strtoull(buf, &end, 0);
1015
1016         if (end == buf)
1017                 return -EINVAL;
1018
1019         bank[attr - bank_attrs] = new;
1020         mce_restart();
1021
1022         return end-buf;
1023 }
1024
1025 static ssize_t
1026 show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1027 {
1028         strcpy(buf, trigger);
1029         strcat(buf, "\n");
1030         return strlen(trigger) + 1;
1031 }
1032
1033 static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1034                                 const char *buf, size_t siz)
1035 {
1036         char *p;
1037         int len;
1038
1039         strncpy(trigger, buf, sizeof(trigger));
1040         trigger[sizeof(trigger)-1] = 0;
1041         len = strlen(trigger);
1042         p = strchr(trigger, '\n');
1043
1044         if (*p)
1045                 *p = 0;
1046
1047         return len;
1048 }
1049
1050 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1051 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1052
1053 ACCESSOR(check_interval, check_interval, mce_restart())
1054
1055 static struct sysdev_attribute *mce_attrs[] = {
1056         &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
1057         NULL
1058 };
1059
1060 static cpumask_var_t mce_dev_initialized;
1061
1062 /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1063 static __cpuinit int mce_create_device(unsigned int cpu)
1064 {
1065         int err;
1066         int i;
1067
1068         if (!mce_available(&boot_cpu_data))
1069                 return -EIO;
1070
1071         memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1072         per_cpu(mce_dev, cpu).id        = cpu;
1073         per_cpu(mce_dev, cpu).cls       = &mce_sysclass;
1074
1075         err = sysdev_register(&per_cpu(mce_dev, cpu));
1076         if (err)
1077                 return err;
1078
1079         for (i = 0; mce_attrs[i]; i++) {
1080                 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1081                 if (err)
1082                         goto error;
1083         }
1084         for (i = 0; i < banks; i++) {
1085                 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1086                                         &bank_attrs[i]);
1087                 if (err)
1088                         goto error2;
1089         }
1090         cpumask_set_cpu(cpu, mce_dev_initialized);
1091
1092         return 0;
1093 error2:
1094         while (--i >= 0)
1095                 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1096 error:
1097         while (--i >= 0)
1098                 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1099
1100         sysdev_unregister(&per_cpu(mce_dev, cpu));
1101
1102         return err;
1103 }
1104
1105 static __cpuinit void mce_remove_device(unsigned int cpu)
1106 {
1107         int i;
1108
1109         if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1110                 return;
1111
1112         for (i = 0; mce_attrs[i]; i++)
1113                 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1114
1115         for (i = 0; i < banks; i++)
1116                 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1117
1118         sysdev_unregister(&per_cpu(mce_dev, cpu));
1119         cpumask_clear_cpu(cpu, mce_dev_initialized);
1120 }
1121
1122 /* Make sure there are no machine checks on offlined CPUs. */
1123 static void mce_disable_cpu(void *h)
1124 {
1125         unsigned long action = *(unsigned long *)h;
1126         int i;
1127
1128         if (!mce_available(&current_cpu_data))
1129                 return;
1130         if (!(action & CPU_TASKS_FROZEN))
1131                 cmci_clear();
1132         for (i = 0; i < banks; i++) {
1133                 if (!skip_bank_init(i))
1134                         wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1135         }
1136 }
1137
1138 static void mce_reenable_cpu(void *h)
1139 {
1140         unsigned long action = *(unsigned long *)h;
1141         int i;
1142
1143         if (!mce_available(&current_cpu_data))
1144                 return;
1145
1146         if (!(action & CPU_TASKS_FROZEN))
1147                 cmci_reenable();
1148         for (i = 0; i < banks; i++) {
1149                 if (!skip_bank_init(i))
1150                         wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1151         }
1152 }
1153
1154 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
1155 static int __cpuinit
1156 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1157 {
1158         unsigned int cpu = (unsigned long)hcpu;
1159         struct timer_list *t = &per_cpu(mce_timer, cpu);
1160
1161         switch (action) {
1162         case CPU_ONLINE:
1163         case CPU_ONLINE_FROZEN:
1164                 mce_create_device(cpu);
1165                 if (threshold_cpu_callback)
1166                         threshold_cpu_callback(action, cpu);
1167                 break;
1168         case CPU_DEAD:
1169         case CPU_DEAD_FROZEN:
1170                 if (threshold_cpu_callback)
1171                         threshold_cpu_callback(action, cpu);
1172                 mce_remove_device(cpu);
1173                 break;
1174         case CPU_DOWN_PREPARE:
1175         case CPU_DOWN_PREPARE_FROZEN:
1176                 del_timer_sync(t);
1177                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1178                 break;
1179         case CPU_DOWN_FAILED:
1180         case CPU_DOWN_FAILED_FROZEN:
1181                 t->expires = round_jiffies(jiffies +
1182                                                 __get_cpu_var(next_interval));
1183                 add_timer_on(t, cpu);
1184                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1185                 break;
1186         case CPU_POST_DEAD:
1187                 /* intentionally ignoring frozen here */
1188                 cmci_rediscover(cpu);
1189                 break;
1190         }
1191         return NOTIFY_OK;
1192 }
1193
1194 static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1195         .notifier_call = mce_cpu_callback,
1196 };
1197
1198 static __init int mce_init_banks(void)
1199 {
1200         int i;
1201
1202         bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1203                                 GFP_KERNEL);
1204         if (!bank_attrs)
1205                 return -ENOMEM;
1206
1207         for (i = 0; i < banks; i++) {
1208                 struct sysdev_attribute *a = &bank_attrs[i];
1209
1210                 a->attr.name    = kasprintf(GFP_KERNEL, "bank%d", i);
1211                 if (!a->attr.name)
1212                         goto nomem;
1213
1214                 a->attr.mode    = 0644;
1215                 a->show         = show_bank;
1216                 a->store        = set_bank;
1217         }
1218         return 0;
1219
1220 nomem:
1221         while (--i >= 0)
1222                 kfree(bank_attrs[i].attr.name);
1223         kfree(bank_attrs);
1224         bank_attrs = NULL;
1225
1226         return -ENOMEM;
1227 }
1228
1229 static __init int mce_init_device(void)
1230 {
1231         int err;
1232         int i = 0;
1233
1234         if (!mce_available(&boot_cpu_data))
1235                 return -EIO;
1236
1237         alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1238
1239         err = mce_init_banks();
1240         if (err)
1241                 return err;
1242
1243         err = sysdev_class_register(&mce_sysclass);
1244         if (err)
1245                 return err;
1246
1247         for_each_online_cpu(i) {
1248                 err = mce_create_device(i);
1249                 if (err)
1250                         return err;
1251         }
1252
1253         register_hotcpu_notifier(&mce_cpu_notifier);
1254         misc_register(&mce_log_device);
1255
1256         return err;
1257 }
1258
1259 device_initcall(mce_init_device);
1260
1261 #else /* CONFIG_X86_32: */
1262
1263 int nr_mce_banks;
1264 EXPORT_SYMBOL_GPL(nr_mce_banks);        /* non-fatal.o */
1265
1266 /* This has to be run for each processor */
1267 void mcheck_init(struct cpuinfo_x86 *c)
1268 {
1269         if (mce_disabled == 1)
1270                 return;
1271
1272         switch (c->x86_vendor) {
1273         case X86_VENDOR_AMD:
1274                 amd_mcheck_init(c);
1275                 break;
1276
1277         case X86_VENDOR_INTEL:
1278                 if (c->x86 == 5)
1279                         intel_p5_mcheck_init(c);
1280                 if (c->x86 == 6)
1281                         intel_p6_mcheck_init(c);
1282                 if (c->x86 == 15)
1283                         intel_p4_mcheck_init(c);
1284                 break;
1285
1286         case X86_VENDOR_CENTAUR:
1287                 if (c->x86 == 5)
1288                         winchip_mcheck_init(c);
1289                 break;
1290
1291         default:
1292                 break;
1293         }
1294         printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1295 }
1296
1297 static int __init mcheck_enable(char *str)
1298 {
1299         mce_disabled = -1;
1300         return 1;
1301 }
1302
1303 __setup("mce", mcheck_enable);
1304
#endif /* CONFIG_X86_64 */
1306
1307 /*
1308  * Old style boot options parsing. Only for compatibility.
1309  */
1310 static int __init mcheck_disable(char *str)
1311 {
1312         mce_disabled = 1;
1313         return 1;
1314 }
1315 __setup("nomce", mcheck_disable);