arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10 #include <linux/thread_info.h>
  11 #include <linux/capability.h>
  12 #include <linux/miscdevice.h>
  13 #include <linux/ratelimit.h>
  14 #include <linux/kallsyms.h>
  15 #include <linux/rcupdate.h>
  16 #include <linux/smp_lock.h>
  17 #include <linux/kobject.h>
  18 #include <linux/kdebug.h>
  19 #include <linux/kernel.h>
  20 #include <linux/percpu.h>
  21 #include <linux/string.h>
  22 #include <linux/sysdev.h>
  23 #include <linux/ctype.h>
  24 #include <linux/sched.h>
  25 #include <linux/sysfs.h>
  26 #include <linux/types.h>
  27 #include <linux/init.h>
  28 #include <linux/kmod.h>
  29 #include <linux/poll.h>
  30 #include <linux/cpu.h>
  31 #include <linux/fs.h>
  32
  33 #include <asm/processor.h>
  34 #include <asm/uaccess.h>
  35 #include <asm/idle.h>
  36 #include <asm/mce.h>
  37 #include <asm/msr.h>
  38 #include <asm/smp.h>
  39
  40 #include "mce.h"
  41
  42 /* Handle unconfigured int18 (should never happen) */
  43 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
  44 {
  45         printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
  46                smp_processor_id());
  47 }
  48
  49 /* Call the installed machine check handler for this CPU setup. */
  50 void (*machine_check_vector)(struct pt_regs *, long error_code) =
  51                                                 unexpected_machine_check;
  52 #ifdef CONFIG_X86_64
  53
  54 #define MISC_MCELOG_MINOR       227
  55
  56 atomic_t mce_entry;
  57
  58 static int                      mce_dont_init;
  59
  60 /*
  61  * Tolerant levels:
  62  *   0: always panic on uncorrected errors, log corrected errors
  63  *   1: panic or SIGBUS on uncorrected errors, log corrected errors
  64  *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
  65  *   3: never panic or SIGBUS, log all errors (for testing only)
  66  */
  67 static int                      tolerant = 1;
  68 static int                      banks;
  69 static u64                      *bank;
  70 static unsigned long            notify_user;
  71 static int                      rip_msr;
  72 static int                      mce_bootlog = -1;
  73 static atomic_t                 mce_events;
  74
  75 static char                     trigger[128];
  76 static char                     *trigger_argv[2] = { trigger, NULL };
  77
  78 static unsigned long            dont_init_banks;
  79
  80 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  81
  82 /* MCA banks polled by the period polling timer for corrected events */
  83 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  84         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  85 };
  86
  87 static inline int skip_bank_init(int i)
  88 {
  89         return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
  90 }
  91
  92 /* Do initial initialization of a struct mce */
  93 void mce_setup(struct mce *m)
  94 {
  95         memset(m, 0, sizeof(struct mce));
  96         m->cpu = smp_processor_id();
  97         rdtscll(m->tsc);
  98 }
  99
 100 /*
 101  * Lockless MCE logging infrastructure.
 102  * This avoids deadlocks on printk locks without having to break locks. Also
 103  * separate MCEs from kernel messages to avoid bogus bug reports.
 104  */
 105
 106 static struct mce_log mcelog = {
 107         MCE_LOG_SIGNATURE,
 108         MCE_LOG_LEN,
 109 };
 110
 111 void mce_log(struct mce *mce)
 112 {
 113         unsigned next, entry;
 114
 115         atomic_inc(&mce_events);
 116         mce->finished = 0;
 117         wmb();
 118         for (;;) {
 119                 entry = rcu_dereference(mcelog.next);
 120                 for (;;) {
 121                         /*
 122                          * When the buffer fills up discard new entries.
 123                          * Assume that the earlier errors are the more
 124                          * interesting ones:
 125                          */
 126                         if (entry >= MCE_LOG_LEN) {
 127                                 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
 128                                 return;
 129                         }
 130                         /* Old left over entry. Skip: */
 131                         if (mcelog.entry[entry].finished) {
 132                                 entry++;
 133                                 continue;
 134                         }
 135                         break;
 136                 }
 137                 smp_rmb();
 138                 next = entry + 1;
 139                 if (cmpxchg(&mcelog.next, entry, next) == entry)
 140                         break;
 141         }
 142         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 143         wmb();
 144         mcelog.entry[entry].finished = 1;
 145         wmb();
 146
 147         set_bit(0, &notify_user);
 148 }
 149
 150 static void print_mce(struct mce *m)
 151 {
 152         printk(KERN_EMERG "\n"
 153                KERN_EMERG "HARDWARE ERROR\n"
 154                KERN_EMERG
 155                "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
 156                m->cpu, m->mcgstatus, m->bank, m->status);
 157         if (m->ip) {
 158                 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
 159                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 160                        m->cs, m->ip);
 161                 if (m->cs == __KERNEL_CS)
 162                         print_symbol("{%s}", m->ip);
 163                 printk("\n");
 164         }
 165         printk(KERN_EMERG "TSC %llx ", m->tsc);
 166         if (m->addr)
 167                 printk("ADDR %llx ", m->addr);
 168         if (m->misc)
 169                 printk("MISC %llx ", m->misc);
 170         printk("\n");
 171         printk(KERN_EMERG "This is not a software problem!\n");
 172         printk(KERN_EMERG "Run through mcelog --ascii to decode "
 173                "and contact your hardware vendor\n");
 174 }
 175
 176 static void mce_panic(char *msg, struct mce *backup, u64 start)
 177 {
 178         int i;
 179
 180         oops_begin();
 181         for (i = 0; i < MCE_LOG_LEN; i++) {
 182                 u64 tsc = mcelog.entry[i].tsc;
 183
 184                 if ((s64)(tsc - start) < 0)
 185                         continue;
 186                 print_mce(&mcelog.entry[i]);
 187                 if (backup && mcelog.entry[i].tsc == backup->tsc)
 188                         backup = NULL;
 189         }
 190         if (backup)
 191                 print_mce(backup);
 192         panic(msg);
 193 }
 194
 195 int mce_available(struct cpuinfo_x86 *c)
 196 {
 197         if (mce_dont_init)
 198                 return 0;
 199         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 200 }
 201
 202 static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 203 {
 204         if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
 205                 m->ip = regs->ip;
 206                 m->cs = regs->cs;
 207         } else {
 208                 m->ip = 0;
 209                 m->cs = 0;
 210         }
 211         if (rip_msr) {
 212                 /* Assume the RIP in the MSR is exact. Is this true? */
 213                 m->mcgstatus |= MCG_STATUS_EIPV;
 214                 rdmsrl(rip_msr, m->ip);
 215                 m->cs = 0;
 216         }
 217 }
 218
 219 /*
 220  * Poll for corrected events or events that happened before reset.
 221  * Those are just logged through /dev/mcelog.
 222  *
 223  * This is executed in standard interrupt context.
 224  */
 225 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 226 {
 227         struct mce m;
 228         int i;
 229
 230         mce_setup(&m);
 231
 232         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 233         for (i = 0; i < banks; i++) {
 234                 if (!bank[i] || !test_bit(i, *b))
 235                         continue;
 236
 237                 m.misc = 0;
 238                 m.addr = 0;
 239                 m.bank = i;
 240                 m.tsc = 0;
 241
 242                 barrier();
 243                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 244                 if (!(m.status & MCI_STATUS_VAL))
 245                         continue;
 246
 247                 /*
 248                  * Uncorrected events are handled by the exception handler
 249                  * when it is enabled. But when the exception is disabled log
 250                  * everything.
 251                  *
 252                  * TBD do the same check for MCI_STATUS_EN here?
 253                  */
 254                 if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
 255                         continue;
 256
 257                 if (m.status & MCI_STATUS_MISCV)
 258                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 259                 if (m.status & MCI_STATUS_ADDRV)
 260                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 261
 262                 if (!(flags & MCP_TIMESTAMP))
 263                         m.tsc = 0;
 264                 /*
 265                  * Don't get the IP here because it's unlikely to
 266                  * have anything to do with the actual error location.
 267                  */
 268                 if (!(flags & MCP_DONTLOG)) {
 269                         mce_log(&m);
 270                         add_taint(TAINT_MACHINE_CHECK);
 271                 }
 272
 273                 /*
 274                  * Clear state for this bank.
 275                  */
 276                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 277         }
 278
 279         /*
 280          * Don't clear MCG_STATUS here because it's only defined for
 281          * exceptions.
 282          */
 283 }
 284
 285 /*
 286  * The actual machine check handler. This only handles real
 287  * exceptions when something got corrupted coming in through int 18.
 288  *
 289  * This is executed in NMI context not subject to normal locking rules. This
 290  * implies that most kernel services cannot be safely used. Don't even
 291  * think about putting a printk in there!
 292  */
 293 void do_machine_check(struct pt_regs *regs, long error_code)
 294 {
 295         struct mce m, panicm;
 296         int panicm_found = 0;
 297         u64 mcestart = 0;
 298         int i;
 299         /*
 300          * If no_way_out gets set, there is no safe way to recover from this
 301          * MCE.  If tolerant is cranked up, we'll try anyway.
 302          */
 303         int no_way_out = 0;
 304         /*
 305          * If kill_it gets set, there might be a way to recover from this
 306          * error.
 307          */
 308         int kill_it = 0;
 309         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 310
 311         atomic_inc(&mce_entry);
 312
 313         if (notify_die(DIE_NMI, "machine check", regs, error_code,
 314                            18, SIGKILL) == NOTIFY_STOP)
 315                 goto out2;
 316         if (!banks)
 317                 goto out2;
 318
 319         mce_setup(&m);
 320
 321         rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 322
 323         /* if the restart IP is not valid, we're done for */
 324         if (!(m.mcgstatus & MCG_STATUS_RIPV))
 325                 no_way_out = 1;
 326
 327         rdtscll(mcestart);
 328         barrier();
 329
 330         for (i = 0; i < banks; i++) {
 331                 __clear_bit(i, toclear);
 332                 if (!bank[i])
 333                         continue;
 334
 335                 m.misc = 0;
 336                 m.addr = 0;
 337                 m.bank = i;
 338
 339                 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 340                 if ((m.status & MCI_STATUS_VAL) == 0)
 341                         continue;
 342
 343                 /*
 344                  * Non uncorrected errors are handled by machine_check_poll
 345                  * Leave them alone.
 346                  */
 347                 if ((m.status & MCI_STATUS_UC) == 0)
 348                         continue;
 349
 350                 /*
 351                  * Set taint even when machine check was not enabled.
 352                  */
 353                 add_taint(TAINT_MACHINE_CHECK);
 354
 355                 __set_bit(i, toclear);
 356
 357                 if (m.status & MCI_STATUS_EN) {
 358                         /* if PCC was set, there's no way out */
 359                         no_way_out |= !!(m.status & MCI_STATUS_PCC);
 360                         /*
 361                          * If this error was uncorrectable and there was
 362                          * an overflow, we're in trouble.  If no overflow,
 363                          * we might get away with just killing a task.
 364                          */
 365                         if (m.status & MCI_STATUS_UC) {
 366                                 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
 367                                         no_way_out = 1;
 368                                 kill_it = 1;
 369                         }
 370                 } else {
 371                         /*
 372                          * Machine check event was not enabled. Clear, but
 373                          * ignore.
 374                          */
 375                         continue;
 376                 }
 377
 378                 if (m.status & MCI_STATUS_MISCV)
 379                         rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
 380                 if (m.status & MCI_STATUS_ADDRV)
 381                         rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 382
 383                 mce_get_rip(&m, regs);
 384                 mce_log(&m);
 385
 386                 /*
 387                  * Did this bank cause the exception?
 388                  *
 389                  * Assume that the bank with uncorrectable errors did it,
 390                  * and that there is only a single one:
 391                  */
 392                 if ((m.status & MCI_STATUS_UC) &&
 393                                         (m.status & MCI_STATUS_EN)) {
 394                         panicm = m;
 395                         panicm_found = 1;
 396                 }
 397         }
 398
 399         /*
 400          * If we didn't find an uncorrectable error, pick
 401          * the last one (shouldn't happen, just being safe).
 402          */
 403         if (!panicm_found)
 404                 panicm = m;
 405
 406         /*
 407          * If we have decided that we just CAN'T continue, and the user
 408          * has not set tolerant to an insane level, give up and die.
 409          */
 410         if (no_way_out && tolerant < 3)
 411                 mce_panic("Machine check", &panicm, mcestart);
 412
 413         /*
 414          * If the error seems to be unrecoverable, something should be
 415          * done.  Try to kill as little as possible.  If we can kill just
 416          * one task, do that.  If the user has set the tolerance very
 417          * high, don't try to do anything at all.
 418          */
 419         if (kill_it && tolerant < 3) {
 420                 int user_space = 0;
 421
 422                 /*
 423                  * If the EIPV bit is set, it means the saved IP is the
 424                  * instruction which caused the MCE.
 425                  */
 426                 if (m.mcgstatus & MCG_STATUS_EIPV)
 427                         user_space = panicm.ip && (panicm.cs & 3);
 428
 429                 /*
 430                  * If we know that the error was in user space, send a
 431                  * SIGBUS.  Otherwise, panic if tolerance is low.
 432                  *
 433                  * force_sig() takes an awful lot of locks and has a slight
 434                  * risk of deadlocking.
 435                  */
 436                 if (user_space) {
 437                         force_sig(SIGBUS, current);
 438                 } else if (panic_on_oops || tolerant < 2) {
 439                         mce_panic("Uncorrected machine check",
 440                                 &panicm, mcestart);
 441                 }
 442         }
 443
 444         /* notify userspace ASAP */
 445         set_thread_flag(TIF_MCE_NOTIFY);
 446
 447         /* the last thing we do is clear state */
 448         for (i = 0; i < banks; i++) {
 449                 if (test_bit(i, toclear))
 450                         wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 451         }
 452         wrmsrl(MSR_IA32_MCG_STATUS, 0);
 453  out2:
 454         atomic_dec(&mce_entry);
 455 }
 456
 457 #ifdef CONFIG_X86_MCE_INTEL
 458 /***
 459  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 460  * @cpu: The CPU on which the event occurred.
 461  * @status: Event status information
 462  *
 463  * This function should be called by the thermal interrupt after the
 464  * event has been processed and the decision was made to log the event
 465  * further.
 466  *
 467  * The status parameter will be saved to the 'status' field of 'struct mce'
 468  * and historically has been the register value of the
 469  * MSR_IA32_THERMAL_STATUS (Intel) msr.
 470  */
 471 void mce_log_therm_throt_event(__u64 status)
 472 {
 473         struct mce m;
 474
 475         mce_setup(&m);
 476         m.bank = MCE_THERMAL_BANK;
 477         m.status = status;
 478         mce_log(&m);
 479 }
 480 #endif /* CONFIG_X86_MCE_INTEL */
 481
 482 /*
 483  * Periodic polling timer for "silent" machine check errors.  If the
 484  * poller finds an MCE, poll 2x faster.  When the poller finds no more
 485  * errors, poll 2x slower (up to check_interval seconds).
 486  */
 487 static int check_interval = 5 * 60; /* 5 minutes */
 488
 489 static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
 490 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 491
 492 static void mcheck_timer(unsigned long data)
 493 {
 494         struct timer_list *t = &per_cpu(mce_timer, data);
 495         int *n;
 496
 497         WARN_ON(smp_processor_id() != data);
 498
 499         if (mce_available(&current_cpu_data)) {
 500                 machine_check_poll(MCP_TIMESTAMP,
 501                                 &__get_cpu_var(mce_poll_banks));
 502         }
 503
 504         /*
 505          * Alert userspace if needed.  If we logged an MCE, reduce the
 506          * polling interval, otherwise increase the polling interval.
 507          */
 508         n = &__get_cpu_var(next_interval);
 509         if (mce_notify_user()) {
 510                 *n = max(*n/2, HZ/100);
 511         } else {
 512                 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
 513         }
 514
 515         t->expires = jiffies + *n;
 516         add_timer(t);
 517 }
 518
 519 static void mce_do_trigger(struct work_struct *work)
 520 {
 521         call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 522 }
 523
 524 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
 525
 526 /*
 527  * Notify the user(s) about new machine check events.
 528  * Can be called from interrupt context, but not from machine check/NMI
 529  * context.
 530  */
 531 int mce_notify_user(void)
 532 {
 533         /* Not more than two messages every minute */
 534         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 535
 536         clear_thread_flag(TIF_MCE_NOTIFY);
 537
 538         if (test_and_clear_bit(0, &notify_user)) {
 539                 wake_up_interruptible(&mce_wait);
 540
 541                 /*
 542                  * There is no risk of missing notifications because
 543                  * work_pending is always cleared before the function is
 544                  * executed.
 545                  */
 546                 if (trigger[0] && !work_pending(&mce_trigger_work))
 547                         schedule_work(&mce_trigger_work);
 548
 549                 if (__ratelimit(&ratelimit))
 550                         printk(KERN_INFO "Machine check events logged\n");
 551
 552                 return 1;
 553         }
 554         return 0;
 555 }
 556
 557 /* see if the idle task needs to notify userspace: */
 558 static int
 559 mce_idle_callback(struct notifier_block *nfb, unsigned long action,
 560                   void *unused)
 561 {
 562         /* IDLE_END should be safe - interrupts are back on */
 563         if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
 564                 mce_notify_user();
 565
 566         return NOTIFY_OK;
 567 }
 568
 569 static struct notifier_block mce_idle_notifier = {
 570         .notifier_call          = mce_idle_callback,
 571 };
 572
 573 static __init int periodic_mcheck_init(void)
 574 {
 575        idle_notifier_register(&mce_idle_notifier);
 576        return 0;
 577 }
 578 __initcall(periodic_mcheck_init);
 579
 580 /*
 581  * Initialize Machine Checks for a CPU.
 582  */
 583 static int mce_cap_init(void)
 584 {
 585         unsigned b;
 586         u64 cap;
 587
 588         rdmsrl(MSR_IA32_MCG_CAP, cap);
 589
 590         b = cap & MCG_BANKCNT_MASK;
 591         printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
 592
 593         if (b > MAX_NR_BANKS) {
 594                 printk(KERN_WARNING
 595                        "MCE: Using only %u machine check banks out of %u\n",
 596                         MAX_NR_BANKS, b);
 597                 b = MAX_NR_BANKS;
 598         }
 599
 600         /* Don't support asymmetric configurations today */
 601         WARN_ON(banks != 0 && b != banks);
 602         banks = b;
 603         if (!bank) {
 604                 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
 605                 if (!bank)
 606                         return -ENOMEM;
 607                 memset(bank, 0xff, banks * sizeof(u64));
 608         }
 609
 610         /* Use accurate RIP reporting if available. */
 611         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
 612                 rip_msr = MSR_IA32_MCG_EIP;
 613
 614         return 0;
 615 }
 616
 617 static void mce_init(void *dummy)
 618 {
 619         mce_banks_t all_banks;
 620         u64 cap;
 621         int i;
 622
 623         /*
 624          * Log the machine checks left over from the previous reset.
 625          */
 626         bitmap_fill(all_banks, MAX_NR_BANKS);
 627         machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 628
 629         set_in_cr4(X86_CR4_MCE);
 630
 631         rdmsrl(MSR_IA32_MCG_CAP, cap);
 632         if (cap & MCG_CTL_P)
 633                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 634
 635         for (i = 0; i < banks; i++) {
 636                 if (skip_bank_init(i))
 637                         continue;
 638                 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 639                 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 640         }
 641 }
 642
 643 /* Add per CPU specific workarounds here */
 644 static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 645 {
 646         /* This should be disabled by the BIOS, but isn't always */
 647         if (c->x86_vendor == X86_VENDOR_AMD) {
 648                 if (c->x86 == 15 && banks > 4) {
 649                         /*
 650                          * disable GART TBL walk error reporting, which
 651                          * trips off incorrectly with the IOMMU & 3ware
 652                          * & Cerberus:
 653                          */
 654                         clear_bit(10, (unsigned long *)&bank[4]);
 655                 }
 656                 if (c->x86 <= 17 && mce_bootlog < 0) {
 657                         /*
 658                          * Lots of broken BIOS around that don't clear them
 659                          * by default and leave crap in there. Don't log:
 660                          */
 661                         mce_bootlog = 0;
 662                 }
 663                 /*
 664                  * Various K7s with broken bank 0 around. Always disable
 665                  * by default.
 666                  */
 667                  if (c->x86 == 6)
 668                         bank[0] = 0;
 669         }
 670
 671         if (c->x86_vendor == X86_VENDOR_INTEL) {
 672                 /*
 673                  * SDM documents that on family 6 bank 0 should not be written
 674                  * because it aliases to another special BIOS controlled
 675                  * register.
 676                  * But it's not aliased anymore on model 0x1a+
 677                  * Don't ignore bank 0 completely because there could be a
 678                  * valid event later, merely don't write CTL0.
 679                  */
 680
 681                 if (c->x86 == 6 && c->x86_model < 0x1A)
 682                         __set_bit(0, &dont_init_banks);
 683         }
 684 }
 685
 686 static void mce_cpu_features(struct cpuinfo_x86 *c)
 687 {
 688         switch (c->x86_vendor) {
 689         case X86_VENDOR_INTEL:
 690                 mce_intel_feature_init(c);
 691                 break;
 692         case X86_VENDOR_AMD:
 693                 mce_amd_feature_init(c);
 694                 break;
 695         default:
 696                 break;
 697         }
 698 }
 699
 700 static void mce_init_timer(void)
 701 {
 702         struct timer_list *t = &__get_cpu_var(mce_timer);
 703         int *n = &__get_cpu_var(next_interval);
 704
 705         *n = check_interval * HZ;
 706         if (!*n)
 707                 return;
 708         setup_timer(t, mcheck_timer, smp_processor_id());
 709         t->expires = round_jiffies(jiffies + *n);
 710         add_timer(t);
 711 }
 712
 713 /*
 714  * Called for each booted CPU to set up machine checks.
 715  * Must be called with preempt off:
 716  */
 717 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 718 {
 719         if (!mce_available(c))
 720                 return;
 721
 722         if (mce_cap_init() < 0) {
 723                 mce_dont_init = 1;
 724                 return;
 725         }
 726         mce_cpu_quirks(c);
 727
 728         machine_check_vector = do_machine_check;
 729
 730         mce_init(NULL);
 731         mce_cpu_features(c);
 732         mce_init_timer();
 733 }
 734
 735 /*
 736  * Character device to read and clear the MCE log.
 737  */
 738
 739 static DEFINE_SPINLOCK(mce_state_lock);
 740 static int              open_count;             /* #times opened */
 741 static int              open_exclu;             /* already open exclusive? */
 742
 743 static int mce_open(struct inode *inode, struct file *file)
 744 {
 745         lock_kernel();
 746         spin_lock(&mce_state_lock);
 747
 748         if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
 749                 spin_unlock(&mce_state_lock);
 750                 unlock_kernel();
 751
 752                 return -EBUSY;
 753         }
 754
 755         if (file->f_flags & O_EXCL)
 756                 open_exclu = 1;
 757         open_count++;
 758
 759         spin_unlock(&mce_state_lock);
 760         unlock_kernel();
 761
 762         return nonseekable_open(inode, file);
 763 }
 764
 765 static int mce_release(struct inode *inode, struct file *file)
 766 {
 767         spin_lock(&mce_state_lock);
 768
 769         open_count--;
 770         open_exclu = 0;
 771
 772         spin_unlock(&mce_state_lock);
 773
 774         return 0;
 775 }
 776
 777 static void collect_tscs(void *data)
 778 {
 779         unsigned long *cpu_tsc = (unsigned long *)data;
 780
 781         rdtscll(cpu_tsc[smp_processor_id()]);
 782 }
 783
 784 static DEFINE_MUTEX(mce_read_mutex);
 785
 786 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 787                         loff_t *off)
 788 {
 789         char __user *buf = ubuf;
 790         unsigned long *cpu_tsc;
 791         unsigned prev, next;
 792         int i, err;
 793
 794         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
 795         if (!cpu_tsc)
 796                 return -ENOMEM;
 797
 798         mutex_lock(&mce_read_mutex);
 799         next = rcu_dereference(mcelog.next);
 800
 801         /* Only supports full reads right now */
 802         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
 803                 mutex_unlock(&mce_read_mutex);
 804                 kfree(cpu_tsc);
 805
 806                 return -EINVAL;
 807         }
 808
 809         err = 0;
 810         prev = 0;
 811         do {
 812                 for (i = prev; i < next; i++) {
 813                         unsigned long start = jiffies;
 814
 815                         while (!mcelog.entry[i].finished) {
 816                                 if (time_after_eq(jiffies, start + 2)) {
 817                                         memset(mcelog.entry + i, 0,
 818                                                sizeof(struct mce));
 819                                         goto timeout;
 820                                 }
 821                                 cpu_relax();
 822                         }
 823                         smp_rmb();
 824                         err |= copy_to_user(buf, mcelog.entry + i,
 825                                             sizeof(struct mce));
 826                         buf += sizeof(struct mce);
 827 timeout:
 828                         ;
 829                 }
 830
 831                 memset(mcelog.entry + prev, 0,
 832                        (next - prev) * sizeof(struct mce));
 833                 prev = next;
 834                 next = cmpxchg(&mcelog.next, prev, 0);
 835         } while (next != prev);
 836
 837         synchronize_sched();
 838
 839         /*
 840          * Collect entries that were still getting written before the
 841          * synchronize.
 842          */
 843         on_each_cpu(collect_tscs, cpu_tsc, 1);
 844
 845         for (i = next; i < MCE_LOG_LEN; i++) {
 846                 if (mcelog.entry[i].finished &&
 847                     mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
 848                         err |= copy_to_user(buf, mcelog.entry+i,
 849                                             sizeof(struct mce));
 850                         smp_rmb();
 851                         buf += sizeof(struct mce);
 852                         memset(&mcelog.entry[i], 0, sizeof(struct mce));
 853                 }
 854         }
 855         mutex_unlock(&mce_read_mutex);
 856         kfree(cpu_tsc);
 857
 858         return err ? -EFAULT : buf - ubuf;
 859 }
 860
 861 static unsigned int mce_poll(struct file *file, poll_table *wait)
 862 {
 863         poll_wait(file, &mce_wait, wait);
 864         if (rcu_dereference(mcelog.next))
 865                 return POLLIN | POLLRDNORM;
 866         return 0;
 867 }
 868
 869 static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 870 {
 871         int __user *p = (int __user *)arg;
 872
 873         if (!capable(CAP_SYS_ADMIN))
 874                 return -EPERM;
 875
 876         switch (cmd) {
 877         case MCE_GET_RECORD_LEN:
 878                 return put_user(sizeof(struct mce), p);
 879         case MCE_GET_LOG_LEN:
 880                 return put_user(MCE_LOG_LEN, p);
 881         case MCE_GETCLEAR_FLAGS: {
 882                 unsigned flags;
 883
 884                 do {
 885                         flags = mcelog.flags;
 886                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 887
 888                 return put_user(flags, p);
 889         }
 890         default:
 891                 return -ENOTTY;
 892         }
 893 }
 894
 895 static const struct file_operations mce_chrdev_ops = {
 896         .open                   = mce_open,
 897         .release                = mce_release,
 898         .read                   = mce_read,
 899         .poll                   = mce_poll,
 900         .unlocked_ioctl         = mce_ioctl,
 901 };
 902
 903 static struct miscdevice mce_log_device = {
 904         MISC_MCELOG_MINOR,
 905         "mcelog",
 906         &mce_chrdev_ops,
 907 };
 908
 909 /*
 910  * Old style boot options parsing. Only for compatibility.
 911  */
 912 static int __init mcheck_disable(char *str)
 913 {
 914         mce_dont_init = 1;
 915         return 1;
 916 }
 917 __setup("nomce", mcheck_disable);
 918
 919 /*
 920  * mce=off disables machine check
 921  * mce=TOLERANCELEVEL (number, see above)
 922  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 923  * mce=nobootlog Don't log MCEs from before booting.
 924  */
 925 static int __init mcheck_enable(char *str)
 926 {
 927         if (!strcmp(str, "off"))
 928                 mce_dont_init = 1;
 929         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
 930                 mce_bootlog = (str[0] == 'b');
 931         else if (isdigit(str[0]))
 932                 get_option(&str, &tolerant);
 933         else {
 934                 printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
 935                        str);
 936                 return 0;
 937         }
 938         return 1;
 939 }
 940 __setup("mce=", mcheck_enable);
 941
 942 /*
 943  * Sysfs support
 944  */
 945
 946 /*
 947  * Disable machine checks on suspend and shutdown. We can't really handle
 948  * them later.
 949  */
 950 static int mce_disable(void)
 951 {
 952         int i;
 953
 954         for (i = 0; i < banks; i++) {
 955                 if (!skip_bank_init(i))
 956                         wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
 957         }
 958         return 0;
 959 }
 960
 961 static int mce_suspend(struct sys_device *dev, pm_message_t state)
 962 {
 963         return mce_disable();
 964 }
 965
 966 static int mce_shutdown(struct sys_device *dev)
 967 {
 968         return mce_disable();
 969 }
 970
 971 /*
 972  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
 973  * Only one CPU is active at this time, the others get re-added later using
 974  * CPU hotplug:
 975  */
 976 static int mce_resume(struct sys_device *dev)
 977 {
 978         mce_init(NULL);
 979         mce_cpu_features(&current_cpu_data);
 980
 981         return 0;
 982 }
 983
 984 static void mce_cpu_restart(void *data)
 985 {
 986         del_timer_sync(&__get_cpu_var(mce_timer));
 987         if (mce_available(&current_cpu_data))
 988                 mce_init(NULL);
 989         mce_init_timer();
 990 }
 991
 992 /* Reinit MCEs after user configuration changes */
 993 static void mce_restart(void)
 994 {
 995         on_each_cpu(mce_cpu_restart, NULL, 1);
 996 }
 997
 998 static struct sysdev_class mce_sysclass = {
 999         .suspend        = mce_suspend,
1000         .shutdown       = mce_shutdown,
1001         .resume         = mce_resume,
1002         .name           = "machinecheck",
1003 };
1004
1005 DEFINE_PER_CPU(struct sys_device, mce_dev);
1006
1007 __cpuinitdata
1008 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1009
/*
 * ACCESSOR(name, var, start) - generate a read/write sysdev attribute
 * for @var. (Why are there no generic functions for this?)
 *
 * Expands to:
 *   show_<name>()  prints @var in hex followed by a newline.
 *   set_<name>()   parses a number with simple_strtoull() (base 0),
 *                  stores it in @var and then evaluates @start.
 *                  Returns -EINVAL when no digits could be parsed.
 *   attr_<name>    the attribute itself, mode 0644.
 *
 * On success set_<name>() reports the whole write (@siz) as consumed.
 * Returning only the parsed length would be a short write, so user
 * space would retry with the leftover bytes -- typically the trailing
 * newline written by "echo" -- and that retry would fail with -EINVAL.
 */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%Lx\n", (u64)(var));		\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		u64 new = simple_strtoull(buf, &end, 0);		\
									\
		if (end == buf)						\
			return -EINVAL;					\
		(var) = new;						\
		start;							\
									\
		return siz;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
1031
1032 static struct sysdev_attribute *bank_attrs;
1033
1034 static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1035                          char *buf)
1036 {
1037         u64 b = bank[attr - bank_attrs];
1038
1039         return sprintf(buf, "%llx\n", b);
1040 }
1041
1042 static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1043                         const char *buf, size_t siz)
1044 {
1045         char *end;
1046         u64 new = simple_strtoull(buf, &end, 0);
1047
1048         if (end == buf)
1049                 return -EINVAL;
1050
1051         bank[attr - bank_attrs] = new;
1052         mce_restart();
1053
1054         return end-buf;
1055 }
1056
1057 static ssize_t
1058 show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1059 {
1060         strcpy(buf, trigger);
1061         strcat(buf, "\n");
1062         return strlen(trigger) + 1;
1063 }
1064
1065 static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1066                                 const char *buf, size_t siz)
1067 {
1068         char *p;
1069         int len;
1070
1071         strncpy(trigger, buf, sizeof(trigger));
1072         trigger[sizeof(trigger)-1] = 0;
1073         len = strlen(trigger);
1074         p = strchr(trigger, '\n');
1075
1076         if (*p)
1077                 *p = 0;
1078
1079         return len;
1080 }
1081
1082 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1083 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1084
1085 ACCESSOR(check_interval, check_interval, mce_restart())
1086
1087 static struct sysdev_attribute *mce_attrs[] = {
1088         &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
1089         NULL
1090 };
1091
1092 static cpumask_var_t mce_dev_initialized;
1093
1094 /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
1095 static __cpuinit int mce_create_device(unsigned int cpu)
1096 {
1097         int err;
1098         int i;
1099
1100         if (!mce_available(&boot_cpu_data))
1101                 return -EIO;
1102
1103         memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1104         per_cpu(mce_dev, cpu).id        = cpu;
1105         per_cpu(mce_dev, cpu).cls       = &mce_sysclass;
1106
1107         err = sysdev_register(&per_cpu(mce_dev, cpu));
1108         if (err)
1109                 return err;
1110
1111         for (i = 0; mce_attrs[i]; i++) {
1112                 err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1113                 if (err)
1114                         goto error;
1115         }
1116         for (i = 0; i < banks; i++) {
1117                 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1118                                         &bank_attrs[i]);
1119                 if (err)
1120                         goto error2;
1121         }
1122         cpumask_set_cpu(cpu, mce_dev_initialized);
1123
1124         return 0;
1125 error2:
1126         while (--i >= 0)
1127                 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1128 error:
1129         while (--i >= 0)
1130                 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1131
1132         sysdev_unregister(&per_cpu(mce_dev, cpu));
1133
1134         return err;
1135 }
1136
1137 static __cpuinit void mce_remove_device(unsigned int cpu)
1138 {
1139         int i;
1140
1141         if (!cpumask_test_cpu(cpu, mce_dev_initialized))
1142                 return;
1143
1144         for (i = 0; mce_attrs[i]; i++)
1145                 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1146
1147         for (i = 0; i < banks; i++)
1148                 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]);
1149
1150         sysdev_unregister(&per_cpu(mce_dev, cpu));
1151         cpumask_clear_cpu(cpu, mce_dev_initialized);
1152 }
1153
1154 /* Make sure there are no machine checks on offlined CPUs. */
1155 static void mce_disable_cpu(void *h)
1156 {
1157         unsigned long action = *(unsigned long *)h;
1158         int i;
1159
1160         if (!mce_available(&current_cpu_data))
1161                 return;
1162         if (!(action & CPU_TASKS_FROZEN))
1163                 cmci_clear();
1164         for (i = 0; i < banks; i++) {
1165                 if (!skip_bank_init(i))
1166                         wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
1167         }
1168 }
1169
1170 static void mce_reenable_cpu(void *h)
1171 {
1172         unsigned long action = *(unsigned long *)h;
1173         int i;
1174
1175         if (!mce_available(&current_cpu_data))
1176                 return;
1177
1178         if (!(action & CPU_TASKS_FROZEN))
1179                 cmci_reenable();
1180         for (i = 0; i < banks; i++) {
1181                 if (!skip_bank_init(i))
1182                         wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
1183         }
1184 }
1185
1186 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
1187 static int __cpuinit
1188 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1189 {
1190         unsigned int cpu = (unsigned long)hcpu;
1191         struct timer_list *t = &per_cpu(mce_timer, cpu);
1192
1193         switch (action) {
1194         case CPU_ONLINE:
1195         case CPU_ONLINE_FROZEN:
1196                 mce_create_device(cpu);
1197                 if (threshold_cpu_callback)
1198                         threshold_cpu_callback(action, cpu);
1199                 break;
1200         case CPU_DEAD:
1201         case CPU_DEAD_FROZEN:
1202                 if (threshold_cpu_callback)
1203                         threshold_cpu_callback(action, cpu);
1204                 mce_remove_device(cpu);
1205                 break;
1206         case CPU_DOWN_PREPARE:
1207         case CPU_DOWN_PREPARE_FROZEN:
1208                 del_timer_sync(t);
1209                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
1210                 break;
1211         case CPU_DOWN_FAILED:
1212         case CPU_DOWN_FAILED_FROZEN:
1213                 t->expires = round_jiffies(jiffies +
1214                                                 __get_cpu_var(next_interval));
1215                 add_timer_on(t, cpu);
1216                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1217                 break;
1218         case CPU_POST_DEAD:
1219                 /* intentionally ignoring frozen here */
1220                 cmci_rediscover(cpu);
1221                 break;
1222         }
1223         return NOTIFY_OK;
1224 }
1225
1226 static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1227         .notifier_call = mce_cpu_callback,
1228 };
1229
1230 static __init int mce_init_banks(void)
1231 {
1232         int i;
1233
1234         bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1235                                 GFP_KERNEL);
1236         if (!bank_attrs)
1237                 return -ENOMEM;
1238
1239         for (i = 0; i < banks; i++) {
1240                 struct sysdev_attribute *a = &bank_attrs[i];
1241
1242                 a->attr.name    = kasprintf(GFP_KERNEL, "bank%d", i);
1243                 if (!a->attr.name)
1244                         goto nomem;
1245
1246                 a->attr.mode    = 0644;
1247                 a->show         = show_bank;
1248                 a->store        = set_bank;
1249         }
1250         return 0;
1251
1252 nomem:
1253         while (--i >= 0)
1254                 kfree(bank_attrs[i].attr.name);
1255         kfree(bank_attrs);
1256         bank_attrs = NULL;
1257
1258         return -ENOMEM;
1259 }
1260
1261 static __init int mce_init_device(void)
1262 {
1263         int err;
1264         int i = 0;
1265
1266         if (!mce_available(&boot_cpu_data))
1267                 return -EIO;
1268
1269         alloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1270
1271         err = mce_init_banks();
1272         if (err)
1273                 return err;
1274
1275         err = sysdev_class_register(&mce_sysclass);
1276         if (err)
1277                 return err;
1278
1279         for_each_online_cpu(i) {
1280                 err = mce_create_device(i);
1281                 if (err)
1282                         return err;
1283         }
1284
1285         register_hotcpu_notifier(&mce_cpu_notifier);
1286         misc_register(&mce_log_device);
1287
1288         return err;
1289 }
1290
1291 device_initcall(mce_init_device);
1292
1293 #else /* CONFIG_X86_32: */
1294
/* Set to 1 by "nomce" and to -1 by "mce"; mcheck_init() bails out on 1. */
int mce_disabled;

/* Bank count printed by mcheck_init(); exported for non-fatal.o. */
int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
1299
1300 /* This has to be run for each processor */
1301 void mcheck_init(struct cpuinfo_x86 *c)
1302 {
1303         if (mce_disabled == 1)
1304                 return;
1305
1306         switch (c->x86_vendor) {
1307         case X86_VENDOR_AMD:
1308                 amd_mcheck_init(c);
1309                 break;
1310
1311         case X86_VENDOR_INTEL:
1312                 if (c->x86 == 5)
1313                         intel_p5_mcheck_init(c);
1314                 if (c->x86 == 6)
1315                         intel_p6_mcheck_init(c);
1316                 if (c->x86 == 15)
1317                         intel_p4_mcheck_init(c);
1318                 break;
1319
1320         case X86_VENDOR_CENTAUR:
1321                 if (c->x86 == 5)
1322                         winchip_mcheck_init(c);
1323                 break;
1324
1325         default:
1326                 break;
1327         }
1328         printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks);
1329 }
1330
1331 static int __init mcheck_disable(char *str)
1332 {
1333         mce_disabled = 1;
1334         return 1;
1335 }
1336
1337 static int __init mcheck_enable(char *str)
1338 {
1339         mce_disabled = -1;
1340         return 1;
1341 }
1342
1343 __setup("nomce", mcheck_disable);
1344 __setup("mce", mcheck_enable);
1345
1346 #endif /* CONFIG_X86_32 */