KVM: Add VT-x machine check support
arch/x86/kernel/cpu/mcheck/mce_64.c
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>
#define MISC_MCELOG_MINOR	227

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
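
/*
 * Slot reservation protocol: scan forward from mcelog.next for a slot whose
 * 'finished' flag is clear, then claim it by advancing mcelog.next with
 * cmpxchg().  Only after the record has been fully copied is 'finished' set
 * (with write barriers in between), so readers never see a half-written
 * entry.
 */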
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
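	/*
	 * Dump every entry logged since this machine check started, then
	 * print the triggering record itself if it was not already in the
	 * log.
	 */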
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
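	/*
	 * Each bank owns four consecutive MSRs (CTL, STATUS, ADDR, MISC),
	 * hence the i*4 stride from the bank 0 base addresses below.
	 */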
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();
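
	/*
	 * Scan all banks for valid uncorrected errors: record which banks
	 * need clearing in 'toclear', log each event, and remember the
	 * worst one in 'panicm' in case we cannot recover.
	 */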
	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
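	/*
	 * Clearing MCG_STATUS also clears MCIP; leaving MCIP set would make
	 * a subsequent machine check shut the processor down instead of
	 * raising another exception.
	 */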
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
EXPORT_SYMBOL_GPL(do_machine_check);
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user()) {
		*n = max(*n/2, HZ/100);
	} else {
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + *n;
	add_timer(t);
}

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}

/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
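	/*
	 * MCG_CAP[7:0] is the number of reporting banks, bit 8 (MCG_CTL_P)
	 * advertises the global MCG_CTL register, and bit 9 (MCG_EXT_P)
	 * together with the extended register count in bits 23:16 tells us
	 * whether the saved IP can be read from MSR_IA32_MCG_EIP.
	 */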
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}

static void mce_init(void *dummy)
{
	u64 cap;
	int i;
	mce_banks_t all_banks;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
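
	/*
	 * Enable error reporting for each bank according to bank[] (all bits
	 * set unless a quirk or sysfs write cleared some), and clear any
	 * stale status left over from before the kernel took control.
	 */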
	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
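	/*
	 * First pass: copy out and clear every entry logged so far, waiting
	 * briefly for any record that is still being written (its 'finished'
	 * flag is not yet set).
	 */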
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
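		/*
		 * Try to reset the ring to empty; if new events were logged
		 * meanwhile, mcelog.next has moved on, the cmpxchg fails and
		 * we loop again to drain the new entries.
		 */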
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);

	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}

static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}
static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	/* strchr() returns NULL when there is no newline to strip */
	if (p)
		*p = 0;
	return len;
}
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
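/*
 * check_interval gets a generated show/set pair; writing it through sysfs
 * calls mce_restart() so the per-CPU polling timers pick up the new value.
 */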
ACCESSOR(check_interval, check_interval, mce_restart())

static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};

static cpumask_var_t mce_device_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	}
error:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpumask_clear_cpu(cpu, mce_device_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;
	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);