cpumask: alloc zeroed cpumask for static cpumask_var_ts
[linux-2.6/mini2440.git] / arch/x86/kernel/cpu/mcheck/mce_64.c
blob 09dd1d414fc36bf40eb8218619298e76bf1ac443
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227

atomic_t mce_entry;
static int mce_dont_init;

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
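/*
 * For illustration: tolerant can be changed at runtime in two ways handled
 * in this file -- the "mce=" boot parameter parsed by mcheck_enable() below
 * (for example booting with "mce=2"), and the per-CPU sysfs attribute
 * registered via SYSDEV_INT_ATTR(tolerant, ...), expected to show up as
 * /sys/devices/system/machinecheck/machinecheck<N>/tolerant (path inferred
 * from the "machinecheck" sysdev class name).
 */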
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
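/*
 * A rough sketch of the protocol implemented above and consumed by
 * mce_read() below: each logger claims a slot by advancing mcelog.next with
 * cmpxchg(), fills the entry, and only then sets ->finished after a wmb();
 * the /dev/mcelog reader spins briefly on ->finished, copies the entry out,
 * and finally swaps mcelog.next back to 0 with cmpxchg() so the buffer can
 * be reused. No spinlock is taken on the logging side, which is what makes
 * it usable from machine check context.
 */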
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}
int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}
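/*
 * Within this file machine_check_poll() is driven from two places: the
 * periodic mcheck_timer() below, which passes MCP_TIMESTAMP and the per-CPU
 * mce_poll_banks mask, and mce_init(), which passes MCP_UC (plus MCP_DONTLOG
 * when bootlog is disabled) to sweep up events left over from before the
 * reset. On Intel CPUs the CMCI code in mce_intel_64.c presumably calls it
 * as well, but that path lives outside this file.
 */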
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll.
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble. If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS. Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
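/*
 * In short, the handler above makes two decisions per event: no_way_out is
 * set when the machine state cannot be trusted (RIPV clear, PCC set, or an
 * uncorrected error with OVER set at tolerant < 1) and leads to mce_panic()
 * unless tolerant >= 3; kill_it is set for enabled uncorrected errors and
 * leads to a SIGBUS when the fault can be pinned to user space, otherwise
 * to a panic when panic_on_oops is set or tolerant < 2.
 */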
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user()) {
		*n = max(*n/2, HZ/100);
	} else {
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + *n;
	add_timer(t);
}
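/*
 * A worked example of the adaptive interval, assuming HZ=1000 and the
 * default check_interval of 300 seconds: the timer starts at 300*HZ jiffies
 * (see mce_init_timer()), drops to 150s, 75s, ... each time mce_notify_user()
 * reports a logged event, bottoming out at HZ/100 = 10ms, and doubles back
 * toward the 300s ceiling on every quiet run.
 */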
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}
/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
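/*
 * For reference, the MCG_CAP fields consumed above: bits 7:0 give the bank
 * count, bit 9 (MCG_EXT_P in the SDM, if memory serves) announces the
 * extended state registers, and bits 23:16 give how many of them exist --
 * the EIP register used via rip_msr is only trusted when at least 9 are
 * reported. Bit 8 (MCG_CTL_P) is checked separately in mce_init() before
 * writing MCG_CTL.
 */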
static void mce_init(void *dummy)
{
	u64 cap;
	int i;
	mce_banks_t all_banks;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}
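/*
 * The clear_bit(10, &bank[4]) quirk above works by masking one error source
 * in the default MC4_CTL value that mce_init() later writes for bank 4 on
 * K8 (family 0xf) parts; the GART TLB walk errors it suppresses are, per
 * the comment, false positives seen with the IOMMU and certain 3ware and
 * Cerberus setups.
 */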
static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}
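/*
 * The ordering in mcheck_init() matters: capabilities and the bank[] table
 * are sized first (mce_cap_init), quirks then edit the bank defaults before
 * mce_init() writes them into the MCi_CTL MSRs, and only after the vendor
 * hooks run is the per-CPU polling timer armed.
 */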
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
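/*
 * Note on the userspace contract implied above: a reader of /dev/mcelog
 * (normally the mcelog daemon) must pass a buffer of at least
 * MCE_LOG_LEN * sizeof(struct mce) bytes, gets back whole struct mce
 * records, and the act of reading clears the kernel-side buffer; partial
 * reads are rejected with -EINVAL.
 */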
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
		       str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);
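/*
 * Example command lines for the legacy parameters handled above (kept for
 * compatibility; sysfs is the preferred interface): "nomce" or "mce=off"
 * disables machine checks entirely, "mce=2" sets tolerant=2, and
 * "mce=bootlog" / "mce=nobootlog" force logging of pre-boot errors on or
 * off (AMD systems default to off via mce_cpu_quirks()).
 */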
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}
static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};
DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, \
				     struct sysdev_attribute *attr, \
				     char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, \
				    struct sysdev_attribute *attr, \
				    const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
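/*
 * For readability, ACCESSOR(check_interval, check_interval, mce_restart())
 * below expands to a show_check_interval()/set_check_interval() pair plus a
 * 0644 sysdev attribute: reads print the value in hex, writes parse an
 * unsigned long with simple_strtoul() and then call mce_restart() so every
 * CPU re-arms its polling timer with the new interval.
 */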
static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];
	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);
	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	/* Strip a trailing newline; guard against strchr() returning NULL. */
	if (p)
		*p = 0;
	return len;
}
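/*
 * Typical use of the trigger attribute, assuming the usual sysdev layout
 * for the "machinecheck" class:
 *
 *   echo /usr/sbin/mcelog-notify > \
 *       /sys/devices/system/machinecheck/machinecheck0/trigger
 *
 * After that, mce_notify_user() schedules mce_trigger_work, which runs the
 * named program via call_usermodehelper() whenever new events are logged.
 * The helper path here is only an example, not something this file installs.
 */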
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};
static cpumask_var_t mce_device_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	}
error:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpumask_clear_cpu(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies(jiffies +
					   __get_cpu_var(next_interval));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}
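/*
 * Summarizing the hotplug flow above: sysfs devices are created on ONLINE
 * and torn down on DEAD, the polling timer is stopped and machine checks
 * (plus CMCI) are disabled on the target CPU at DOWN_PREPARE, everything is
 * re-armed if DOWN_FAILED, and CMCI ownership is redistributed to the
 * surviving CPUs at POST_DEAD via cmci_rediscover().
 */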
static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;

	return -ENOMEM;
}
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);

	return err;
}

device_initcall(mce_init_device);