x86, mce, cmci: export MAX_NR_BANKS
arch/x86/kernel/cpu/mcheck/mce_64.c
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227
atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
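
/*
 * Append a record to the lockless log: reserve a slot by advancing
 * mcelog.next with cmpxchg, copy the record in, then set ->finished
 * so readers know the entry is complete.
 */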
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
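
/*
 * Dump a single machine check record to the console. The output is
 * meant to be fed through mcelog --ascii for decoding.
 */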
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}
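
/*
 * Machine check handling needs both the MCE and MCA CPUID features and
 * must not have been disabled (on the command line or by a failed init).
 */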
static int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
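
/*
 * Fill in the faulting IP: use the trapping pt_regs when MCG_STATUS.RIPV
 * marks them valid, otherwise fall back to the vendor rIP MSR if the CPU
 * provides one.
 */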
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}
/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */

		mce_log(&m);
		add_taint(TAINT_MACHINE_CHECK);

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP);

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	if (mce_notify_user()) {
		next_interval = max(next_interval/2, HZ/100);
	} else {
		next_interval = min(next_interval * 2,
				(int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + next_interval;
	add_timer(t);
}
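
/*
 * Work queue handler: run the user configured trigger program.
 * Deferred to process context because the usermode helper cannot be
 * started from the interrupt or machine check paths that log events.
 */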
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		static unsigned long last_print;
		unsigned long now = jiffies;

		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}

		return 1;
	}
	return 0;
}
/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);
/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
			MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
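
/*
 * Program the hardware on this CPU: log anything left over from before,
 * enable MCE in CR4, open up MCG_CTL and every per bank control register,
 * and clear stale status on the way.
 */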
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	machine_check_poll(MCP_UC);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}
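
/* Hook up vendor specific machine check extensions. */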
static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}
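
/* Arm the per-CPU polling timer; polling is disabled when check_interval is 0. */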
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);

	/* data race harmless because everyone sets to the same value */
	if (!next_interval)
		next_interval = check_interval * HZ;
	if (!next_interval)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies_relative(jiffies + next_interval);
	add_timer(t);
}
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}
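
/*
 * Helper for on_each_cpu(): record each CPU's current TSC so the reader
 * can tell which log entries were written before the snapshot.
 */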
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}
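
/*
 * Read handler for /dev/mcelog. Only whole-buffer reads are supported:
 * finished entries are copied to user space, cleared, and mcelog.next is
 * reset; a second pass picks up entries that were still being written
 * while the first pass ran.
 */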
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
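
/* poll() support: /dev/mcelog is readable whenever unread records exist. */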
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}
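
/*
 * ioctl interface: report the record and log sizes and atomically
 * fetch-and-clear the overflow flags.
 */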
static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}
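
/*
 * Runs on each CPU via on_each_cpu(): stop the polling timer, reprogram
 * the banks and restart the timer with the current settings.
 */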
static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	next_interval = check_interval * HZ;
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
static struct sysdev_attribute *bank_attrs;
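
/*
 * Per bank sysfs control files (bank0..bankN). The value written becomes
 * the MCi_CTL mask programmed into the bank on the next reinit.
 */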
static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];
	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);
	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}
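
/*
 * The 'trigger' sysfs file holds the path of the program started when new
 * machine check events are logged.
 */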
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	if (p)		/* strchr() may return NULL; check the pointer itself */
		*p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};
static cpumask_t mce_device_initialized = CPU_MASK_NONE;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpu_set(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	}
error:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}
static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpu_isset(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpu_clear(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies_relative(jiffies + next_interval);
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};
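
/* Allocate and name the sysfs attributes for the per bank control files. */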
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];
		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;
	return -ENOMEM;
}
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);