x86, mce: remove invalid __cpuinit/__cpuexit annotations
[linux-2.6/mini2440.git] / arch/x86/kernel/cpu/mcheck/mce_64.c
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227

/*
 * To support more than 128 would need to escape the predefined
 * Linux defined extended banks first.
 */
#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
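
/*
 * Note on the logging protocol above: writers reserve a slot by advancing
 * mcelog.next with cmpxchg(), copy the record in, and only then set
 * ->finished behind a wmb().  Consumers such as mce_read() below are
 * expected to test ->finished and pair it with smp_rmb() before trusting
 * the entry contents.
 */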

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		mce_log(&m);
		add_taint(TAINT_MACHINE_CHECK);

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Non uncorrected errors are handled by machine_check_poll
		 * Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP);

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	if (mce_notify_user()) {
		next_interval = max(next_interval/2, HZ/100);
	} else {
		next_interval = min(next_interval * 2,
				(int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + next_interval;
	add_timer(t);
}
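
/*
 * The polling cadence above is adaptive: it starts at check_interval*HZ
 * (five minutes by default), is halved down to a floor of HZ/100 each time
 * mce_notify_user() reports new events, and is doubled back up to the
 * configured maximum whenever a poll finds nothing.
 */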

static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		static unsigned long last_print;
		unsigned long now = jiffies;

		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}

		return 1;
	}
	return 0;
}

/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
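
/*
 * MCG_CAP decoding used above (commentary): bits 7:0 report the number of
 * banks, bit 9 advertises the extended register block and bits 23:16 its
 * size; the separate EIP register is only trusted when at least 9 extended
 * registers are present.
 */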

static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	machine_check_poll(MCP_UC);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);

	/* data race harmless because everyone sets to the same value */
	if (!next_interval)
		next_interval = check_interval * HZ;
	if (!next_interval)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies_relative(jiffies + next_interval);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
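
/*
 * Illustrative userspace consumer (a sketch, not part of the kernel):
 * mcelog-style readers query the record and log sizes first and then issue
 * one full-sized read, because shorter reads fail with -EINVAL in
 * mce_read() above:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recl, logl;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recl);
 *	ioctl(fd, MCE_GET_LOG_LEN, &logl);
 *	void *buf = malloc(recl * logl);
 *	ssize_t n = read(fd, buf, recl * logl);
 *
 * The return value divided by recl gives the number of records read;
 * MCE_GETCLEAR_FLAGS can then be used to check for and clear the
 * MCE_OVERFLOW condition set in mce_log().
 */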

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);
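
/*
 * Examples of the boot parameters handled above (illustrative command
 * lines, not from this file):
 *
 *	mce=off		disable machine check handling entirely
 *	mce=2		set the tolerance level to 2
 *	mce=nobootlog	don't log MCEs left over from before boot
 *	nomce		shorthand with the same effect as mce=off
 */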

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	next_interval = check_interval * HZ;
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, \
				     struct sysdev_attribute *attr, \
				     char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, \
				    struct sysdev_attribute *attr, \
				    const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];
	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);
	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}

static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;
	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	if (*p) *p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};
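
/*
 * These attributes show up per CPU under the "machinecheck" sysdev class.
 * Illustrative usage (paths and trigger script name assumed from the class
 * name above, not taken from this file):
 *
 *	echo 2 > /sys/devices/system/machinecheck/machinecheck0/tolerant
 *	echo 300 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 *	echo /sbin/mce-handler > /sys/devices/system/machinecheck/machinecheck0/trigger
 *
 * Writing check_interval or a bankN control re-runs mce_restart() on all
 * CPUs; tolerant is simply read the next time an event is handled.
 */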

static cpumask_t mce_device_initialized = CPU_MASK_NONE;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpu_set(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	}
error:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpu_isset(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpu_clear(cpu, mce_device_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies_relative(jiffies + next_interval);
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];
		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;
	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);