x86, mce: fix a race condition in mce_read()
[linux-2.6/mini2440.git] arch/x86/kernel/cpu/mcheck/mce_64.c
blob 870d08deccf78a5454142755f02175d7acc66b3c
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR	227
#define NR_SYSFS_BANKS		6

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
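/*
 * Slot reservation protocol used below: a writer claims a slot by advancing
 * mcelog.next with cmpxchg(), copies its record in, and only then sets
 * ->finished.  Readers must therefore treat an entry with ->finished == 0
 * as still being written (see mce_read()).
 */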
void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}
static int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}
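/*
 * Calling conventions used within this file: do_machine_check() is invoked
 * from the exception handler with register state, but also with regs == NULL
 * from the polling timer and from mce_init().  An error_code of -1 means
 * "log leftover boot-time errors", -2 means "clear the banks without
 * logging"; see the error_code checks below and mce_bootlog in mce_init().
 */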
/*
 * The actual machine check handler
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;

	atomic_inc(&mce_entry);

	if ((regs
	     && notify_die(DIE_NMI, "machine check", regs, error_code,
			   18, SIGKILL) == NOTIFY_STOP)
	    || !banks)
		goto out2;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (i < NR_SYSFS_BANKS && !bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

 out:
	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
	struct mce m;

	memset(&m, 0, sizeof(m));
	m.cpu = cpu;
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	rdtscll(m.tsc);
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		do_machine_check(NULL, 0);

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	if (mce_notify_user()) {
		next_interval = max(next_interval/2, HZ/100);
	} else {
		next_interval = min(next_interval * 2,
				(int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + next_interval;
	add_timer(t);
}
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		static unsigned long last_print;
		unsigned long now = jiffies;

		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}

		return 1;
	}
	return 0;
}

/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);
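/*
 * MCG_CAP layout as consumed by mce_init() below: bits 7:0 give the number
 * of reporting banks, bit 8 (MCG_CTL_P) indicates that the MCG_CTL MSR is
 * present, bit 9 indicates extended state MSRs, and bits 23:16 give their
 * count.  The extended RIP MSR is only trusted when at least 9 extended
 * MSRs are reported.
 */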
/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > MCE_EXTENDED_BANK) {
		banks = MCE_EXTENDED_BANK;
		printk(KERN_INFO "MCE: warning: using only %d banks\n",
		       MCE_EXTENDED_BANK);
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		if (i < NR_SYSFS_BANKS)
			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		else
			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);

		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, &bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}
static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);

	/* data race harmless because everyone sets to the same value */
	if (!next_interval)
		next_interval = check_interval * HZ;
	if (!next_interval)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies_relative(jiffies + next_interval);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	mce_cpu_quirks(c);

	if (!mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}
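/*
 * Reading drains the log: mcelog.next is sampled under mce_read_mutex so
 * that concurrent readers cannot both claim the same entries, the claimed
 * entries are copied out (waiting briefly for writers that have reserved a
 * slot but not yet set ->finished), and mcelog.next is then reset with
 * cmpxchg().  Entries written while the drain was in progress are collected
 * in a second pass after synchronize_sched(), using per-CPU TSC snapshots
 * to identify them.
 */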
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	next_interval = check_interval * HZ;
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

/*
 * TBD should generate these dynamically based on number of available banks.
 * Have only 6 control banks in /sysfs until then.
 */
ACCESSOR(bank0ctl, bank[0], mce_restart())
ACCESSOR(bank1ctl, bank[1], mce_restart())
ACCESSOR(bank2ctl, bank[2], mce_restart())
ACCESSOR(bank3ctl, bank[3], mce_restart())
ACCESSOR(bank4ctl, bank[4], mce_restart())
ACCESSOR(bank5ctl, bank[5], mce_restart())

static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	if (p)		/* strip the trailing newline, if any */
		*p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())

static struct sysdev_attribute *mce_attributes[] = {
	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};
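/*
 * These attributes end up as per-CPU files under
 * /sys/devices/system/machinecheck/machinecheck<cpu>/ (the sysdev class
 * registered as "machinecheck" above); this is what the "Please use /sys"
 * hint in mcheck_enable() refers to.
 */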
static cpumask_t mce_device_initialized = CPU_MASK_NONE;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	cpu_set(cpu, mce_device_initialized);

	return 0;
error:
	while (i--) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpu_isset(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpu_clear(cpu, mce_device_initialized);
}
/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuexit mce_disable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void __cpuexit mce_reenable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies_relative(jiffies + next_interval);
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);