kernel/perf_counter.c

   1 /*
   2  * Performance counter core code
   3  *
   4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
   6  *
   7  *  For licencing details see kernel-base/COPYING
   8  */
   9
  10 #include <linux/fs.h>
  11 #include <linux/cpu.h>
  12 #include <linux/smp.h>
  13 #include <linux/file.h>
  14 #include <linux/poll.h>
  15 #include <linux/sysfs.h>
  16 #include <linux/ptrace.h>
  17 #include <linux/percpu.h>
  18 #include <linux/uaccess.h>
  19 #include <linux/syscalls.h>
  20 #include <linux/anon_inodes.h>
  21 #include <linux/kernel_stat.h>
  22 #include <linux/perf_counter.h>
  23
  24 /*
  25  * Each CPU has a list of per CPU counters:
  26  */
  27 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  28
  29 int perf_max_counters __read_mostly = 1;
  30 static int perf_reserved_percpu __read_mostly;
  31 static int perf_overcommit __read_mostly = 1;
  32
  33 /*
  34  * Mutex for (sysadmin-configurable) counter reservations:
  35  */
  36 static DEFINE_MUTEX(perf_resource_mutex);
  37
  38 /*
  39  * Architecture provided APIs - weak aliases:
  40  */
  41 extern __weak const struct hw_perf_counter_ops *
  42 hw_perf_counter_init(struct perf_counter *counter)
  43 {
  44         return NULL;
  45 }
  46
  47 u64 __weak hw_perf_save_disable(void)           { return 0; }
  48 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
  49 void __weak hw_perf_counter_setup(void)         { barrier(); }
  50
  51 static void
  52 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  53 {
  54         struct perf_counter *group_leader = counter->group_leader;
  55
  56         /*
  57          * Depending on whether it is a standalone or sibling counter,
  58          * add it straight to the context's counter list, or to the group
  59          * leader's sibling list:
  60          */
  61         if (counter->group_leader == counter)
  62                 list_add_tail(&counter->list_entry, &ctx->counter_list);
  63         else
  64                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
  65 }
  66
  67 static void
  68 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  69 {
  70         struct perf_counter *sibling, *tmp;
  71
  72         list_del_init(&counter->list_entry);
  73
  74         /*
  75          * If this was a group counter with sibling counters then
  76          * upgrade the siblings to singleton counters by adding them
  77          * to the context list directly:
  78          */
  79         list_for_each_entry_safe(sibling, tmp,
  80                                  &counter->sibling_list, list_entry) {
  81
  82                 list_del_init(&sibling->list_entry);
  83                 list_add_tail(&sibling->list_entry, &ctx->counter_list);
  84                 sibling->group_leader = sibling;
  85         }
  86 }
  87
  88 /*
  89  * Cross CPU call to remove a performance counter
  90  *
  91  * We disable the counter on the hardware level first. After that we
  92  * remove it from the context list.
  93  */
  94 static void __perf_counter_remove_from_context(void *info)
  95 {
  96         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  97         struct perf_counter *counter = info;
  98         struct perf_counter_context *ctx = counter->ctx;
  99         unsigned long flags;
 100         u64 perf_flags;
 101
 102         /*
 103          * If this is a task context, we need to check whether it is
 104          * the current task context of this cpu. If not it has been
 105          * scheduled out before the smp call arrived.
 106          */
 107         if (ctx->task && cpuctx->task_ctx != ctx)
 108                 return;
 109
 110         curr_rq_lock_irq_save(&flags);
 111         spin_lock(&ctx->lock);
 112
 113         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 114                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 115                 counter->hw_ops->disable(counter);
 116                 ctx->nr_active--;
 117                 cpuctx->active_oncpu--;
 118                 counter->task = NULL;
 119                 counter->oncpu = -1;
 120         }
 121         ctx->nr_counters--;
 122
 123         /*
 124          * Protect the list operation against NMI by disabling the
 125          * counters on a global level. NOP for non NMI based counters.
 126          */
 127         perf_flags = hw_perf_save_disable();
 128         list_del_counter(counter, ctx);
 129         hw_perf_restore(perf_flags);
 130
 131         if (!ctx->task) {
 132                 /*
 133                  * Allow more per task counters with respect to the
 134                  * reservation:
 135                  */
 136                 cpuctx->max_pertask =
 137                         min(perf_max_counters - ctx->nr_counters,
 138                             perf_max_counters - perf_reserved_percpu);
 139         }
 140
 141         spin_unlock(&ctx->lock);
 142         curr_rq_unlock_irq_restore(&flags);
 143 }
 144
 145
 146 /*
 147  * Remove the counter from a task's (or a CPU's) list of counters.
 148  *
 149  * Must be called with counter->mutex held.
 150  *
 151  * CPU counters are removed with a smp call. For task counters we only
 152  * call when the task is on a CPU.
 153  */
 154 static void perf_counter_remove_from_context(struct perf_counter *counter)
 155 {
 156         struct perf_counter_context *ctx = counter->ctx;
 157         struct task_struct *task = ctx->task;
 158
 159         if (!task) {
 160                 /*
 161                  * Per cpu counters are removed via an smp call and
 162                  * the removal is always sucessful.
 163                  */
 164                 smp_call_function_single(counter->cpu,
 165                                          __perf_counter_remove_from_context,
 166                                          counter, 1);
 167                 return;
 168         }
 169
 170 retry:
 171         task_oncpu_function_call(task, __perf_counter_remove_from_context,
 172                                  counter);
 173
 174         spin_lock_irq(&ctx->lock);
 175         /*
 176          * If the context is active we need to retry the smp call.
 177          */
 178         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
 179                 spin_unlock_irq(&ctx->lock);
 180                 goto retry;
 181         }
 182
 183         /*
 184          * The lock prevents that this context is scheduled in so we
 185          * can remove the counter safely, if the call above did not
 186          * succeed.
 187          */
 188         if (!list_empty(&counter->list_entry)) {
 189                 ctx->nr_counters--;
 190                 list_del_counter(counter, ctx);
 191                 counter->task = NULL;
 192         }
 193         spin_unlock_irq(&ctx->lock);
 194 }
 195
 196 static int
 197 counter_sched_in(struct perf_counter *counter,
 198                  struct perf_cpu_context *cpuctx,
 199                  struct perf_counter_context *ctx,
 200                  int cpu)
 201 {
 202         if (counter->state == PERF_COUNTER_STATE_OFF)
 203                 return 0;
 204
 205         counter->state = PERF_COUNTER_STATE_ACTIVE;
 206         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
 207         /*
 208          * The new state must be visible before we turn it on in the hardware:
 209          */
 210         smp_wmb();
 211
 212         if (counter->hw_ops->enable(counter)) {
 213                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 214                 counter->oncpu = -1;
 215                 return -EAGAIN;
 216         }
 217
 218         cpuctx->active_oncpu++;
 219         ctx->nr_active++;
 220
 221         return 0;
 222 }
 223
 224 /*
 225  * Cross CPU call to install and enable a performance counter
 226  */
 227 static void __perf_install_in_context(void *info)
 228 {
 229         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 230         struct perf_counter *counter = info;
 231         struct perf_counter_context *ctx = counter->ctx;
 232         int cpu = smp_processor_id();
 233         unsigned long flags;
 234         u64 perf_flags;
 235
 236         /*
 237          * If this is a task context, we need to check whether it is
 238          * the current task context of this cpu. If not it has been
 239          * scheduled out before the smp call arrived.
 240          */
 241         if (ctx->task && cpuctx->task_ctx != ctx)
 242                 return;
 243
 244         curr_rq_lock_irq_save(&flags);
 245         spin_lock(&ctx->lock);
 246
 247         /*
 248          * Protect the list operation against NMI by disabling the
 249          * counters on a global level. NOP for non NMI based counters.
 250          */
 251         perf_flags = hw_perf_save_disable();
 252
 253         list_add_counter(counter, ctx);
 254         ctx->nr_counters++;
 255
 256         counter_sched_in(counter, cpuctx, ctx, cpu);
 257
 258         if (!ctx->task && cpuctx->max_pertask)
 259                 cpuctx->max_pertask--;
 260
 261         hw_perf_restore(perf_flags);
 262
 263         spin_unlock(&ctx->lock);
 264         curr_rq_unlock_irq_restore(&flags);
 265 }
 266
 267 /*
 268  * Attach a performance counter to a context
 269  *
 270  * First we add the counter to the list with the hardware enable bit
 271  * in counter->hw_config cleared.
 272  *
 273  * If the counter is attached to a task which is on a CPU we use a smp
 274  * call to enable it in the task context. The task might have been
 275  * scheduled away, but we check this in the smp call again.
 276  */
 277 static void
 278 perf_install_in_context(struct perf_counter_context *ctx,
 279                         struct perf_counter *counter,
 280                         int cpu)
 281 {
 282         struct task_struct *task = ctx->task;
 283
 284         counter->ctx = ctx;
 285         if (!task) {
 286                 /*
 287                  * Per cpu counters are installed via an smp call and
 288                  * the install is always sucessful.
 289                  */
 290                 smp_call_function_single(cpu, __perf_install_in_context,
 291                                          counter, 1);
 292                 return;
 293         }
 294
 295         counter->task = task;
 296 retry:
 297         task_oncpu_function_call(task, __perf_install_in_context,
 298                                  counter);
 299
 300         spin_lock_irq(&ctx->lock);
 301         /*
 302          * we need to retry the smp call.
 303          */
 304         if (ctx->nr_active && list_empty(&counter->list_entry)) {
 305                 spin_unlock_irq(&ctx->lock);
 306                 goto retry;
 307         }
 308
 309         /*
 310          * The lock prevents that this context is scheduled in so we
 311          * can add the counter safely, if it the call above did not
 312          * succeed.
 313          */
 314         if (list_empty(&counter->list_entry)) {
 315                 list_add_counter(counter, ctx);
 316                 ctx->nr_counters++;
 317         }
 318         spin_unlock_irq(&ctx->lock);
 319 }
 320
 321 static void
 322 counter_sched_out(struct perf_counter *counter,
 323                   struct perf_cpu_context *cpuctx,
 324                   struct perf_counter_context *ctx)
 325 {
 326         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 327                 return;
 328
 329         counter->state = PERF_COUNTER_STATE_INACTIVE;
 330         counter->hw_ops->disable(counter);
 331         counter->oncpu = -1;
 332
 333         cpuctx->active_oncpu--;
 334         ctx->nr_active--;
 335 }
 336
 337 static void
 338 group_sched_out(struct perf_counter *group_counter,
 339                 struct perf_cpu_context *cpuctx,
 340                 struct perf_counter_context *ctx)
 341 {
 342         struct perf_counter *counter;
 343
 344         counter_sched_out(group_counter, cpuctx, ctx);
 345
 346         /*
 347          * Schedule out siblings (if any):
 348          */
 349         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 350                 counter_sched_out(counter, cpuctx, ctx);
 351 }
 352
 353 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 354                               struct perf_cpu_context *cpuctx)
 355 {
 356         struct perf_counter *counter;
 357
 358         if (likely(!ctx->nr_counters))
 359                 return;
 360
 361         spin_lock(&ctx->lock);
 362         if (ctx->nr_active) {
 363                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
 364                         group_sched_out(counter, cpuctx, ctx);
 365         }
 366         spin_unlock(&ctx->lock);
 367 }
 368
 369 /*
 370  * Called from scheduler to remove the counters of the current task,
 371  * with interrupts disabled.
 372  *
 373  * We stop each counter and update the counter value in counter->count.
 374  *
 375  * This does not protect us against NMI, but disable()
 376  * sets the disabled bit in the control field of counter _before_
 377  * accessing the counter control register. If a NMI hits, then it will
 378  * not restart the counter.
 379  */
 380 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 381 {
 382         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 383         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 384
 385         if (likely(!cpuctx->task_ctx))
 386                 return;
 387
 388         __perf_counter_sched_out(ctx, cpuctx);
 389
 390         cpuctx->task_ctx = NULL;
 391 }
 392
 393 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 394 {
 395         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 396 }
 397
 398 static int
 399 group_sched_in(struct perf_counter *group_counter,
 400                struct perf_cpu_context *cpuctx,
 401                struct perf_counter_context *ctx,
 402                int cpu)
 403 {
 404         struct perf_counter *counter, *partial_group;
 405         int ret = 0;
 406
 407         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 408                 return -EAGAIN;
 409
 410         /*
 411          * Schedule in siblings as one group (if any):
 412          */
 413         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 414                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 415                         partial_group = counter;
 416                         goto group_error;
 417                 }
 418                 ret = -EAGAIN;
 419         }
 420
 421         return ret;
 422
 423 group_error:
 424         /*
 425          * Groups can be scheduled in as one unit only, so undo any
 426          * partial group before returning:
 427          */
 428         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 429                 if (counter == partial_group)
 430                         break;
 431                 counter_sched_out(counter, cpuctx, ctx);
 432         }
 433         counter_sched_out(group_counter, cpuctx, ctx);
 434
 435         return -EAGAIN;
 436 }
 437
 438 static void
 439 __perf_counter_sched_in(struct perf_counter_context *ctx,
 440                         struct perf_cpu_context *cpuctx, int cpu)
 441 {
 442         struct perf_counter *counter;
 443
 444         if (likely(!ctx->nr_counters))
 445                 return;
 446
 447         spin_lock(&ctx->lock);
 448         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 449                 /*
 450                  * Listen to the 'cpu' scheduling filter constraint
 451                  * of counters:
 452                  */
 453                 if (counter->cpu != -1 && counter->cpu != cpu)
 454                         continue;
 455
 456                 /*
 457                  * If we scheduled in a group atomically and
 458                  * exclusively, break out:
 459                  */
 460                 if (group_sched_in(counter, cpuctx, ctx, cpu))
 461                         break;
 462         }
 463         spin_unlock(&ctx->lock);
 464 }
 465
 466 /*
 467  * Called from scheduler to add the counters of the current task
 468  * with interrupts disabled.
 469  *
 470  * We restore the counter value and then enable it.
 471  *
 472  * This does not protect us against NMI, but enable()
 473  * sets the enabled bit in the control field of counter _before_
 474  * accessing the counter control register. If a NMI hits, then it will
 475  * keep the counter running.
 476  */
 477 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 478 {
 479         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 480         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 481
 482         __perf_counter_sched_in(ctx, cpuctx, cpu);
 483         cpuctx->task_ctx = ctx;
 484 }
 485
 486 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 487 {
 488         struct perf_counter_context *ctx = &cpuctx->ctx;
 489
 490         __perf_counter_sched_in(ctx, cpuctx, cpu);
 491 }
 492
 493 int perf_counter_task_disable(void)
 494 {
 495         struct task_struct *curr = current;
 496         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 497         struct perf_counter *counter;
 498         unsigned long flags;
 499         u64 perf_flags;
 500         int cpu;
 501
 502         if (likely(!ctx->nr_counters))
 503                 return 0;
 504
 505         curr_rq_lock_irq_save(&flags);
 506         cpu = smp_processor_id();
 507
 508         /* force the update of the task clock: */
 509         __task_delta_exec(curr, 1);
 510
 511         perf_counter_task_sched_out(curr, cpu);
 512
 513         spin_lock(&ctx->lock);
 514
 515         /*
 516          * Disable all the counters:
 517          */
 518         perf_flags = hw_perf_save_disable();
 519
 520         list_for_each_entry(counter, &ctx->counter_list, list_entry)
 521                 counter->state = PERF_COUNTER_STATE_OFF;
 522
 523         hw_perf_restore(perf_flags);
 524
 525         spin_unlock(&ctx->lock);
 526
 527         curr_rq_unlock_irq_restore(&flags);
 528
 529         return 0;
 530 }
 531
 532 int perf_counter_task_enable(void)
 533 {
 534         struct task_struct *curr = current;
 535         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 536         struct perf_counter *counter;
 537         unsigned long flags;
 538         u64 perf_flags;
 539         int cpu;
 540
 541         if (likely(!ctx->nr_counters))
 542                 return 0;
 543
 544         curr_rq_lock_irq_save(&flags);
 545         cpu = smp_processor_id();
 546
 547         /* force the update of the task clock: */
 548         __task_delta_exec(curr, 1);
 549
 550         perf_counter_task_sched_out(curr, cpu);
 551
 552         spin_lock(&ctx->lock);
 553
 554         /*
 555          * Disable all the counters:
 556          */
 557         perf_flags = hw_perf_save_disable();
 558
 559         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 560                 if (counter->state != PERF_COUNTER_STATE_OFF)
 561                         continue;
 562                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 563                 counter->hw_event.disabled = 0;
 564         }
 565         hw_perf_restore(perf_flags);
 566
 567         spin_unlock(&ctx->lock);
 568
 569         perf_counter_task_sched_in(curr, cpu);
 570
 571         curr_rq_unlock_irq_restore(&flags);
 572
 573         return 0;
 574 }
 575
 576 /*
 577  * Round-robin a context's counters:
 578  */
 579 static void rotate_ctx(struct perf_counter_context *ctx)
 580 {
 581         struct perf_counter *counter;
 582         u64 perf_flags;
 583
 584         if (!ctx->nr_counters)
 585                 return;
 586
 587         spin_lock(&ctx->lock);
 588         /*
 589          * Rotate the first entry last (works just fine for group counters too):
 590          */
 591         perf_flags = hw_perf_save_disable();
 592         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 593                 list_del(&counter->list_entry);
 594                 list_add_tail(&counter->list_entry, &ctx->counter_list);
 595                 break;
 596         }
 597         hw_perf_restore(perf_flags);
 598
 599         spin_unlock(&ctx->lock);
 600 }
 601
 602 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 603 {
 604         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 605         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 606         const int rotate_percpu = 0;
 607
 608         if (rotate_percpu)
 609                 perf_counter_cpu_sched_out(cpuctx);
 610         perf_counter_task_sched_out(curr, cpu);
 611
 612         if (rotate_percpu)
 613                 rotate_ctx(&cpuctx->ctx);
 614         rotate_ctx(ctx);
 615
 616         if (rotate_percpu)
 617                 perf_counter_cpu_sched_in(cpuctx, cpu);
 618         perf_counter_task_sched_in(curr, cpu);
 619 }
 620
 621 /*
 622  * Cross CPU call to read the hardware counter
 623  */
 624 static void __read(void *info)
 625 {
 626         struct perf_counter *counter = info;
 627         unsigned long flags;
 628
 629         curr_rq_lock_irq_save(&flags);
 630         counter->hw_ops->read(counter);
 631         curr_rq_unlock_irq_restore(&flags);
 632 }
 633
 634 static u64 perf_counter_read(struct perf_counter *counter)
 635 {
 636         /*
 637          * If counter is enabled and currently active on a CPU, update the
 638          * value in the counter structure:
 639          */
 640         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 641                 smp_call_function_single(counter->oncpu,
 642                                          __read, counter, 1);
 643         }
 644
 645         return atomic64_read(&counter->count);
 646 }
 647
 648 /*
 649  * Cross CPU call to switch performance data pointers
 650  */
 651 static void __perf_switch_irq_data(void *info)
 652 {
 653         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 654         struct perf_counter *counter = info;
 655         struct perf_counter_context *ctx = counter->ctx;
 656         struct perf_data *oldirqdata = counter->irqdata;
 657
 658         /*
 659          * If this is a task context, we need to check whether it is
 660          * the current task context of this cpu. If not it has been
 661          * scheduled out before the smp call arrived.
 662          */
 663         if (ctx->task) {
 664                 if (cpuctx->task_ctx != ctx)
 665                         return;
 666                 spin_lock(&ctx->lock);
 667         }
 668
 669         /* Change the pointer NMI safe */
 670         atomic_long_set((atomic_long_t *)&counter->irqdata,
 671                         (unsigned long) counter->usrdata);
 672         counter->usrdata = oldirqdata;
 673
 674         if (ctx->task)
 675                 spin_unlock(&ctx->lock);
 676 }
 677
 678 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
 679 {
 680         struct perf_counter_context *ctx = counter->ctx;
 681         struct perf_data *oldirqdata = counter->irqdata;
 682         struct task_struct *task = ctx->task;
 683
 684         if (!task) {
 685                 smp_call_function_single(counter->cpu,
 686                                          __perf_switch_irq_data,
 687                                          counter, 1);
 688                 return counter->usrdata;
 689         }
 690
 691 retry:
 692         spin_lock_irq(&ctx->lock);
 693         if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
 694                 counter->irqdata = counter->usrdata;
 695                 counter->usrdata = oldirqdata;
 696                 spin_unlock_irq(&ctx->lock);
 697                 return oldirqdata;
 698         }
 699         spin_unlock_irq(&ctx->lock);
 700         task_oncpu_function_call(task, __perf_switch_irq_data, counter);
 701         /* Might have failed, because task was scheduled out */
 702         if (counter->irqdata == oldirqdata)
 703                 goto retry;
 704
 705         return counter->usrdata;
 706 }
 707
 708 static void put_context(struct perf_counter_context *ctx)
 709 {
 710         if (ctx->task)
 711                 put_task_struct(ctx->task);
 712 }
 713
 714 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 715 {
 716         struct perf_cpu_context *cpuctx;
 717         struct perf_counter_context *ctx;
 718         struct task_struct *task;
 719
 720         /*
 721          * If cpu is not a wildcard then this is a percpu counter:
 722          */
 723         if (cpu != -1) {
 724                 /* Must be root to operate on a CPU counter: */
 725                 if (!capable(CAP_SYS_ADMIN))
 726                         return ERR_PTR(-EACCES);
 727
 728                 if (cpu < 0 || cpu > num_possible_cpus())
 729                         return ERR_PTR(-EINVAL);
 730
 731                 /*
 732                  * We could be clever and allow to attach a counter to an
 733                  * offline CPU and activate it when the CPU comes up, but
 734                  * that's for later.
 735                  */
 736                 if (!cpu_isset(cpu, cpu_online_map))
 737                         return ERR_PTR(-ENODEV);
 738
 739                 cpuctx = &per_cpu(perf_cpu_context, cpu);
 740                 ctx = &cpuctx->ctx;
 741
 742                 return ctx;
 743         }
 744
 745         rcu_read_lock();
 746         if (!pid)
 747                 task = current;
 748         else
 749                 task = find_task_by_vpid(pid);
 750         if (task)
 751                 get_task_struct(task);
 752         rcu_read_unlock();
 753
 754         if (!task)
 755                 return ERR_PTR(-ESRCH);
 756
 757         ctx = &task->perf_counter_ctx;
 758         ctx->task = task;
 759
 760         /* Reuse ptrace permission checks for now. */
 761         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
 762                 put_context(ctx);
 763                 return ERR_PTR(-EACCES);
 764         }
 765
 766         return ctx;
 767 }
 768
 769 /*
 770  * Called when the last reference to the file is gone.
 771  */
 772 static int perf_release(struct inode *inode, struct file *file)
 773 {
 774         struct perf_counter *counter = file->private_data;
 775         struct perf_counter_context *ctx = counter->ctx;
 776
 777         file->private_data = NULL;
 778
 779         mutex_lock(&counter->mutex);
 780
 781         perf_counter_remove_from_context(counter);
 782         put_context(ctx);
 783
 784         mutex_unlock(&counter->mutex);
 785
 786         kfree(counter);
 787
 788         return 0;
 789 }
 790
 791 /*
 792  * Read the performance counter - simple non blocking version for now
 793  */
 794 static ssize_t
 795 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 796 {
 797         u64 cntval;
 798
 799         if (count != sizeof(cntval))
 800                 return -EINVAL;
 801
 802         mutex_lock(&counter->mutex);
 803         cntval = perf_counter_read(counter);
 804         mutex_unlock(&counter->mutex);
 805
 806         return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
 807 }
 808
 809 static ssize_t
 810 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
 811 {
 812         if (!usrdata->len)
 813                 return 0;
 814
 815         count = min(count, (size_t)usrdata->len);
 816         if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
 817                 return -EFAULT;
 818
 819         /* Adjust the counters */
 820         usrdata->len -= count;
 821         if (!usrdata->len)
 822                 usrdata->rd_idx = 0;
 823         else
 824                 usrdata->rd_idx += count;
 825
 826         return count;
 827 }
 828
 829 static ssize_t
 830 perf_read_irq_data(struct perf_counter  *counter,
 831                    char __user          *buf,
 832                    size_t               count,
 833                    int                  nonblocking)
 834 {
 835         struct perf_data *irqdata, *usrdata;
 836         DECLARE_WAITQUEUE(wait, current);
 837         ssize_t res;
 838
 839         irqdata = counter->irqdata;
 840         usrdata = counter->usrdata;
 841
 842         if (usrdata->len + irqdata->len >= count)
 843                 goto read_pending;
 844
 845         if (nonblocking)
 846                 return -EAGAIN;
 847
 848         spin_lock_irq(&counter->waitq.lock);
 849         __add_wait_queue(&counter->waitq, &wait);
 850         for (;;) {
 851                 set_current_state(TASK_INTERRUPTIBLE);
 852                 if (usrdata->len + irqdata->len >= count)
 853                         break;
 854
 855                 if (signal_pending(current))
 856                         break;
 857
 858                 spin_unlock_irq(&counter->waitq.lock);
 859                 schedule();
 860                 spin_lock_irq(&counter->waitq.lock);
 861         }
 862         __remove_wait_queue(&counter->waitq, &wait);
 863         __set_current_state(TASK_RUNNING);
 864         spin_unlock_irq(&counter->waitq.lock);
 865
 866         if (usrdata->len + irqdata->len < count)
 867                 return -ERESTARTSYS;
 868 read_pending:
 869         mutex_lock(&counter->mutex);
 870
 871         /* Drain pending data first: */
 872         res = perf_copy_usrdata(usrdata, buf, count);
 873         if (res < 0 || res == count)
 874                 goto out;
 875
 876         /* Switch irq buffer: */
 877         usrdata = perf_switch_irq_data(counter);
 878         if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
 879                 if (!res)
 880                         res = -EFAULT;
 881         } else {
 882                 res = count;
 883         }
 884 out:
 885         mutex_unlock(&counter->mutex);
 886
 887         return res;
 888 }
 889
 890 static ssize_t
 891 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 892 {
 893         struct perf_counter *counter = file->private_data;
 894
 895         switch (counter->hw_event.record_type) {
 896         case PERF_RECORD_SIMPLE:
 897                 return perf_read_hw(counter, buf, count);
 898
 899         case PERF_RECORD_IRQ:
 900         case PERF_RECORD_GROUP:
 901                 return perf_read_irq_data(counter, buf, count,
 902                                           file->f_flags & O_NONBLOCK);
 903         }
 904         return -EINVAL;
 905 }
 906
 907 static unsigned int perf_poll(struct file *file, poll_table *wait)
 908 {
 909         struct perf_counter *counter = file->private_data;
 910         unsigned int events = 0;
 911         unsigned long flags;
 912
 913         poll_wait(file, &counter->waitq, wait);
 914
 915         spin_lock_irqsave(&counter->waitq.lock, flags);
 916         if (counter->usrdata->len || counter->irqdata->len)
 917                 events |= POLLIN;
 918         spin_unlock_irqrestore(&counter->waitq.lock, flags);
 919
 920         return events;
 921 }
 922
 923 static const struct file_operations perf_fops = {
 924         .release                = perf_release,
 925         .read                   = perf_read,
 926         .poll                   = perf_poll,
 927 };
 928
 929 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 930 {
 931         return 0;
 932 }
 933
 934 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 935 {
 936 }
 937
 938 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 939 {
 940         int cpu = raw_smp_processor_id();
 941
 942         atomic64_set(&counter->count, cpu_clock(cpu));
 943 }
 944
 945 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 946         .enable         = cpu_clock_perf_counter_enable,
 947         .disable        = cpu_clock_perf_counter_disable,
 948         .read           = cpu_clock_perf_counter_read,
 949 };
 950
 951 /*
 952  * Called from within the scheduler:
 953  */
 954 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
 955 {
 956         struct task_struct *curr = counter->task;
 957         u64 delta;
 958
 959         delta = __task_delta_exec(curr, update);
 960
 961         return curr->se.sum_exec_runtime + delta;
 962 }
 963
 964 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
 965 {
 966         u64 prev;
 967         s64 delta;
 968
 969         prev = atomic64_read(&counter->hw.prev_count);
 970
 971         atomic64_set(&counter->hw.prev_count, now);
 972
 973         delta = now - prev;
 974
 975         atomic64_add(delta, &counter->count);
 976 }
 977
 978 static void task_clock_perf_counter_read(struct perf_counter *counter)
 979 {
 980         u64 now = task_clock_perf_counter_val(counter, 1);
 981
 982         task_clock_perf_counter_update(counter, now);
 983 }
 984
 985 static int task_clock_perf_counter_enable(struct perf_counter *counter)
 986 {
 987         u64 now = task_clock_perf_counter_val(counter, 0);
 988
 989         atomic64_set(&counter->hw.prev_count, now);
 990
 991         return 0;
 992 }
 993
 994 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 995 {
 996         u64 now = task_clock_perf_counter_val(counter, 0);
 997
 998         task_clock_perf_counter_update(counter, now);
 999 }
1000
1001 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1002         .enable         = task_clock_perf_counter_enable,
1003         .disable        = task_clock_perf_counter_disable,
1004         .read           = task_clock_perf_counter_read,
1005 };
1006
1007 static u64 get_page_faults(void)
1008 {
1009         struct task_struct *curr = current;
1010
1011         return curr->maj_flt + curr->min_flt;
1012 }
1013
1014 static void page_faults_perf_counter_update(struct perf_counter *counter)
1015 {
1016         u64 prev, now;
1017         s64 delta;
1018
1019         prev = atomic64_read(&counter->hw.prev_count);
1020         now = get_page_faults();
1021
1022         atomic64_set(&counter->hw.prev_count, now);
1023
1024         delta = now - prev;
1025
1026         atomic64_add(delta, &counter->count);
1027 }
1028
/*
 * Reading simply brings the counter up to date.
 */
static void page_faults_perf_counter_read(struct perf_counter *counter)
{
        page_faults_perf_counter_update(counter);
}
1033
/*
 * The page-fault count is already a per-task value, so there is
 * nothing to clear on switch-in. Always returns 0.
 */
static int page_faults_perf_counter_enable(struct perf_counter *counter)
{
        return 0;
}
1043
/*
 * On disable, account the faults seen since the last update.
 */
static void page_faults_perf_counter_disable(struct perf_counter *counter)
{
        page_faults_perf_counter_update(counter);
}
1048
1049 static const struct hw_perf_counter_ops perf_ops_page_faults = {
1050         .enable         = page_faults_perf_counter_enable,
1051         .disable        = page_faults_perf_counter_disable,
1052         .read           = page_faults_perf_counter_read,
1053 };
1054
1055 static u64 get_context_switches(void)
1056 {
1057         struct task_struct *curr = current;
1058
1059         return curr->nvcsw + curr->nivcsw;
1060 }
1061
1062 static void context_switches_perf_counter_update(struct perf_counter *counter)
1063 {
1064         u64 prev, now;
1065         s64 delta;
1066
1067         prev = atomic64_read(&counter->hw.prev_count);
1068         now = get_context_switches();
1069
1070         atomic64_set(&counter->hw.prev_count, now);
1071
1072         delta = now - prev;
1073
1074         atomic64_add(delta, &counter->count);
1075 }
1076
/*
 * Reading simply brings the counter up to date.
 */
static void context_switches_perf_counter_read(struct perf_counter *counter)
{
        context_switches_perf_counter_update(counter);
}
1081
/*
 * nvcsw + nivcsw is already a per-task value, so there is nothing
 * to clear on switch-in. Always returns 0.
 */
static int context_switches_perf_counter_enable(struct perf_counter *counter)
{
        return 0;
}
1091
/*
 * On disable, account the switches seen since the last update.
 */
static void context_switches_perf_counter_disable(struct perf_counter *counter)
{
        context_switches_perf_counter_update(counter);
}
1096
1097 static const struct hw_perf_counter_ops perf_ops_context_switches = {
1098         .enable         = context_switches_perf_counter_enable,
1099         .disable        = context_switches_perf_counter_disable,
1100         .read           = context_switches_perf_counter_read,
1101 };
1102
1103 static inline u64 get_cpu_migrations(void)
1104 {
1105         return current->se.nr_migrations;
1106 }
1107
1108 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1109 {
1110         u64 prev, now;
1111         s64 delta;
1112
1113         prev = atomic64_read(&counter->hw.prev_count);
1114         now = get_cpu_migrations();
1115
1116         atomic64_set(&counter->hw.prev_count, now);
1117
1118         delta = now - prev;
1119
1120         atomic64_add(delta, &counter->count);
1121 }
1122
/*
 * Reading simply brings the counter up to date.
 */
static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
        cpu_migrations_perf_counter_update(counter);
}
1127
/*
 * se.nr_migrations is already a per-task value, so there is nothing
 * to clear on switch-in. Always returns 0.
 */
static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
        return 0;
}
1137
/*
 * On disable, account the migrations seen since the last update.
 */
static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
        cpu_migrations_perf_counter_update(counter);
}
1142
1143 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1144         .enable         = cpu_migrations_perf_counter_enable,
1145         .disable        = cpu_migrations_perf_counter_disable,
1146         .read           = cpu_migrations_perf_counter_read,
1147 };
1148
1149 static const struct hw_perf_counter_ops *
1150 sw_perf_counter_init(struct perf_counter *counter)
1151 {
1152         const struct hw_perf_counter_ops *hw_ops = NULL;
1153
1154         switch (counter->hw_event.type) {
1155         case PERF_COUNT_CPU_CLOCK:
1156                 hw_ops = &perf_ops_cpu_clock;
1157                 break;
1158         case PERF_COUNT_TASK_CLOCK:
1159                 hw_ops = &perf_ops_task_clock;
1160                 break;
1161         case PERF_COUNT_PAGE_FAULTS:
1162                 hw_ops = &perf_ops_page_faults;
1163                 break;
1164         case PERF_COUNT_CONTEXT_SWITCHES:
1165                 hw_ops = &perf_ops_context_switches;
1166                 break;
1167         case PERF_COUNT_CPU_MIGRATIONS:
1168                 hw_ops = &perf_ops_cpu_migrations;
1169                 break;
1170         default:
1171                 break;
1172         }
1173         return hw_ops;
1174 }
1175
1176 /*
1177  * Allocate and initialize a counter structure
1178  */
1179 static struct perf_counter *
1180 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1181                    int cpu,
1182                    struct perf_counter *group_leader,
1183                    gfp_t gfpflags)
1184 {
1185         const struct hw_perf_counter_ops *hw_ops;
1186         struct perf_counter *counter;
1187
1188         counter = kzalloc(sizeof(*counter), gfpflags);
1189         if (!counter)
1190                 return NULL;
1191
1192         /*
1193          * Single counters are their own group leaders, with an
1194          * empty sibling list:
1195          */
1196         if (!group_leader)
1197                 group_leader = counter;
1198
1199         mutex_init(&counter->mutex);
1200         INIT_LIST_HEAD(&counter->list_entry);
1201         INIT_LIST_HEAD(&counter->sibling_list);
1202         init_waitqueue_head(&counter->waitq);
1203
1204         counter->irqdata                = &counter->data[0];
1205         counter->usrdata                = &counter->data[1];
1206         counter->cpu                    = cpu;
1207         counter->hw_event               = *hw_event;
1208         counter->wakeup_pending         = 0;
1209         counter->group_leader           = group_leader;
1210         counter->hw_ops                 = NULL;
1211
1212         counter->state = PERF_COUNTER_STATE_INACTIVE;
1213         if (hw_event->disabled)
1214                 counter->state = PERF_COUNTER_STATE_OFF;
1215
1216         hw_ops = NULL;
1217         if (!hw_event->raw && hw_event->type < 0)
1218                 hw_ops = sw_perf_counter_init(counter);
1219         if (!hw_ops)
1220                 hw_ops = hw_perf_counter_init(counter);
1221
1222         if (!hw_ops) {
1223                 kfree(counter);
1224                 return NULL;
1225         }
1226         counter->hw_ops = hw_ops;
1227
1228         return counter;
1229 }
1230
1231 /**
1232  * sys_perf_task_open - open a performance counter, associate it to a task/cpu
1233  *
1234  * @hw_event_uptr:      event type attributes for monitoring/sampling
1235  * @pid:                target pid
1236  * @cpu:                target cpu
1237  * @group_fd:           group leader counter fd
1238  */
1239 asmlinkage int
1240 sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1241                       pid_t pid, int cpu, int group_fd)
1242 {
1243         struct perf_counter *counter, *group_leader;
1244         struct perf_counter_hw_event hw_event;
1245         struct perf_counter_context *ctx;
1246         struct file *counter_file = NULL;
1247         struct file *group_file = NULL;
1248         int fput_needed = 0;
1249         int fput_needed2 = 0;
1250         int ret;
1251
1252         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1253                 return -EFAULT;
1254
1255         /*
1256          * Get the target context (task or percpu):
1257          */
1258         ctx = find_get_context(pid, cpu);
1259         if (IS_ERR(ctx))
1260                 return PTR_ERR(ctx);
1261
1262         /*
1263          * Look up the group leader (we will attach this counter to it):
1264          */
1265         group_leader = NULL;
1266         if (group_fd != -1) {
1267                 ret = -EINVAL;
1268                 group_file = fget_light(group_fd, &fput_needed);
1269                 if (!group_file)
1270                         goto err_put_context;
1271                 if (group_file->f_op != &perf_fops)
1272                         goto err_put_context;
1273
1274                 group_leader = group_file->private_data;
1275                 /*
1276                  * Do not allow a recursive hierarchy (this new sibling
1277                  * becoming part of another group-sibling):
1278                  */
1279                 if (group_leader->group_leader != group_leader)
1280                         goto err_put_context;
1281                 /*
1282                  * Do not allow to attach to a group in a different
1283                  * task or CPU context:
1284                  */
1285                 if (group_leader->ctx != ctx)
1286                         goto err_put_context;
1287         }
1288
1289         ret = -EINVAL;
1290         counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
1291         if (!counter)
1292                 goto err_put_context;
1293
1294         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1295         if (ret < 0)
1296                 goto err_free_put_context;
1297
1298         counter_file = fget_light(ret, &fput_needed2);
1299         if (!counter_file)
1300                 goto err_free_put_context;
1301
1302         counter->filp = counter_file;
1303         perf_install_in_context(ctx, counter, cpu);
1304
1305         fput_light(counter_file, fput_needed2);
1306
1307 out_fput:
1308         fput_light(group_file, fput_needed);
1309
1310         return ret;
1311
1312 err_free_put_context:
1313         kfree(counter);
1314
1315 err_put_context:
1316         put_context(ctx);
1317
1318         goto out_fput;
1319 }
1320
1321 /*
1322  * Initialize the perf_counter context in a task_struct:
1323  */
1324 static void
1325 __perf_counter_init_context(struct perf_counter_context *ctx,
1326                             struct task_struct *task)
1327 {
1328         memset(ctx, 0, sizeof(*ctx));
1329         spin_lock_init(&ctx->lock);
1330         INIT_LIST_HEAD(&ctx->counter_list);
1331         ctx->task = task;
1332 }
1333
1334 /*
1335  * inherit a counter from parent task to child task:
1336  */
1337 static int
1338 inherit_counter(struct perf_counter *parent_counter,
1339               struct task_struct *parent,
1340               struct perf_counter_context *parent_ctx,
1341               struct task_struct *child,
1342               struct perf_counter_context *child_ctx)
1343 {
1344         struct perf_counter *child_counter;
1345
1346         child_counter = perf_counter_alloc(&parent_counter->hw_event,
1347                                             parent_counter->cpu, NULL,
1348                                             GFP_ATOMIC);
1349         if (!child_counter)
1350                 return -ENOMEM;
1351
1352         /*
1353          * Link it up in the child's context:
1354          */
1355         child_counter->ctx = child_ctx;
1356         child_counter->task = child;
1357         list_add_counter(child_counter, child_ctx);
1358         child_ctx->nr_counters++;
1359
1360         child_counter->parent = parent_counter;
1361         /*
1362          * inherit into child's child as well:
1363          */
1364         child_counter->hw_event.inherit = 1;
1365
1366         /*
1367          * Get a reference to the parent filp - we will fput it
1368          * when the child counter exits. This is safe to do because
1369          * we are in the parent and we know that the filp still
1370          * exists and has a nonzero count:
1371          */
1372         atomic_long_inc(&parent_counter->filp->f_count);
1373
1374         return 0;
1375 }
1376
1377 static void
1378 __perf_counter_exit_task(struct task_struct *child,
1379                          struct perf_counter *child_counter,
1380                          struct perf_counter_context *child_ctx)
1381 {
1382         struct perf_counter *parent_counter;
1383         u64 parent_val, child_val;
1384
1385         /*
1386          * If we do not self-reap then we have to wait for the
1387          * child task to unschedule (it will happen for sure),
1388          * so that its counter is at its final count. (This
1389          * condition triggers rarely - child tasks usually get
1390          * off their CPU before the parent has a chance to
1391          * get this far into the reaping action)
1392          */
1393         if (child != current) {
1394                 wait_task_inactive(child, 0);
1395                 list_del_init(&child_counter->list_entry);
1396         } else {
1397                 struct perf_cpu_context *cpuctx;
1398                 unsigned long flags;
1399                 u64 perf_flags;
1400
1401                 /*
1402                  * Disable and unlink this counter.
1403                  *
1404                  * Be careful about zapping the list - IRQ/NMI context
1405                  * could still be processing it:
1406                  */
1407                 curr_rq_lock_irq_save(&flags);
1408                 perf_flags = hw_perf_save_disable();
1409
1410                 cpuctx = &__get_cpu_var(perf_cpu_context);
1411
1412                 if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
1413                         child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1414                         child_counter->hw_ops->disable(child_counter);
1415                         cpuctx->active_oncpu--;
1416                         child_ctx->nr_active--;
1417                         child_counter->oncpu = -1;
1418                 }
1419
1420                 list_del_init(&child_counter->list_entry);
1421
1422                 child_ctx->nr_counters--;
1423
1424                 hw_perf_restore(perf_flags);
1425                 curr_rq_unlock_irq_restore(&flags);
1426         }
1427
1428         parent_counter = child_counter->parent;
1429         /*
1430          * It can happen that parent exits first, and has counters
1431          * that are still around due to the child reference. These
1432          * counters need to be zapped - but otherwise linger.
1433          */
1434         if (!parent_counter)
1435                 return;
1436
1437         parent_val = atomic64_read(&parent_counter->count);
1438         child_val = atomic64_read(&child_counter->count);
1439
1440         /*
1441          * Add back the child's count to the parent's count:
1442          */
1443         atomic64_add(child_val, &parent_counter->count);
1444
1445         fput(parent_counter->filp);
1446
1447         kfree(child_counter);
1448 }
1449
1450 /*
1451  * When a child task exist, feed back counter values to parent counters.
1452  *
1453  * Note: we are running in child context, but the PID is not hashed
1454  * anymore so new counters will not be added.
1455  */
1456 void perf_counter_exit_task(struct task_struct *child)
1457 {
1458         struct perf_counter *child_counter, *tmp;
1459         struct perf_counter_context *child_ctx;
1460
1461         child_ctx = &child->perf_counter_ctx;
1462
1463         if (likely(!child_ctx->nr_counters))
1464                 return;
1465
1466         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1467                                  list_entry)
1468                 __perf_counter_exit_task(child, child_counter, child_ctx);
1469 }
1470
1471 /*
1472  * Initialize the perf_counter context in task_struct
1473  */
1474 void perf_counter_init_task(struct task_struct *child)
1475 {
1476         struct perf_counter_context *child_ctx, *parent_ctx;
1477         struct perf_counter *counter, *parent_counter;
1478         struct task_struct *parent = current;
1479         unsigned long flags;
1480
1481         child_ctx  =  &child->perf_counter_ctx;
1482         parent_ctx = &parent->perf_counter_ctx;
1483
1484         __perf_counter_init_context(child_ctx, child);
1485
1486         /*
1487          * This is executed from the parent task context, so inherit
1488          * counters that have been marked for cloning:
1489          */
1490
1491         if (likely(!parent_ctx->nr_counters))
1492                 return;
1493
1494         /*
1495          * Lock the parent list. No need to lock the child - not PID
1496          * hashed yet and not running, so nobody can access it.
1497          */
1498         spin_lock_irqsave(&parent_ctx->lock, flags);
1499
1500         /*
1501          * We dont have to disable NMIs - we are only looking at
1502          * the list, not manipulating it:
1503          */
1504         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
1505                 if (!counter->hw_event.inherit || counter->group_leader != counter)
1506                         continue;
1507
1508                 /*
1509                  * Instead of creating recursive hierarchies of counters,
1510                  * we link inheritd counters back to the original parent,
1511                  * which has a filp for sure, which we use as the reference
1512                  * count:
1513                  */
1514                 parent_counter = counter;
1515                 if (counter->parent)
1516                         parent_counter = counter->parent;
1517
1518                 if (inherit_counter(parent_counter, parent,
1519                                   parent_ctx, child, child_ctx))
1520                         break;
1521         }
1522
1523         spin_unlock_irqrestore(&parent_ctx->lock, flags);
1524 }
1525
1526 static void __cpuinit perf_counter_init_cpu(int cpu)
1527 {
1528         struct perf_cpu_context *cpuctx;
1529
1530         cpuctx = &per_cpu(perf_cpu_context, cpu);
1531         __perf_counter_init_context(&cpuctx->ctx, NULL);
1532
1533         mutex_lock(&perf_resource_mutex);
1534         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
1535         mutex_unlock(&perf_resource_mutex);
1536
1537         hw_perf_counter_setup();
1538 }
1539
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Remove every counter from the per-CPU context of the CPU this
 * function is executing on. @info is unused.
 */
static void __perf_counter_exit_cpu(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter_context *ctx = &cpuctx->ctx;
        struct perf_counter *counter, *next;

        list_for_each_entry_safe(counter, next, &ctx->counter_list, list_entry) {
                __perf_counter_remove_from_context(counter);
        }
}

/*
 * Run the removal on @cpu itself via a cross-CPU function call:
 */
static void perf_counter_exit_cpu(int cpu)
{
        smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
}
#else
/* without CPU hotplug there is nothing to tear down */
static inline void perf_counter_exit_cpu(int cpu) { }
#endif
1558
1559 static int __cpuinit
1560 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
1561 {
1562         unsigned int cpu = (long)hcpu;
1563
1564         switch (action) {
1565
1566         case CPU_UP_PREPARE:
1567         case CPU_UP_PREPARE_FROZEN:
1568                 perf_counter_init_cpu(cpu);
1569                 break;
1570
1571         case CPU_DOWN_PREPARE:
1572         case CPU_DOWN_PREPARE_FROZEN:
1573                 perf_counter_exit_cpu(cpu);
1574                 break;
1575
1576         default:
1577                 break;
1578         }
1579
1580         return NOTIFY_OK;
1581 }
1582
1583 static struct notifier_block __cpuinitdata perf_cpu_nb = {
1584         .notifier_call          = perf_cpu_notify,
1585 };
1586
1587 static int __init perf_counter_init(void)
1588 {
1589         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
1590                         (void *)(long)smp_processor_id());
1591         register_cpu_notifier(&perf_cpu_nb);
1592
1593         return 0;
1594 }
1595 early_initcall(perf_counter_init);
1596
1597 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
1598 {
1599         return sprintf(buf, "%d\n", perf_reserved_percpu);
1600 }
1601
1602 static ssize_t
1603 perf_set_reserve_percpu(struct sysdev_class *class,
1604                         const char *buf,
1605                         size_t count)
1606 {
1607         struct perf_cpu_context *cpuctx;
1608         unsigned long val;
1609         int err, cpu, mpt;
1610
1611         err = strict_strtoul(buf, 10, &val);
1612         if (err)
1613                 return err;
1614         if (val > perf_max_counters)
1615                 return -EINVAL;
1616
1617         mutex_lock(&perf_resource_mutex);
1618         perf_reserved_percpu = val;
1619         for_each_online_cpu(cpu) {
1620                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1621                 spin_lock_irq(&cpuctx->ctx.lock);
1622                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
1623                           perf_max_counters - perf_reserved_percpu);
1624                 cpuctx->max_pertask = mpt;
1625                 spin_unlock_irq(&cpuctx->ctx.lock);
1626         }
1627         mutex_unlock(&perf_resource_mutex);
1628
1629         return count;
1630 }
1631
1632 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
1633 {
1634         return sprintf(buf, "%d\n", perf_overcommit);
1635 }
1636
1637 static ssize_t
1638 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
1639 {
1640         unsigned long val;
1641         int err;
1642
1643         err = strict_strtoul(buf, 10, &val);
1644         if (err)
1645                 return err;
1646         if (val > 1)
1647                 return -EINVAL;
1648
1649         mutex_lock(&perf_resource_mutex);
1650         perf_overcommit = val;
1651         mutex_unlock(&perf_resource_mutex);
1652
1653         return count;
1654 }
1655
1656 static SYSDEV_CLASS_ATTR(
1657                                 reserve_percpu,
1658                                 0644,
1659                                 perf_show_reserve_percpu,
1660                                 perf_set_reserve_percpu
1661                         );
1662
1663 static SYSDEV_CLASS_ATTR(
1664                                 overcommit,
1665                                 0644,
1666                                 perf_show_overcommit,
1667                                 perf_set_overcommit
1668                         );
1669
1670 static struct attribute *perfclass_attrs[] = {
1671         &attr_reserve_percpu.attr,
1672         &attr_overcommit.attr,
1673         NULL
1674 };
1675
1676 static struct attribute_group perfclass_attr_group = {
1677         .attrs                  = perfclass_attrs,
1678         .name                   = "perf_counters",
1679 };
1680
1681 static int __init perf_counter_sysfs_init(void)
1682 {
1683         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
1684                                   &perfclass_attr_group);
1685 }
1686 device_initcall(perf_counter_sysfs_init);