kernel/perf_event.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/sysfs.h>
  19 #include <linux/dcache.h>
  20 #include <linux/percpu.h>
  21 #include <linux/ptrace.h>
  22 #include <linux/vmstat.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/hardirq.h>
  25 #include <linux/rculist.h>
  26 #include <linux/uaccess.h>
  27 #include <linux/syscalls.h>
  28 #include <linux/anon_inodes.h>
  29 #include <linux/kernel_stat.h>
  30 #include <linux/perf_event.h>
  31 #include <linux/ftrace_event.h>
  32 #include <linux/hw_breakpoint.h>
  33
  34 #include <asm/irq_regs.h>
  35
  36 /*
  37  * Each CPU has a list of per CPU events:
  38  */
  39 static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  40
     /*
      * Event budget.  perf_max_events and perf_reserved_percpu are combined
      * in __perf_event_remove_from_context() to recompute
      * cpuctx->max_pertask for a CPU context.
      * NOTE(review): perf_overcommit is not referenced in this part of the
      * file; confirm its meaning against its users.
      */
  41 int perf_max_events __read_mostly = 1;
  42 static int perf_reserved_percpu __read_mostly;
  43 static int perf_overcommit __read_mostly = 1;
  44
     /*
      * NOTE(review): presumably counts of live events, in total and per
      * record type (mmap/comm/task); the code that updates them is not in
      * view here - confirm before relying on this.
      */
  45 static atomic_t nr_events __read_mostly;
  46 static atomic_t nr_mmap_events __read_mostly;
  47 static atomic_t nr_comm_events __read_mostly;
  48 static atomic_t nr_task_events __read_mostly;
  49
  50 /*
  51  * perf event paranoia level:
  52  *  -1 - not paranoid at all
  53  *   0 - disallow raw tracepoint access for unpriv
  54  *   1 - disallow cpu events for unpriv
  55  *   2 - disallow kernel profiling for unpriv
      *
      * The levels are cumulative: the perf_paranoid_*() helpers compare with
      * '>', so each level also applies the restrictions of the lower ones.
      * With the default of 1, perf_paranoid_tracepoint_raw() and
      * perf_paranoid_cpu() are true and perf_paranoid_kernel() is false.
  56  */
  57 int sysctl_perf_event_paranoid __read_mostly = 1;
  58
  59 static inline bool perf_paranoid_tracepoint_raw(void)
  60 {
  61         return sysctl_perf_event_paranoid > -1;
  62 }
  63
  64 static inline bool perf_paranoid_cpu(void)
  65 {
  66         return sysctl_perf_event_paranoid > 0;
  67 }
  68
  69 static inline bool perf_paranoid_kernel(void)
  70 {
  71         return sysctl_perf_event_paranoid > 1;
  72 }
  73
  74 /* Minimum for 128 pages + 1 for the user control page */
     /* (128 + 1) pages * 4 kB = 516 kB; assumes 4 kB pages - TODO confirm */
  75 int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */
  76
  77 /*
  78  * max perf event sample rate
      * NOTE(review): presumably in samples per second - confirm against users.
  79  */
  80 int sysctl_perf_event_sample_rate __read_mostly = 100000;
  81
     /*
      * NOTE(review): presumably the counter from which event ids (event->id)
      * are handed out; the allocation site is not in view here.
      */
  82 static atomic64_t perf_event_id;
  83
  84 /*
  85  * Lock for (sysadmin-configurable) event reservations:
  86  */
  87 static DEFINE_SPINLOCK(perf_resource_lock);
  88
  89 /*
  90  * Architecture provided APIs - weak aliases:
  91  */
  92 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
  93 {
  94         return NULL;
  95 }
  96
  97 void __weak hw_perf_disable(void)               { barrier(); }
  98 void __weak hw_perf_enable(void)                { barrier(); }
  99
 100 void __weak hw_perf_event_setup(int cpu)        { barrier(); }
 101 void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
 102
 103 int __weak
 104 hw_perf_group_sched_in(struct perf_event *group_leader,
 105                struct perf_cpu_context *cpuctx,
 106                struct perf_event_context *ctx, int cpu)
 107 {
 108         return 0;
 109 }
 110
 111 void __weak perf_event_print_debug(void)        { }
 112
 113 static DEFINE_PER_CPU(int, perf_disable_count);
 114
 115 void __perf_disable(void)
 116 {
 117         __get_cpu_var(perf_disable_count)++;
 118 }
 119
 120 bool __perf_enable(void)
 121 {
 122         return !--__get_cpu_var(perf_disable_count);
 123 }
 124
 125 void perf_disable(void)
 126 {
 127         __perf_disable();
 128         hw_perf_disable();
 129 }
 130
 131 void perf_enable(void)
 132 {
 133         if (__perf_enable())
 134                 hw_perf_enable();
 135 }
 136
 137 static void get_ctx(struct perf_event_context *ctx)
 138 {
 139         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 140 }
 141
 142 static void free_ctx(struct rcu_head *head)
 143 {
 144         struct perf_event_context *ctx;
 145
 146         ctx = container_of(head, struct perf_event_context, rcu_head);
 147         kfree(ctx);
 148 }
 149
 150 static void put_ctx(struct perf_event_context *ctx)
 151 {
 152         if (atomic_dec_and_test(&ctx->refcount)) {
 153                 if (ctx->parent_ctx)
 154                         put_ctx(ctx->parent_ctx);
 155                 if (ctx->task)
 156                         put_task_struct(ctx->task);
 157                 call_rcu(&ctx->rcu_head, free_ctx);
 158         }
 159 }
 160
 161 static void unclone_ctx(struct perf_event_context *ctx)
 162 {
 163         if (ctx->parent_ctx) {
 164                 put_ctx(ctx->parent_ctx);
 165                 ctx->parent_ctx = NULL;
 166         }
 167 }
 168
 169 /*
 170  * If we inherit events we want to return the parent event id
 171  * to userspace.
 172  */
 173 static u64 primary_event_id(struct perf_event *event)
 174 {
 175         u64 id = event->id;
 176
 177         if (event->parent)
 178                 id = event->parent->id;
 179
 180         return id;
 181 }
 182
 183 /*
 184  * Get the perf_event_context for a task and lock it.
 185  * This has to cope with with the fact that until it is locked,
 186  * the context could get moved to another task.
 187  */
 188 static struct perf_event_context *
 189 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 190 {
 191         struct perf_event_context *ctx;
 192
 193         rcu_read_lock();
 194  retry:
 195         ctx = rcu_dereference(task->perf_event_ctxp);
 196         if (ctx) {
 197                 /*
 198                  * If this context is a clone of another, it might
 199                  * get swapped for another underneath us by
 200                  * perf_event_task_sched_out, though the
 201                  * rcu_read_lock() protects us from any context
 202                  * getting freed.  Lock the context and check if it
 203                  * got swapped before we could get the lock, and retry
 204                  * if so.  If we locked the right context, then it
 205                  * can't get swapped on us any more.
 206                  */
 207                 raw_spin_lock_irqsave(&ctx->lock, *flags);
 208                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
 209                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 210                         goto retry;
 211                 }
 212
 213                 if (!atomic_inc_not_zero(&ctx->refcount)) {
 214                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 215                         ctx = NULL;
 216                 }
 217         }
 218         rcu_read_unlock();
 219         return ctx;
 220 }
 221
 222 /*
 223  * Get the context for a task and increment its pin_count so it
 224  * can't get swapped to another task.  This also increments its
 225  * reference count so that the context can't get freed.
 226  */
 227 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
 228 {
 229         struct perf_event_context *ctx;
 230         unsigned long flags;
 231
 232         ctx = perf_lock_task_context(task, &flags);
 233         if (ctx) {
 234                 ++ctx->pin_count;
 235                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
 236         }
 237         return ctx;
 238 }
 239
 240 static void perf_unpin_context(struct perf_event_context *ctx)
 241 {
 242         unsigned long flags;
 243
 244         raw_spin_lock_irqsave(&ctx->lock, flags);
 245         --ctx->pin_count;
 246         raw_spin_unlock_irqrestore(&ctx->lock, flags);
 247         put_ctx(ctx);
 248 }
 249
 250 static inline u64 perf_clock(void)
 251 {
 252         return cpu_clock(raw_smp_processor_id());
 253 }
 254
 255 /*
 256  * Update the record of the current time in a context.
 257  */
 258 static void update_context_time(struct perf_event_context *ctx)
 259 {
 260         u64 now = perf_clock();
 261
 262         ctx->time += now - ctx->timestamp;
 263         ctx->timestamp = now;
 264 }
 265
 266 /*
 267  * Update the total_time_enabled and total_time_running fields for a event.
 268  */
 269 static void update_event_times(struct perf_event *event)
 270 {
 271         struct perf_event_context *ctx = event->ctx;
 272         u64 run_end;
 273
 274         if (event->state < PERF_EVENT_STATE_INACTIVE ||
 275             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 276                 return;
 277
 278         if (ctx->is_active)
 279                 run_end = ctx->time;
 280         else
 281                 run_end = event->tstamp_stopped;
 282
 283         event->total_time_enabled = run_end - event->tstamp_enabled;
 284
 285         if (event->state == PERF_EVENT_STATE_INACTIVE)
 286                 run_end = event->tstamp_stopped;
 287         else
 288                 run_end = ctx->time;
 289
 290         event->total_time_running = run_end - event->tstamp_running;
 291 }
 292
 293 /*
 294  * Add a event from the lists for its context.
 295  * Must be called with ctx->mutex and ctx->lock held.
 296  */
 297 static void
 298 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 299 {
 300         struct perf_event *group_leader = event->group_leader;
 301
 302         /*
 303          * Depending on whether it is a standalone or sibling event,
 304          * add it straight to the context's event list, or to the group
 305          * leader's sibling list:
 306          */
 307         if (group_leader == event)
 308                 list_add_tail(&event->group_entry, &ctx->group_list);
 309         else {
 310                 list_add_tail(&event->group_entry, &group_leader->sibling_list);
 311                 group_leader->nr_siblings++;
 312         }
 313
 314         list_add_rcu(&event->event_entry, &ctx->event_list);
 315         ctx->nr_events++;
 316         if (event->attr.inherit_stat)
 317                 ctx->nr_stat++;
 318 }
 319
 320 /*
 321  * Remove a event from the lists for its context.
 322  * Must be called with ctx->mutex and ctx->lock held.
 323  */
 324 static void
 325 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 326 {
 327         struct perf_event *sibling, *tmp;
 328
 329         if (list_empty(&event->group_entry))
 330                 return;
 331         ctx->nr_events--;
 332         if (event->attr.inherit_stat)
 333                 ctx->nr_stat--;
 334
 335         list_del_init(&event->group_entry);
 336         list_del_rcu(&event->event_entry);
 337
 338         if (event->group_leader != event)
 339                 event->group_leader->nr_siblings--;
 340
 341         update_event_times(event);
 342
 343         /*
 344          * If event was in error state, then keep it
 345          * that way, otherwise bogus counts will be
 346          * returned on read(). The only way to get out
 347          * of error state is by explicit re-enabling
 348          * of the event
 349          */
 350         if (event->state > PERF_EVENT_STATE_OFF)
 351                 event->state = PERF_EVENT_STATE_OFF;
 352
 353         /*
 354          * If this was a group event with sibling events then
 355          * upgrade the siblings to singleton events by adding them
 356          * to the context list directly:
 357          */
 358         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 359
 360                 list_move_tail(&sibling->group_entry, &ctx->group_list);
 361                 sibling->group_leader = sibling;
 362         }
 363 }
 364
 365 static void
 366 event_sched_out(struct perf_event *event,
 367                   struct perf_cpu_context *cpuctx,
 368                   struct perf_event_context *ctx)
 369 {
 370         if (event->state != PERF_EVENT_STATE_ACTIVE)
 371                 return;
 372
 373         event->state = PERF_EVENT_STATE_INACTIVE;
 374         if (event->pending_disable) {
 375                 event->pending_disable = 0;
 376                 event->state = PERF_EVENT_STATE_OFF;
 377         }
 378         event->tstamp_stopped = ctx->time;
 379         event->pmu->disable(event);
 380         event->oncpu = -1;
 381
 382         if (!is_software_event(event))
 383                 cpuctx->active_oncpu--;
 384         ctx->nr_active--;
 385         if (event->attr.exclusive || !cpuctx->active_oncpu)
 386                 cpuctx->exclusive = 0;
 387 }
 388
 389 static void
 390 group_sched_out(struct perf_event *group_event,
 391                 struct perf_cpu_context *cpuctx,
 392                 struct perf_event_context *ctx)
 393 {
 394         struct perf_event *event;
 395
 396         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
 397                 return;
 398
 399         event_sched_out(group_event, cpuctx, ctx);
 400
 401         /*
 402          * Schedule out siblings (if any):
 403          */
 404         list_for_each_entry(event, &group_event->sibling_list, group_entry)
 405                 event_sched_out(event, cpuctx, ctx);
 406
 407         if (group_event->attr.exclusive)
 408                 cpuctx->exclusive = 0;
 409 }
 410
 411 /*
 412  * Cross CPU call to remove a performance event
 413  *
 414  * We disable the event on the hardware level first. After that we
 415  * remove it from the context list.
 416  */
 417 static void __perf_event_remove_from_context(void *info)
 418 {
 419         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 420         struct perf_event *event = info;
 421         struct perf_event_context *ctx = event->ctx;
 422
 423         /*
 424          * If this is a task context, we need to check whether it is
 425          * the current task context of this cpu. If not it has been
 426          * scheduled out before the smp call arrived.
 427          */
 428         if (ctx->task && cpuctx->task_ctx != ctx)
 429                 return;
 430
 431         raw_spin_lock(&ctx->lock);
 432         /*
 433          * Protect the list operation against NMI by disabling the
 434          * events on a global level.
 435          */
 436         perf_disable();
 437
 438         event_sched_out(event, cpuctx, ctx);
 439
 440         list_del_event(event, ctx);
 441
 442         if (!ctx->task) {
 443                 /*
 444                  * Allow more per task events with respect to the
 445                  * reservation:
 446                  */
 447                 cpuctx->max_pertask =
 448                         min(perf_max_events - ctx->nr_events,
 449                             perf_max_events - perf_reserved_percpu);
 450         }
 451
 452         perf_enable();
 453         raw_spin_unlock(&ctx->lock);
 454 }
 455
 456
 457 /*
 458  * Remove the event from a task's (or a CPU's) list of events.
 459  *
 460  * Must be called with ctx->mutex held.
 461  *
 462  * CPU events are removed with a smp call. For task events we only
 463  * call when the task is on a CPU.
 464  *
 465  * If event->ctx is a cloned context, callers must make sure that
 466  * every task struct that event->ctx->task could possibly point to
 467  * remains valid.  This is OK when called from perf_release since
 468  * that only calls us on the top-level context, which can't be a clone.
 469  * When called from perf_event_exit_task, it's OK because the
 470  * context has been detached from its task.
 471  */
 472 static void perf_event_remove_from_context(struct perf_event *event)
 473 {
 474         struct perf_event_context *ctx = event->ctx;
 475         struct task_struct *task = ctx->task;
 476
 477         if (!task) {
 478                 /*
 479                  * Per cpu events are removed via an smp call and
 480                  * the removal is always successful.
 481                  */
 482                 smp_call_function_single(event->cpu,
 483                                          __perf_event_remove_from_context,
 484                                          event, 1);
 485                 return;
 486         }
 487
 488 retry:
 489         task_oncpu_function_call(task, __perf_event_remove_from_context,
 490                                  event);
 491
 492         raw_spin_lock_irq(&ctx->lock);
 493         /*
 494          * If the context is active we need to retry the smp call.
 495          */
 496         if (ctx->nr_active && !list_empty(&event->group_entry)) {
 497                 raw_spin_unlock_irq(&ctx->lock);
 498                 goto retry;
 499         }
 500
 501         /*
 502          * The lock prevents that this context is scheduled in so we
 503          * can remove the event safely, if the call above did not
 504          * succeed.
 505          */
 506         if (!list_empty(&event->group_entry))
 507                 list_del_event(event, ctx);
 508         raw_spin_unlock_irq(&ctx->lock);
 509 }
 510
 511 /*
 512  * Update total_time_enabled and total_time_running for all events in a group.
 513  */
 514 static void update_group_times(struct perf_event *leader)
 515 {
 516         struct perf_event *event;
 517
 518         update_event_times(leader);
 519         list_for_each_entry(event, &leader->sibling_list, group_entry)
 520                 update_event_times(event);
 521 }
 522
 523 /*
 524  * Cross CPU call to disable a performance event
 525  */
 526 static void __perf_event_disable(void *info)
 527 {
 528         struct perf_event *event = info;
 529         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 530         struct perf_event_context *ctx = event->ctx;
 531
 532         /*
 533          * If this is a per-task event, need to check whether this
 534          * event's task is the current task on this cpu.
 535          */
 536         if (ctx->task && cpuctx->task_ctx != ctx)
 537                 return;
 538
 539         raw_spin_lock(&ctx->lock);
 540
 541         /*
 542          * If the event is on, turn it off.
 543          * If it is in error state, leave it in error state.
 544          */
 545         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 546                 update_context_time(ctx);
 547                 update_group_times(event);
 548                 if (event == event->group_leader)
 549                         group_sched_out(event, cpuctx, ctx);
 550                 else
 551                         event_sched_out(event, cpuctx, ctx);
 552                 event->state = PERF_EVENT_STATE_OFF;
 553         }
 554
 555         raw_spin_unlock(&ctx->lock);
 556 }
 557
 558 /*
 559  * Disable a event.
 560  *
 561  * If event->ctx is a cloned context, callers must make sure that
 562  * every task struct that event->ctx->task could possibly point to
 563  * remains valid.  This condition is satisifed when called through
 564  * perf_event_for_each_child or perf_event_for_each because they
 565  * hold the top-level event's child_mutex, so any descendant that
 566  * goes to exit will block in sync_child_event.
 567  * When called from perf_pending_event it's OK because event->ctx
 568  * is the current context on this CPU and preemption is disabled,
 569  * hence we can't get into perf_event_task_sched_out for this context.
 570  */
 571 void perf_event_disable(struct perf_event *event)
 572 {
 573         struct perf_event_context *ctx = event->ctx;
 574         struct task_struct *task = ctx->task;
 575
 576         if (!task) {
 577                 /*
 578                  * Disable the event on the cpu that it's on
 579                  */
 580                 smp_call_function_single(event->cpu, __perf_event_disable,
 581                                          event, 1);
 582                 return;
 583         }
 584
 585  retry:
 586         task_oncpu_function_call(task, __perf_event_disable, event);
 587
 588         raw_spin_lock_irq(&ctx->lock);
 589         /*
 590          * If the event is still active, we need to retry the cross-call.
 591          */
 592         if (event->state == PERF_EVENT_STATE_ACTIVE) {
 593                 raw_spin_unlock_irq(&ctx->lock);
 594                 goto retry;
 595         }
 596
 597         /*
 598          * Since we have the lock this context can't be scheduled
 599          * in, so we can change the state safely.
 600          */
 601         if (event->state == PERF_EVENT_STATE_INACTIVE) {
 602                 update_group_times(event);
 603                 event->state = PERF_EVENT_STATE_OFF;
 604         }
 605
 606         raw_spin_unlock_irq(&ctx->lock);
 607 }
 608
 609 static int
 610 event_sched_in(struct perf_event *event,
 611                  struct perf_cpu_context *cpuctx,
 612                  struct perf_event_context *ctx,
 613                  int cpu)
 614 {
 615         if (event->state <= PERF_EVENT_STATE_OFF)
 616                 return 0;
 617
 618         event->state = PERF_EVENT_STATE_ACTIVE;
 619         event->oncpu = cpu;     /* TODO: put 'cpu' into cpuctx->cpu */
 620         /*
 621          * The new state must be visible before we turn it on in the hardware:
 622          */
 623         smp_wmb();
 624
 625         if (event->pmu->enable(event)) {
 626                 event->state = PERF_EVENT_STATE_INACTIVE;
 627                 event->oncpu = -1;
 628                 return -EAGAIN;
 629         }
 630
 631         event->tstamp_running += ctx->time - event->tstamp_stopped;
 632
 633         if (!is_software_event(event))
 634                 cpuctx->active_oncpu++;
 635         ctx->nr_active++;
 636
 637         if (event->attr.exclusive)
 638                 cpuctx->exclusive = 1;
 639
 640         return 0;
 641 }
 642
 643 static int
 644 group_sched_in(struct perf_event *group_event,
 645                struct perf_cpu_context *cpuctx,
 646                struct perf_event_context *ctx,
 647                int cpu)
 648 {
 649         struct perf_event *event, *partial_group;
 650         int ret;
 651
 652         if (group_event->state == PERF_EVENT_STATE_OFF)
 653                 return 0;
 654
 655         ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
 656         if (ret)
 657                 return ret < 0 ? ret : 0;
 658
 659         if (event_sched_in(group_event, cpuctx, ctx, cpu))
 660                 return -EAGAIN;
 661
 662         /*
 663          * Schedule in siblings as one group (if any):
 664          */
 665         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 666                 if (event_sched_in(event, cpuctx, ctx, cpu)) {
 667                         partial_group = event;
 668                         goto group_error;
 669                 }
 670         }
 671
 672         return 0;
 673
 674 group_error:
 675         /*
 676          * Groups can be scheduled in as one unit only, so undo any
 677          * partial group before returning:
 678          */
 679         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 680                 if (event == partial_group)
 681                         break;
 682                 event_sched_out(event, cpuctx, ctx);
 683         }
 684         event_sched_out(group_event, cpuctx, ctx);
 685
 686         return -EAGAIN;
 687 }
 688
 689 /*
 690  * Return 1 for a group consisting entirely of software events,
 691  * 0 if the group contains any hardware events.
 692  */
 693 static int is_software_only_group(struct perf_event *leader)
 694 {
 695         struct perf_event *event;
 696
 697         if (!is_software_event(leader))
 698                 return 0;
 699
 700         list_for_each_entry(event, &leader->sibling_list, group_entry)
 701                 if (!is_software_event(event))
 702                         return 0;
 703
 704         return 1;
 705 }
 706
 707 /*
 708  * Work out whether we can put this event group on the CPU now.
 709  */
 710 static int group_can_go_on(struct perf_event *event,
 711                            struct perf_cpu_context *cpuctx,
 712                            int can_add_hw)
 713 {
 714         /*
 715          * Groups consisting entirely of software events can always go on.
 716          */
 717         if (is_software_only_group(event))
 718                 return 1;
 719         /*
 720          * If an exclusive group is already on, no other hardware
 721          * events can go on.
 722          */
 723         if (cpuctx->exclusive)
 724                 return 0;
 725         /*
 726          * If this group is exclusive and there are already
 727          * events on the CPU, it can't go on.
 728          */
 729         if (event->attr.exclusive && cpuctx->active_oncpu)
 730                 return 0;
 731         /*
 732          * Otherwise, try to add it if all previous groups were able
 733          * to go on.
 734          */
 735         return can_add_hw;
 736 }
 737
 738 static void add_event_to_ctx(struct perf_event *event,
 739                                struct perf_event_context *ctx)
 740 {
 741         list_add_event(event, ctx);
 742         event->tstamp_enabled = ctx->time;
 743         event->tstamp_running = ctx->time;
 744         event->tstamp_stopped = ctx->time;
 745 }
 746
 747 /*
 748  * Cross CPU call to install and enable a performance event
 749  *
 750  * Must be called with ctx->mutex held
      *
      * @info is the struct perf_event to install.  The event is linked into
      * its context and, when it is eligible to run on this CPU right now,
      * scheduled in as well.  perf_install_in_context() invokes this through
      * smp_call_function_single() or task_oncpu_function_call().
 751  */
 752 static void __perf_install_in_context(void *info)
 753 {
 754         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 755         struct perf_event *event = info;
 756         struct perf_event_context *ctx = event->ctx;
 757         struct perf_event *leader = event->group_leader;
 758         int cpu = smp_processor_id();
 759         int err;
 760
 761         /*
 762          * If this is a task context, we need to check whether it is
 763          * the current task context of this cpu. If not it has been
 764          * scheduled out before the smp call arrived.
 765          * Or possibly this is the right context but it isn't
 766          * on this cpu because it had no events.
              *
              * In that last case (no task context installed on this cpu and
              * ctx->task is the running task) the context is adopted as this
              * cpu's task context.
 767          */
 768         if (ctx->task && cpuctx->task_ctx != ctx) {
 769                 if (cpuctx->task_ctx || ctx->task != current)
 770                         return;
 771                 cpuctx->task_ctx = ctx;
 772         }
 773
 774         raw_spin_lock(&ctx->lock);
 775         ctx->is_active = 1;
 776         update_context_time(ctx);
 777
 778         /*
 779          * Protect the list operation against NMI by disabling the
 780          * events on a global level. NOP for non NMI based events.
 781          */
 782         perf_disable();
 783
 784         add_event_to_ctx(event, ctx);
 785
             /*
              * The event is on the context lists from here on.  Only try to
              * schedule it in if it is not tied to a different cpu; a cpu
              * value of -1 passes this check on every cpu.
              */
 786         if (event->cpu != -1 && event->cpu != smp_processor_id())
 787                 goto unlock;
 788
 789         /*
 790          * Don't put the event on if it is disabled or if
 791          * it is in a group and the group isn't on.
 792          */
 793         if (event->state != PERF_EVENT_STATE_INACTIVE ||
 794             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
 795                 goto unlock;
 796
 797         /*
 798          * An exclusive event can't go on if there are already active
 799          * hardware events, and no hardware event can go on if there
 800          * is already an exclusive event on.
              *
              * err is assigned only on this path; both early exits above
              * jump to unlock without reading it.
 801          */
 802         if (!group_can_go_on(event, cpuctx, 1))
 803                 err = -EEXIST;
 804         else
 805                 err = event_sched_in(event, cpuctx, ctx, cpu);
 806
 807         if (err) {
 808                 /*
 809                  * This event couldn't go on.  If it is in a group
 810                  * then we have to pull the whole group off.
 811                  * If the event group is pinned then put it in error state.
 812                  */
 813                 if (leader != event)
 814                         group_sched_out(leader, cpuctx, ctx);
 815                 if (leader->attr.pinned) {
 816                         update_group_times(leader);
 817                         leader->state = PERF_EVENT_STATE_ERROR;
 818                 }
 819         }
 820
             /*
              * A CPU-context event (no task) that went on lowers
              * cpuctx->max_pertask by one, never below zero.
              * NOTE(review): presumably the number of slots left for
              * per-task events - confirm against the users of max_pertask.
              */
 821         if (!err && !ctx->task && cpuctx->max_pertask)
 822                 cpuctx->max_pertask--;
 823
 824  unlock:
 825         perf_enable();
 826
 827         raw_spin_unlock(&ctx->lock);
 828 }
 829
 830 /*
 831  * Attach a performance event to a context
 832  *
 833  * First we add the event to the list with the hardware enable bit
 834  * in event->hw_config cleared.
 835  *
 836  * If the event is attached to a task which is on a CPU we use a smp
 837  * call to enable it in the task context. The task might have been
 838  * scheduled away, but we check this in the smp call again.
 839  *
 840  * Must be called with ctx->mutex held.
 841  */
 842 static void
 843 perf_install_in_context(struct perf_event_context *ctx,
 844                         struct perf_event *event,
 845                         int cpu)
 846 {
 847         struct task_struct *task = ctx->task;
 848
 849         if (!task) {
 850                 /*
 851                  * Per cpu events are installed via an smp call and
 852                  * the install is always successful.
 853                  */
 854                 smp_call_function_single(cpu, __perf_install_in_context,
 855                                          event, 1);
 856                 return;
 857         }
 858
 859 retry:
 860         task_oncpu_function_call(task, __perf_install_in_context,
 861                                  event);
 862
 863         raw_spin_lock_irq(&ctx->lock);
 864         /*
 865          * we need to retry the smp call.
 866          */
 867         if (ctx->is_active && list_empty(&event->group_entry)) {
 868                 raw_spin_unlock_irq(&ctx->lock);
 869                 goto retry;
 870         }
 871
 872         /*
 873          * The lock prevents that this context is scheduled in so we
 874          * can add the event safely, if it the call above did not
 875          * succeed.
 876          */
 877         if (list_empty(&event->group_entry))
 878                 add_event_to_ctx(event, ctx);
 879         raw_spin_unlock_irq(&ctx->lock);
 880 }
 881
 882 /*
 883  * Put a event into inactive state and update time fields.
 884  * Enabling the leader of a group effectively enables all
 885  * the group members that aren't explicitly disabled, so we
 886  * have to update their ->tstamp_enabled also.
 887  * Note: this works for group members as well as group leaders
 888  * since the non-leader members' sibling_lists will be empty.
 889  */
 890 static void __perf_event_mark_enabled(struct perf_event *event,
 891                                         struct perf_event_context *ctx)
 892 {
 893         struct perf_event *sub;
 894
 895         event->state = PERF_EVENT_STATE_INACTIVE;
 896         event->tstamp_enabled = ctx->time - event->total_time_enabled;
 897         list_for_each_entry(sub, &event->sibling_list, group_entry)
 898                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 899                         sub->tstamp_enabled =
 900                                 ctx->time - sub->total_time_enabled;
 901 }
 902
 903 /*
 904  * Cross CPU call to enable a performance event
 905  */
 906 static void __perf_event_enable(void *info)
 907 {
 908         struct perf_event *event = info;
 909         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 910         struct perf_event_context *ctx = event->ctx;
 911         struct perf_event *leader = event->group_leader;
 912         int err;
 913
 914         /*
 915          * If this is a per-task event, need to check whether this
 916          * event's task is the current task on this cpu.
 917          */
 918         if (ctx->task && cpuctx->task_ctx != ctx) {
 919                 if (cpuctx->task_ctx || ctx->task != current)
 920                         return;
 921                 cpuctx->task_ctx = ctx;
 922         }
 923
 924         raw_spin_lock(&ctx->lock);
 925         ctx->is_active = 1;
 926         update_context_time(ctx);
 927
 928         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 929                 goto unlock;
 930         __perf_event_mark_enabled(event, ctx);
 931
 932         if (event->cpu != -1 && event->cpu != smp_processor_id())
 933                 goto unlock;
 934
 935         /*
 936          * If the event is in a group and isn't the group leader,
 937          * then don't put it on unless the group is on.
 938          */
 939         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 940                 goto unlock;
 941
 942         if (!group_can_go_on(event, cpuctx, 1)) {
 943                 err = -EEXIST;
 944         } else {
 945                 perf_disable();
 946                 if (event == leader)
 947                         err = group_sched_in(event, cpuctx, ctx,
 948                                              smp_processor_id());
 949                 else
 950                         err = event_sched_in(event, cpuctx, ctx,
 951                                                smp_processor_id());
 952                 perf_enable();
 953         }
 954
 955         if (err) {
 956                 /*
 957                  * If this event can't go on and it's part of a
 958                  * group, then the whole group has to come off.
 959                  */
 960                 if (leader != event)
 961                         group_sched_out(leader, cpuctx, ctx);
 962                 if (leader->attr.pinned) {
 963                         update_group_times(leader);
 964                         leader->state = PERF_EVENT_STATE_ERROR;
 965                 }
 966         }
 967
 968  unlock:
 969         raw_spin_unlock(&ctx->lock);
 970 }
 971
 972 /*
 973  * Enable a event.
 974  *
 975  * If event->ctx is a cloned context, callers must make sure that
 976  * every task struct that event->ctx->task could possibly point to
 977  * remains valid.  This condition is satisfied when called through
 978  * perf_event_for_each_child or perf_event_for_each as described
 979  * for perf_event_disable.
 980  */
 981 void perf_event_enable(struct perf_event *event)
 982 {
 983         struct perf_event_context *ctx = event->ctx;
 984         struct task_struct *task = ctx->task;
 985
 986         if (!task) {
 987                 /*
 988                  * Enable the event on the cpu that it's on
 989                  */
 990                 smp_call_function_single(event->cpu, __perf_event_enable,
 991                                          event, 1);
 992                 return;
 993         }
 994
 995         raw_spin_lock_irq(&ctx->lock);
 996         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 997                 goto out;
 998
 999         /*
1000          * If the event is in error state, clear that first.
1001          * That way, if we see the event in error state below, we
1002          * know that it has gone back into error state, as distinct
1003          * from the task having been scheduled away before the
1004          * cross-call arrived.
1005          */
1006         if (event->state == PERF_EVENT_STATE_ERROR)
1007                 event->state = PERF_EVENT_STATE_OFF;
1008
1009  retry:
1010         raw_spin_unlock_irq(&ctx->lock);
1011         task_oncpu_function_call(task, __perf_event_enable, event);
1012
1013         raw_spin_lock_irq(&ctx->lock);
1014
1015         /*
1016          * If the context is active and the event is still off,
1017          * we need to retry the cross-call.
1018          */
1019         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
1020                 goto retry;
1021
1022         /*
1023          * Since we have the lock this context can't be scheduled
1024          * in, so we can change the state safely.
1025          */
1026         if (event->state == PERF_EVENT_STATE_OFF)
1027                 __perf_event_mark_enabled(event, ctx);
1028
1029  out:
1030         raw_spin_unlock_irq(&ctx->lock);
1031 }
1032
1033 static int perf_event_refresh(struct perf_event *event, int refresh)
1034 {
1035         /*
1036          * not supported on inherited events
1037          */
1038         if (event->attr.inherit)
1039                 return -EINVAL;
1040
1041         atomic_add(refresh, &event->event_limit);
1042         perf_event_enable(event);
1043
1044         return 0;
1045 }
1046
1047 void __perf_event_sched_out(struct perf_event_context *ctx,
1048                               struct perf_cpu_context *cpuctx)
1049 {
1050         struct perf_event *event;
1051
1052         raw_spin_lock(&ctx->lock);
1053         ctx->is_active = 0;
1054         if (likely(!ctx->nr_events))
1055                 goto out;
1056         update_context_time(ctx);
1057
1058         perf_disable();
1059         if (ctx->nr_active) {
1060                 list_for_each_entry(event, &ctx->group_list, group_entry)
1061                         group_sched_out(event, cpuctx, ctx);
1062         }
1063         perf_enable();
1064  out:
1065         raw_spin_unlock(&ctx->lock);
1066 }
1067
1068 /*
1069  * Test whether two contexts are equivalent, i.e. whether they
1070  * have both been cloned from the same version of the same context
1071  * and they both have the same number of enabled events.
1072  * If the number of enabled events is the same, then the set
1073  * of enabled events should be the same, because these are both
1074  * inherited contexts, therefore we can't access individual events
1075  * in them directly with an fd; we can only enable/disable all
1076  * events via prctl, or enable/disable all events in a family
1077  * via ioctl, which will have the same effect on both contexts.
1078  */
1079 static int context_equiv(struct perf_event_context *ctx1,
1080                          struct perf_event_context *ctx2)
1081 {
1082         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1083                 && ctx1->parent_gen == ctx2->parent_gen
1084                 && !ctx1->pin_count && !ctx2->pin_count;
1085 }
1086
1087 static void __perf_event_sync_stat(struct perf_event *event,
1088                                      struct perf_event *next_event)
1089 {
1090         u64 value;
1091
1092         if (!event->attr.inherit_stat)
1093                 return;
1094
1095         /*
1096          * Update the event value, we cannot use perf_event_read()
1097          * because we're in the middle of a context switch and have IRQs
1098          * disabled, which upsets smp_call_function_single(), however
1099          * we know the event must be on the current CPU, therefore we
1100          * don't need to use it.
1101          */
1102         switch (event->state) {
1103         case PERF_EVENT_STATE_ACTIVE:
1104                 event->pmu->read(event);
1105                 /* fall-through */
1106
1107         case PERF_EVENT_STATE_INACTIVE:
1108                 update_event_times(event);
1109                 break;
1110
1111         default:
1112                 break;
1113         }
1114
1115         /*
1116          * In order to keep per-task stats reliable we need to flip the event
1117          * values when we flip the contexts.
1118          */
1119         value = atomic64_read(&next_event->count);
1120         value = atomic64_xchg(&event->count, value);
1121         atomic64_set(&next_event->count, value);
1122
1123         swap(event->total_time_enabled, next_event->total_time_enabled);
1124         swap(event->total_time_running, next_event->total_time_running);
1125
1126         /*
1127          * Since we swizzled the values, update the user visible data too.
1128          */
1129         perf_event_update_userpage(event);
1130         perf_event_update_userpage(next_event);
1131 }
1132
/*
 * list_next_entry - get the element following @pos in its list
 * @pos:	pointer to the current element
 * @member:	name of the struct list_head field inside the element
 *
 * @pos is parenthesized so that arbitrary pointer expressions (for
 * example &node or ptr + 1) expand correctly, not only plain
 * identifiers.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1135
1136 static void perf_event_sync_stat(struct perf_event_context *ctx,
1137                                    struct perf_event_context *next_ctx)
1138 {
1139         struct perf_event *event, *next_event;
1140
1141         if (!ctx->nr_stat)
1142                 return;
1143
1144         update_context_time(ctx);
1145
1146         event = list_first_entry(&ctx->event_list,
1147                                    struct perf_event, event_entry);
1148
1149         next_event = list_first_entry(&next_ctx->event_list,
1150                                         struct perf_event, event_entry);
1151
1152         while (&event->event_entry != &ctx->event_list &&
1153                &next_event->event_entry != &next_ctx->event_list) {
1154
1155                 __perf_event_sync_stat(event, next_event);
1156
1157                 event = list_next_entry(event, event_entry);
1158                 next_event = list_next_entry(next_event, event_entry);
1159         }
1160 }
1161
1162 /*
1163  * Called from scheduler to remove the events of the current task,
1164  * with interrupts disabled.
1165  *
1166  * We stop each event and update the event value in event->count.
1167  *
1168  * This does not protect us against NMI, but disable()
1169  * sets the disabled bit in the control field of event _before_
1170  * accessing the event control register. If a NMI hits, then it will
1171  * not restart the event.
1172  */
1173 void perf_event_task_sched_out(struct task_struct *task,
1174                                  struct task_struct *next, int cpu)
1175 {
1176         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1177         struct perf_event_context *ctx = task->perf_event_ctxp;
1178         struct perf_event_context *next_ctx;
1179         struct perf_event_context *parent;
1180         struct pt_regs *regs;
1181         int do_switch = 1;
1182
1183         regs = task_pt_regs(task);
1184         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1185
1186         if (likely(!ctx || !cpuctx->task_ctx))
1187                 return;
1188
1189         rcu_read_lock();
1190         parent = rcu_dereference(ctx->parent_ctx);
1191         next_ctx = next->perf_event_ctxp;
1192         if (parent && next_ctx &&
1193             rcu_dereference(next_ctx->parent_ctx) == parent) {
1194                 /*
1195                  * Looks like the two contexts are clones, so we might be
1196                  * able to optimize the context switch.  We lock both
1197                  * contexts and check that they are clones under the
1198                  * lock (including re-checking that neither has been
1199                  * uncloned in the meantime).  It doesn't matter which
1200                  * order we take the locks because no other cpu could
1201                  * be trying to lock both of these tasks.
1202                  */
1203                 raw_spin_lock(&ctx->lock);
1204                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1205                 if (context_equiv(ctx, next_ctx)) {
1206                         /*
1207                          * XXX do we need a memory barrier of sorts
1208                          * wrt to rcu_dereference() of perf_event_ctxp
1209                          */
1210                         task->perf_event_ctxp = next_ctx;
1211                         next->perf_event_ctxp = ctx;
1212                         ctx->task = next;
1213                         next_ctx->task = task;
1214                         do_switch = 0;
1215
1216                         perf_event_sync_stat(ctx, next_ctx);
1217                 }
1218                 raw_spin_unlock(&next_ctx->lock);
1219                 raw_spin_unlock(&ctx->lock);
1220         }
1221         rcu_read_unlock();
1222
1223         if (do_switch) {
1224                 __perf_event_sched_out(ctx, cpuctx);
1225                 cpuctx->task_ctx = NULL;
1226         }
1227 }
1228
1229 /*
1230  * Called with IRQs disabled
1231  */
1232 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1233 {
1234         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1235
1236         if (!cpuctx->task_ctx)
1237                 return;
1238
1239         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1240                 return;
1241
1242         __perf_event_sched_out(ctx, cpuctx);
1243         cpuctx->task_ctx = NULL;
1244 }
1245
1246 /*
1247  * Called with IRQs disabled
1248  */
1249 static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1250 {
1251         __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1252 }
1253
1254 static void
1255 __perf_event_sched_in(struct perf_event_context *ctx,
1256                         struct perf_cpu_context *cpuctx, int cpu)
1257 {
1258         struct perf_event *event;
1259         int can_add_hw = 1;
1260
1261         raw_spin_lock(&ctx->lock);
1262         ctx->is_active = 1;
1263         if (likely(!ctx->nr_events))
1264                 goto out;
1265
1266         ctx->timestamp = perf_clock();
1267
1268         perf_disable();
1269
1270         /*
1271          * First go through the list and put on any pinned groups
1272          * in order to give them the best chance of going on.
1273          */
1274         list_for_each_entry(event, &ctx->group_list, group_entry) {
1275                 if (event->state <= PERF_EVENT_STATE_OFF ||
1276                     !event->attr.pinned)
1277                         continue;
1278                 if (event->cpu != -1 && event->cpu != cpu)
1279                         continue;
1280
1281                 if (group_can_go_on(event, cpuctx, 1))
1282                         group_sched_in(event, cpuctx, ctx, cpu);
1283
1284                 /*
1285                  * If this pinned group hasn't been scheduled,
1286                  * put it in error state.
1287                  */
1288                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1289                         update_group_times(event);
1290                         event->state = PERF_EVENT_STATE_ERROR;
1291                 }
1292         }
1293
1294         list_for_each_entry(event, &ctx->group_list, group_entry) {
1295                 /*
1296                  * Ignore events in OFF or ERROR state, and
1297                  * ignore pinned events since we did them already.
1298                  */
1299                 if (event->state <= PERF_EVENT_STATE_OFF ||
1300                     event->attr.pinned)
1301                         continue;
1302
1303                 /*
1304                  * Listen to the 'cpu' scheduling filter constraint
1305                  * of events:
1306                  */
1307                 if (event->cpu != -1 && event->cpu != cpu)
1308                         continue;
1309
1310                 if (group_can_go_on(event, cpuctx, can_add_hw))
1311                         if (group_sched_in(event, cpuctx, ctx, cpu))
1312                                 can_add_hw = 0;
1313         }
1314         perf_enable();
1315  out:
1316         raw_spin_unlock(&ctx->lock);
1317 }
1318
1319 /*
1320  * Called from scheduler to add the events of the current task
1321  * with interrupts disabled.
1322  *
1323  * We restore the event value and then enable it.
1324  *
1325  * This does not protect us against NMI, but enable()
1326  * sets the enabled bit in the control field of event _before_
1327  * accessing the event control register. If a NMI hits, then it will
1328  * keep the event running.
1329  */
1330 void perf_event_task_sched_in(struct task_struct *task, int cpu)
1331 {
1332         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1333         struct perf_event_context *ctx = task->perf_event_ctxp;
1334
1335         if (likely(!ctx))
1336                 return;
1337         if (cpuctx->task_ctx == ctx)
1338                 return;
1339         __perf_event_sched_in(ctx, cpuctx, cpu);
1340         cpuctx->task_ctx = ctx;
1341 }
1342
1343 static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1344 {
1345         struct perf_event_context *ctx = &cpuctx->ctx;
1346
1347         __perf_event_sched_in(ctx, cpuctx, cpu);
1348 }
1349
/*
 * Sentinel value for hw_perf_event::interrupts.  perf_ctx_adjust_freq()
 * unthrottles, on the tick, any event whose interrupt count equals it.
 */
#define MAX_INTERRUPTS (~0ULL)

/* Forward declaration; called from perf_ctx_adjust_freq() below. */
static void perf_log_throttle(struct perf_event *event, int enable);
1353
1354 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1355 {
1356         u64 frequency = event->attr.sample_freq;
1357         u64 sec = NSEC_PER_SEC;
1358         u64 divisor, dividend;
1359
1360         int count_fls, nsec_fls, frequency_fls, sec_fls;
1361
1362         count_fls = fls64(count);
1363         nsec_fls = fls64(nsec);
1364         frequency_fls = fls64(frequency);
1365         sec_fls = 30;
1366
1367         /*
1368          * We got @count in @nsec, with a target of sample_freq HZ
1369          * the target period becomes:
1370          *
1371          *             @count * 10^9
1372          * period = -------------------
1373          *          @nsec * sample_freq
1374          *
1375          */
1376
1377         /*
1378          * Reduce accuracy by one bit such that @a and @b converge
1379          * to a similar magnitude.
1380          */
1381 #define REDUCE_FLS(a, b)                \
1382 do {                                    \
1383         if (a##_fls > b##_fls) {        \
1384                 a >>= 1;                \
1385                 a##_fls--;              \
1386         } else {                        \
1387                 b >>= 1;                \
1388                 b##_fls--;              \
1389         }                               \
1390 } while (0)
1391
1392         /*
1393          * Reduce accuracy until either term fits in a u64, then proceed with
1394          * the other, so that finally we can do a u64/u64 division.
1395          */
1396         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1397                 REDUCE_FLS(nsec, frequency);
1398                 REDUCE_FLS(sec, count);
1399         }
1400
1401         if (count_fls + sec_fls > 64) {
1402                 divisor = nsec * frequency;
1403
1404                 while (count_fls + sec_fls > 64) {
1405                         REDUCE_FLS(count, sec);
1406                         divisor >>= 1;
1407                 }
1408
1409                 dividend = count * sec;
1410         } else {
1411                 dividend = count * sec;
1412
1413                 while (nsec_fls + frequency_fls > 64) {
1414                         REDUCE_FLS(nsec, frequency);
1415                         dividend >>= 1;
1416                 }
1417
1418                 divisor = nsec * frequency;
1419         }
1420
1421         if (!divisor)
1422                 return dividend;
1423
1424         return div64_u64(dividend, divisor);
1425 }
1426
1427 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1428 {
1429         struct hw_perf_event *hwc = &event->hw;
1430         s64 period, sample_period;
1431         s64 delta;
1432
1433         period = perf_calculate_period(event, nsec, count);
1434
1435         delta = (s64)(period - hwc->sample_period);
1436         delta = (delta + 7) / 8; /* low pass filter */
1437
1438         sample_period = hwc->sample_period + delta;
1439
1440         if (!sample_period)
1441                 sample_period = 1;
1442
1443         hwc->sample_period = sample_period;
1444
1445         if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1446                 perf_disable();
1447                 event->pmu->disable(event);
1448                 atomic64_set(&hwc->period_left, 0);
1449                 event->pmu->enable(event);
1450                 perf_enable();
1451         }
1452 }
1453
1454 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1455 {
1456         struct perf_event *event;
1457         struct hw_perf_event *hwc;
1458         u64 interrupts, now;
1459         s64 delta;
1460
1461         raw_spin_lock(&ctx->lock);
1462         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1463                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1464                         continue;
1465
1466                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1467                         continue;
1468
1469                 hwc = &event->hw;
1470
1471                 interrupts = hwc->interrupts;
1472                 hwc->interrupts = 0;
1473
1474                 /*
1475                  * unthrottle events on the tick
1476                  */
1477                 if (interrupts == MAX_INTERRUPTS) {
1478                         perf_log_throttle(event, 1);
1479                         event->pmu->unthrottle(event);
1480                 }
1481
1482                 if (!event->attr.freq || !event->attr.sample_freq)
1483                         continue;
1484
1485                 event->pmu->read(event);
1486                 now = atomic64_read(&event->count);
1487                 delta = now - hwc->freq_count_stamp;
1488                 hwc->freq_count_stamp = now;
1489
1490                 if (delta > 0)
1491                         perf_adjust_period(event, TICK_NSEC, delta);
1492         }
1493         raw_spin_unlock(&ctx->lock);
1494 }
1495
1496 /*
1497  * Round-robin a context's events:
1498  */
1499 static void rotate_ctx(struct perf_event_context *ctx)
1500 {
1501         struct perf_event *event;
1502
1503         if (!ctx->nr_events)
1504                 return;
1505
1506         raw_spin_lock(&ctx->lock);
1507         /*
1508          * Rotate the first entry last (works just fine for group events too):
1509          */
1510         perf_disable();
1511         list_for_each_entry(event, &ctx->group_list, group_entry) {
1512                 list_move_tail(&event->group_entry, &ctx->group_list);
1513                 break;
1514         }
1515         perf_enable();
1516
1517         raw_spin_unlock(&ctx->lock);
1518 }
1519
1520 void perf_event_task_tick(struct task_struct *curr, int cpu)
1521 {
1522         struct perf_cpu_context *cpuctx;
1523         struct perf_event_context *ctx;
1524
1525         if (!atomic_read(&nr_events))
1526                 return;
1527
1528         cpuctx = &per_cpu(perf_cpu_context, cpu);
1529         ctx = curr->perf_event_ctxp;
1530
1531         perf_ctx_adjust_freq(&cpuctx->ctx);
1532         if (ctx)
1533                 perf_ctx_adjust_freq(ctx);
1534
1535         perf_event_cpu_sched_out(cpuctx);
1536         if (ctx)
1537                 __perf_event_task_sched_out(ctx);
1538
1539         rotate_ctx(&cpuctx->ctx);
1540         if (ctx)
1541                 rotate_ctx(ctx);
1542
1543         perf_event_cpu_sched_in(cpuctx, cpu);
1544         if (ctx)
1545                 perf_event_task_sched_in(curr, cpu);
1546 }
1547
1548 /*
1549  * Enable all of a task's events that have been marked enable-on-exec.
1550  * This expects task == current.
1551  */
1552 static void perf_event_enable_on_exec(struct task_struct *task)
1553 {
1554         struct perf_event_context *ctx;
1555         struct perf_event *event;
1556         unsigned long flags;
1557         int enabled = 0;
1558
1559         local_irq_save(flags);
1560         ctx = task->perf_event_ctxp;
1561         if (!ctx || !ctx->nr_events)
1562                 goto out;
1563
1564         __perf_event_task_sched_out(ctx);
1565
1566         raw_spin_lock(&ctx->lock);
1567
1568         list_for_each_entry(event, &ctx->group_list, group_entry) {
1569                 if (!event->attr.enable_on_exec)
1570                         continue;
1571                 event->attr.enable_on_exec = 0;
1572                 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1573                         continue;
1574                 __perf_event_mark_enabled(event, ctx);
1575                 enabled = 1;
1576         }
1577
1578         /*
1579          * Unclone this context if we enabled any event.
1580          */
1581         if (enabled)
1582                 unclone_ctx(ctx);
1583
1584         raw_spin_unlock(&ctx->lock);
1585
1586         perf_event_task_sched_in(task, smp_processor_id());
1587  out:
1588         local_irq_restore(flags);
1589 }
1590
1591 /*
1592  * Cross CPU call to read the hardware event
1593  */
1594 static void __perf_event_read(void *info)
1595 {
1596         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1597         struct perf_event *event = info;
1598         struct perf_event_context *ctx = event->ctx;
1599
1600         /*
1601          * If this is a task context, we need to check whether it is
1602          * the current task context of this cpu.  If not it has been
1603          * scheduled out before the smp call arrived.  In that case
1604          * event->count would have been updated to a recent sample
1605          * when the event was scheduled out.
1606          */
1607         if (ctx->task && cpuctx->task_ctx != ctx)
1608                 return;
1609
1610         raw_spin_lock(&ctx->lock);
1611         update_context_time(ctx);
1612         update_event_times(event);
1613         raw_spin_unlock(&ctx->lock);
1614
1615         event->pmu->read(event);
1616 }
1617
1618 static u64 perf_event_read(struct perf_event *event)
1619 {
1620         /*
1621          * If event is enabled and currently active on a CPU, update the
1622          * value in the event structure:
1623          */
1624         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1625                 smp_call_function_single(event->oncpu,
1626                                          __perf_event_read, event, 1);
1627         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1628                 struct perf_event_context *ctx = event->ctx;
1629                 unsigned long flags;
1630
1631                 raw_spin_lock_irqsave(&ctx->lock, flags);
1632                 update_context_time(ctx);
1633                 update_event_times(event);
1634                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1635         }
1636
1637         return atomic64_read(&event->count);
1638 }
1639
1640 /*
1641  * Initialize the perf_event context in a task_struct:
1642  */
1643 static void
1644 __perf_event_init_context(struct perf_event_context *ctx,
1645                             struct task_struct *task)
1646 {
1647         raw_spin_lock_init(&ctx->lock);
1648         mutex_init(&ctx->mutex);
1649         INIT_LIST_HEAD(&ctx->group_list);
1650         INIT_LIST_HEAD(&ctx->event_list);
1651         atomic_set(&ctx->refcount, 1);
1652         ctx->task = task;
1653 }
1654
1655 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1656 {
1657         struct perf_event_context *ctx;
1658         struct perf_cpu_context *cpuctx;
1659         struct task_struct *task;
1660         unsigned long flags;
1661         int err;
1662
1663         if (pid == -1 && cpu != -1) {
1664                 /* Must be root to operate on a CPU event: */
1665                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1666                         return ERR_PTR(-EACCES);
1667
1668                 if (cpu < 0 || cpu >= nr_cpumask_bits)
1669                         return ERR_PTR(-EINVAL);
1670
1671                 /*
1672                  * We could be clever and allow to attach a event to an
1673                  * offline CPU and activate it when the CPU comes up, but
1674                  * that's for later.
1675                  */
1676                 if (!cpu_online(cpu))
1677                         return ERR_PTR(-ENODEV);
1678
1679                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1680                 ctx = &cpuctx->ctx;
1681                 get_ctx(ctx);
1682
1683                 return ctx;
1684         }
1685
1686         rcu_read_lock();
1687         if (!pid)
1688                 task = current;
1689         else
1690                 task = find_task_by_vpid(pid);
1691         if (task)
1692                 get_task_struct(task);
1693         rcu_read_unlock();
1694
1695         if (!task)
1696                 return ERR_PTR(-ESRCH);
1697
1698         /*
1699          * Can't attach events to a dying task.
1700          */
1701         err = -ESRCH;
1702         if (task->flags & PF_EXITING)
1703                 goto errout;
1704
1705         /* Reuse ptrace permission checks for now. */
1706         err = -EACCES;
1707         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1708                 goto errout;
1709
1710  retry:
1711         ctx = perf_lock_task_context(task, &flags);
1712         if (ctx) {
1713                 unclone_ctx(ctx);
1714                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1715         }
1716
1717         if (!ctx) {
1718                 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1719                 err = -ENOMEM;
1720                 if (!ctx)
1721                         goto errout;
1722                 __perf_event_init_context(ctx, task);
1723                 get_ctx(ctx);
1724                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1725                         /*
1726                          * We raced with some other task; use
1727                          * the context they set.
1728                          */
1729                         kfree(ctx);
1730                         goto retry;
1731                 }
1732                 get_task_struct(task);
1733         }
1734
1735         put_task_struct(task);
1736         return ctx;
1737
1738  errout:
1739         put_task_struct(task);
1740         return ERR_PTR(err);
1741 }
1742
1743 static void perf_event_free_filter(struct perf_event *event);
1744
1745 static void free_event_rcu(struct rcu_head *head)
1746 {
1747         struct perf_event *event;
1748
1749         event = container_of(head, struct perf_event, rcu_head);
1750         if (event->ns)
1751                 put_pid_ns(event->ns);
1752         perf_event_free_filter(event);
1753         kfree(event);
1754 }
1755
1756 static void perf_pending_sync(struct perf_event *event);
1757
1758 static void free_event(struct perf_event *event)
1759 {
1760         perf_pending_sync(event);
1761
1762         if (!event->parent) {
1763                 atomic_dec(&nr_events);
1764                 if (event->attr.mmap)
1765                         atomic_dec(&nr_mmap_events);
1766                 if (event->attr.comm)
1767                         atomic_dec(&nr_comm_events);
1768                 if (event->attr.task)
1769                         atomic_dec(&nr_task_events);
1770         }
1771
1772         if (event->output) {
1773                 fput(event->output->filp);
1774                 event->output = NULL;
1775         }
1776
1777         if (event->destroy)
1778                 event->destroy(event);
1779
1780         put_ctx(event->ctx);
1781         call_rcu(&event->rcu_head, free_event_rcu);
1782 }
1783
1784 int perf_event_release_kernel(struct perf_event *event)
1785 {
1786         struct perf_event_context *ctx = event->ctx;
1787
1788         WARN_ON_ONCE(ctx->parent_ctx);
1789         mutex_lock(&ctx->mutex);
1790         perf_event_remove_from_context(event);
1791         mutex_unlock(&ctx->mutex);
1792
1793         mutex_lock(&event->owner->perf_event_mutex);
1794         list_del_init(&event->owner_entry);
1795         mutex_unlock(&event->owner->perf_event_mutex);
1796         put_task_struct(event->owner);
1797
1798         free_event(event);
1799
1800         return 0;
1801 }
1802 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1803
1804 /*
1805  * Called when the last reference to the file is gone.
1806  */
1807 static int perf_release(struct inode *inode, struct file *file)
1808 {
1809         struct perf_event *event = file->private_data;
1810
1811         file->private_data = NULL;
1812
1813         return perf_event_release_kernel(event);
1814 }
1815
1816 static int perf_event_read_size(struct perf_event *event)
1817 {
1818         int entry = sizeof(u64); /* value */
1819         int size = 0;
1820         int nr = 1;
1821
1822         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1823                 size += sizeof(u64);
1824
1825         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1826                 size += sizeof(u64);
1827
1828         if (event->attr.read_format & PERF_FORMAT_ID)
1829                 entry += sizeof(u64);
1830
1831         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1832                 nr += event->group_leader->nr_siblings;
1833                 size += sizeof(u64);
1834         }
1835
1836         size += entry * nr;
1837
1838         return size;
1839 }
1840
1841 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1842 {
1843         struct perf_event *child;
1844         u64 total = 0;
1845
1846         *enabled = 0;
1847         *running = 0;
1848
1849         mutex_lock(&event->child_mutex);
1850         total += perf_event_read(event);
1851         *enabled += event->total_time_enabled +
1852                         atomic64_read(&event->child_total_time_enabled);
1853         *running += event->total_time_running +
1854                         atomic64_read(&event->child_total_time_running);
1855
1856         list_for_each_entry(child, &event->child_list, child_list) {
1857                 total += perf_event_read(child);
1858                 *enabled += child->total_time_enabled;
1859                 *running += child->total_time_running;
1860         }
1861         mutex_unlock(&event->child_mutex);
1862
1863         return total;
1864 }
1865 EXPORT_SYMBOL_GPL(perf_event_read_value);
1866
1867 static int perf_event_read_group(struct perf_event *event,
1868                                    u64 read_format, char __user *buf)
1869 {
1870         struct perf_event *leader = event->group_leader, *sub;
1871         int n = 0, size = 0, ret = -EFAULT;
1872         struct perf_event_context *ctx = leader->ctx;
1873         u64 values[5];
1874         u64 count, enabled, running;
1875
1876         mutex_lock(&ctx->mutex);
1877         count = perf_event_read_value(leader, &enabled, &running);
1878
1879         values[n++] = 1 + leader->nr_siblings;
1880         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1881                 values[n++] = enabled;
1882         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1883                 values[n++] = running;
1884         values[n++] = count;
1885         if (read_format & PERF_FORMAT_ID)
1886                 values[n++] = primary_event_id(leader);
1887
1888         size = n * sizeof(u64);
1889
1890         if (copy_to_user(buf, values, size))
1891                 goto unlock;
1892
1893         ret = size;
1894
1895         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1896                 n = 0;
1897
1898                 values[n++] = perf_event_read_value(sub, &enabled, &running);
1899                 if (read_format & PERF_FORMAT_ID)
1900                         values[n++] = primary_event_id(sub);
1901
1902                 size = n * sizeof(u64);
1903
1904                 if (copy_to_user(buf + ret, values, size)) {
1905                         ret = -EFAULT;
1906                         goto unlock;
1907                 }
1908
1909                 ret += size;
1910         }
1911 unlock:
1912         mutex_unlock(&ctx->mutex);
1913
1914         return ret;
1915 }
1916
1917 static int perf_event_read_one(struct perf_event *event,
1918                                  u64 read_format, char __user *buf)
1919 {
1920         u64 enabled, running;
1921         u64 values[4];
1922         int n = 0;
1923
1924         values[n++] = perf_event_read_value(event, &enabled, &running);
1925         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1926                 values[n++] = enabled;
1927         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1928                 values[n++] = running;
1929         if (read_format & PERF_FORMAT_ID)
1930                 values[n++] = primary_event_id(event);
1931
1932         if (copy_to_user(buf, values, n * sizeof(u64)))
1933                 return -EFAULT;
1934
1935         return n * sizeof(u64);
1936 }
1937
1938 /*
1939  * Read the performance event - simple non blocking version for now
1940  */
1941 static ssize_t
1942 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1943 {
1944         u64 read_format = event->attr.read_format;
1945         int ret;
1946
1947         /*
1948          * Return end-of-file for a read on a event that is in
1949          * error state (i.e. because it was pinned but it couldn't be
1950          * scheduled on to the CPU at some point).
1951          */
1952         if (event->state == PERF_EVENT_STATE_ERROR)
1953                 return 0;
1954
1955         if (count < perf_event_read_size(event))
1956                 return -ENOSPC;
1957
1958         WARN_ON_ONCE(event->ctx->parent_ctx);
1959         if (read_format & PERF_FORMAT_GROUP)
1960                 ret = perf_event_read_group(event, read_format, buf);
1961         else
1962                 ret = perf_event_read_one(event, read_format, buf);
1963
1964         return ret;
1965 }
1966
1967 static ssize_t
1968 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1969 {
1970         struct perf_event *event = file->private_data;
1971
1972         return perf_read_hw(event, buf, count);
1973 }
1974
1975 static unsigned int perf_poll(struct file *file, poll_table *wait)
1976 {
1977         struct perf_event *event = file->private_data;
1978         struct perf_mmap_data *data;
1979         unsigned int events = POLL_HUP;
1980
1981         rcu_read_lock();
1982         data = rcu_dereference(event->data);
1983         if (data)
1984                 events = atomic_xchg(&data->poll, 0);
1985         rcu_read_unlock();
1986
1987         poll_wait(file, &event->waitq, wait);
1988
1989         return events;
1990 }
1991
1992 static void perf_event_reset(struct perf_event *event)
1993 {
1994         (void)perf_event_read(event);
1995         atomic64_set(&event->count, 0);
1996         perf_event_update_userpage(event);
1997 }
1998
1999 /*
2000  * Holding the top-level event's child_mutex means that any
2001  * descendant process that has inherited this event will block
2002  * in sync_child_event if it goes to exit, thus satisfying the
2003  * task existence requirements of perf_event_enable/disable.
2004  */
2005 static void perf_event_for_each_child(struct perf_event *event,
2006                                         void (*func)(struct perf_event *))
2007 {
2008         struct perf_event *child;
2009
2010         WARN_ON_ONCE(event->ctx->parent_ctx);
2011         mutex_lock(&event->child_mutex);
2012         func(event);
2013         list_for_each_entry(child, &event->child_list, child_list)
2014                 func(child);
2015         mutex_unlock(&event->child_mutex);
2016 }
2017
2018 static void perf_event_for_each(struct perf_event *event,
2019                                   void (*func)(struct perf_event *))
2020 {
2021         struct perf_event_context *ctx = event->ctx;
2022         struct perf_event *sibling;
2023
2024         WARN_ON_ONCE(ctx->parent_ctx);
2025         mutex_lock(&ctx->mutex);
2026         event = event->group_leader;
2027
2028         perf_event_for_each_child(event, func);
2029         func(event);
2030         list_for_each_entry(sibling, &event->sibling_list, group_entry)
2031                 perf_event_for_each_child(event, func);
2032         mutex_unlock(&ctx->mutex);
2033 }
2034
2035 static int perf_event_period(struct perf_event *event, u64 __user *arg)
2036 {
2037         struct perf_event_context *ctx = event->ctx;
2038         unsigned long size;
2039         int ret = 0;
2040         u64 value;
2041
2042         if (!event->attr.sample_period)
2043                 return -EINVAL;
2044
2045         size = copy_from_user(&value, arg, sizeof(value));
2046         if (size != sizeof(value))
2047                 return -EFAULT;
2048
2049         if (!value)
2050                 return -EINVAL;
2051
2052         raw_spin_lock_irq(&ctx->lock);
2053         if (event->attr.freq) {
2054                 if (value > sysctl_perf_event_sample_rate) {
2055                         ret = -EINVAL;
2056                         goto unlock;
2057                 }
2058
2059                 event->attr.sample_freq = value;
2060         } else {
2061                 event->attr.sample_period = value;
2062                 event->hw.sample_period = value;
2063         }
2064 unlock:
2065         raw_spin_unlock_irq(&ctx->lock);
2066
2067         return ret;
2068 }
2069
2070 static int perf_event_set_output(struct perf_event *event, int output_fd);
2071 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2072
2073 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2074 {
2075         struct perf_event *event = file->private_data;
2076         void (*func)(struct perf_event *);
2077         u32 flags = arg;
2078
2079         switch (cmd) {
2080         case PERF_EVENT_IOC_ENABLE:
2081                 func = perf_event_enable;
2082                 break;
2083         case PERF_EVENT_IOC_DISABLE:
2084                 func = perf_event_disable;
2085                 break;
2086         case PERF_EVENT_IOC_RESET:
2087                 func = perf_event_reset;
2088                 break;
2089
2090         case PERF_EVENT_IOC_REFRESH:
2091                 return perf_event_refresh(event, arg);
2092
2093         case PERF_EVENT_IOC_PERIOD:
2094                 return perf_event_period(event, (u64 __user *)arg);
2095
2096         case PERF_EVENT_IOC_SET_OUTPUT:
2097                 return perf_event_set_output(event, arg);
2098
2099         case PERF_EVENT_IOC_SET_FILTER:
2100                 return perf_event_set_filter(event, (void __user *)arg);
2101
2102         default:
2103                 return -ENOTTY;
2104         }
2105
2106         if (flags & PERF_IOC_FLAG_GROUP)
2107                 perf_event_for_each(event, func);
2108         else
2109                 perf_event_for_each_child(event, func);
2110
2111         return 0;
2112 }
2113
2114 int perf_event_task_enable(void)
2115 {
2116         struct perf_event *event;
2117
2118         mutex_lock(&current->perf_event_mutex);
2119         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2120                 perf_event_for_each_child(event, perf_event_enable);
2121         mutex_unlock(&current->perf_event_mutex);
2122
2123         return 0;
2124 }
2125
2126 int perf_event_task_disable(void)
2127 {
2128         struct perf_event *event;
2129
2130         mutex_lock(&current->perf_event_mutex);
2131         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2132                 perf_event_for_each_child(event, perf_event_disable);
2133         mutex_unlock(&current->perf_event_mutex);
2134
2135         return 0;
2136 }
2137
2138 #ifndef PERF_EVENT_INDEX_OFFSET
2139 # define PERF_EVENT_INDEX_OFFSET 0
2140 #endif
2141
2142 static int perf_event_index(struct perf_event *event)
2143 {
2144         if (event->state != PERF_EVENT_STATE_ACTIVE)
2145                 return 0;
2146
2147         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2148 }
2149
2150 /*
2151  * Callers need to ensure there can be no nesting of this function, otherwise
2152  * the seqlock logic goes bad. We can not serialize this because the arch
2153  * code calls this from NMI context.
2154  */
2155 void perf_event_update_userpage(struct perf_event *event)
2156 {
2157         struct perf_event_mmap_page *userpg;
2158         struct perf_mmap_data *data;
2159
2160         rcu_read_lock();
2161         data = rcu_dereference(event->data);
2162         if (!data)
2163                 goto unlock;
2164
2165         userpg = data->user_page;
2166
2167         /*
2168          * Disable preemption so as to not let the corresponding user-space
2169          * spin too long if we get preempted.
2170          */
2171         preempt_disable();
2172         ++userpg->lock;
2173         barrier();
2174         userpg->index = perf_event_index(event);
2175         userpg->offset = atomic64_read(&event->count);
2176         if (event->state == PERF_EVENT_STATE_ACTIVE)
2177                 userpg->offset -= atomic64_read(&event->hw.prev_count);
2178
2179         userpg->time_enabled = event->total_time_enabled +
2180                         atomic64_read(&event->child_total_time_enabled);
2181
2182         userpg->time_running = event->total_time_running +
2183                         atomic64_read(&event->child_total_time_running);
2184
2185         barrier();
2186         ++userpg->lock;
2187         preempt_enable();
2188 unlock:
2189         rcu_read_unlock();
2190 }
2191
2192 static unsigned long perf_data_size(struct perf_mmap_data *data)
2193 {
2194         return data->nr_pages << (PAGE_SHIFT + data->data_order);
2195 }
2196
2197 #ifndef CONFIG_PERF_USE_VMALLOC
2198
/*
 * Back perf_mmap() with regular GFP_KERNEL order-0 pages.
 */
2202
2203 static struct page *
2204 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2205 {
2206         if (pgoff > data->nr_pages)
2207                 return NULL;
2208
2209         if (pgoff == 0)
2210                 return virt_to_page(data->user_page);
2211
2212         return virt_to_page(data->data_pages[pgoff - 1]);
2213 }
2214
2215 static struct perf_mmap_data *
2216 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2217 {
2218         struct perf_mmap_data *data;
2219         unsigned long size;
2220         int i;
2221
2222         WARN_ON(atomic_read(&event->mmap_count));
2223
2224         size = sizeof(struct perf_mmap_data);
2225         size += nr_pages * sizeof(void *);
2226
2227         data = kzalloc(size, GFP_KERNEL);
2228         if (!data)
2229                 goto fail;
2230
2231         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2232         if (!data->user_page)
2233                 goto fail_user_page;
2234
2235         for (i = 0; i < nr_pages; i++) {
2236                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2237                 if (!data->data_pages[i])
2238                         goto fail_data_pages;
2239         }
2240
2241         data->data_order = 0;
2242         data->nr_pages = nr_pages;
2243
2244         return data;
2245
2246 fail_data_pages:
2247         for (i--; i >= 0; i--)
2248                 free_page((unsigned long)data->data_pages[i]);
2249
2250         free_page((unsigned long)data->user_page);
2251
2252 fail_user_page:
2253         kfree(data);
2254
2255 fail:
2256         return NULL;
2257 }
2258
2259 static void perf_mmap_free_page(unsigned long addr)
2260 {
2261         struct page *page = virt_to_page((void *)addr);
2262
2263         page->mapping = NULL;
2264         __free_page(page);
2265 }
2266
2267 static void perf_mmap_data_free(struct perf_mmap_data *data)
2268 {
2269         int i;
2270
2271         perf_mmap_free_page((unsigned long)data->user_page);
2272         for (i = 0; i < data->nr_pages; i++)
2273                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2274         kfree(data);
2275 }
2276
2277 #else
2278
2279 /*
2280  * Back perf_mmap() with vmalloc memory.
2281  *
2282  * Required for architectures that have d-cache aliasing issues.
2283  */
2284
2285 static struct page *
2286 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2287 {
2288         if (pgoff > (1UL << data->data_order))
2289                 return NULL;
2290
2291         return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2292 }
2293
2294 static void perf_mmap_unmark_page(void *addr)
2295 {
2296         struct page *page = vmalloc_to_page(addr);
2297
2298         page->mapping = NULL;
2299 }
2300
2301 static void perf_mmap_data_free_work(struct work_struct *work)
2302 {
2303         struct perf_mmap_data *data;
2304         void *base;
2305         int i, nr;
2306
2307         data = container_of(work, struct perf_mmap_data, work);
2308         nr = 1 << data->data_order;
2309
2310         base = data->user_page;
2311         for (i = 0; i < nr + 1; i++)
2312                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2313
2314         vfree(base);
2315         kfree(data);
2316 }
2317
2318 static void perf_mmap_data_free(struct perf_mmap_data *data)
2319 {
2320         schedule_work(&data->work);
2321 }
2322
2323 static struct perf_mmap_data *
2324 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2325 {
2326         struct perf_mmap_data *data;
2327         unsigned long size;
2328         void *all_buf;
2329
2330         WARN_ON(atomic_read(&event->mmap_count));
2331
2332         size = sizeof(struct perf_mmap_data);
2333         size += sizeof(void *);
2334
2335         data = kzalloc(size, GFP_KERNEL);
2336         if (!data)
2337                 goto fail;
2338
2339         INIT_WORK(&data->work, perf_mmap_data_free_work);
2340
2341         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2342         if (!all_buf)
2343                 goto fail_all_buf;
2344
2345         data->user_page = all_buf;
2346         data->data_pages[0] = all_buf + PAGE_SIZE;
2347         data->data_order = ilog2(nr_pages);
2348         data->nr_pages = 1;
2349
2350         return data;
2351
2352 fail_all_buf:
2353         kfree(data);
2354
2355 fail:
2356         return NULL;
2357 }
2358
2359 #endif
2360
2361 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2362 {
2363         struct perf_event *event = vma->vm_file->private_data;
2364         struct perf_mmap_data *data;
2365         int ret = VM_FAULT_SIGBUS;
2366
2367         if (vmf->flags & FAULT_FLAG_MKWRITE) {
2368                 if (vmf->pgoff == 0)
2369                         ret = 0;
2370                 return ret;
2371         }
2372
2373         rcu_read_lock();
2374         data = rcu_dereference(event->data);
2375         if (!data)
2376                 goto unlock;
2377
2378         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2379                 goto unlock;
2380
2381         vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2382         if (!vmf->page)
2383                 goto unlock;
2384
2385         get_page(vmf->page);
2386         vmf->page->mapping = vma->vm_file->f_mapping;
2387         vmf->page->index   = vmf->pgoff;
2388
2389         ret = 0;
2390 unlock:
2391         rcu_read_unlock();
2392
2393         return ret;
2394 }
2395
2396 static void
2397 perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2398 {
2399         long max_size = perf_data_size(data);
2400
2401         atomic_set(&data->lock, -1);
2402
2403         if (event->attr.watermark) {
2404                 data->watermark = min_t(long, max_size,
2405                                         event->attr.wakeup_watermark);
2406         }
2407
2408         if (!data->watermark)
2409                 data->watermark = max_size / 2;
2410
2411
2412         rcu_assign_pointer(event->data, data);
2413 }
2414
2415 static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2416 {
2417         struct perf_mmap_data *data;
2418
2419         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2420         perf_mmap_data_free(data);
2421 }
2422
2423 static void perf_mmap_data_release(struct perf_event *event)
2424 {
2425         struct perf_mmap_data *data = event->data;
2426
2427         WARN_ON(atomic_read(&event->mmap_count));
2428
2429         rcu_assign_pointer(event->data, NULL);
2430         call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2431 }
2432
2433 static void perf_mmap_open(struct vm_area_struct *vma)
2434 {
2435         struct perf_event *event = vma->vm_file->private_data;
2436
2437         atomic_inc(&event->mmap_count);
2438 }
2439
2440 static void perf_mmap_close(struct vm_area_struct *vma)
2441 {
2442         struct perf_event *event = vma->vm_file->private_data;
2443
2444         WARN_ON_ONCE(event->ctx->parent_ctx);
2445         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2446                 unsigned long size = perf_data_size(event->data);
2447                 struct user_struct *user = current_user();
2448
2449                 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2450                 vma->vm_mm->locked_vm -= event->data->nr_locked;
2451                 perf_mmap_data_release(event);
2452                 mutex_unlock(&event->mmap_mutex);
2453         }
2454 }
2455
2456 static const struct vm_operations_struct perf_mmap_vmops = {
2457         .open           = perf_mmap_open,
2458         .close          = perf_mmap_close,
2459         .fault          = perf_mmap_fault,
2460         .page_mkwrite   = perf_mmap_fault,
2461 };
2462
2463 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2464 {
2465         struct perf_event *event = file->private_data;
2466         unsigned long user_locked, user_lock_limit;
2467         struct user_struct *user = current_user();
2468         unsigned long locked, lock_limit;
2469         struct perf_mmap_data *data;
2470         unsigned long vma_size;
2471         unsigned long nr_pages;
2472         long user_extra, extra;
2473         int ret = 0;
2474
2475         if (!(vma->vm_flags & VM_SHARED))
2476                 return -EINVAL;
2477
2478         vma_size = vma->vm_end - vma->vm_start;
2479         nr_pages = (vma_size / PAGE_SIZE) - 1;
2480
2481         /*
2482          * If we have data pages ensure they're a power-of-two number, so we
2483          * can do bitmasks instead of modulo.
2484          */
2485         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2486                 return -EINVAL;
2487
2488         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2489                 return -EINVAL;
2490
2491         if (vma->vm_pgoff != 0)
2492                 return -EINVAL;
2493
2494         WARN_ON_ONCE(event->ctx->parent_ctx);
2495         mutex_lock(&event->mmap_mutex);
2496         if (event->output) {
2497                 ret = -EINVAL;
2498                 goto unlock;
2499         }
2500
2501         if (atomic_inc_not_zero(&event->mmap_count)) {
2502                 if (nr_pages != event->data->nr_pages)
2503                         ret = -EINVAL;
2504                 goto unlock;
2505         }
2506
2507         user_extra = nr_pages + 1;
2508         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2509
2510         /*
2511          * Increase the limit linearly with more CPUs:
2512          */
2513         user_lock_limit *= num_online_cpus();
2514
2515         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2516
2517         extra = 0;
2518         if (user_locked > user_lock_limit)
2519                 extra = user_locked - user_lock_limit;
2520
2521         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2522         lock_limit >>= PAGE_SHIFT;
2523         locked = vma->vm_mm->locked_vm + extra;
2524
2525         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2526                 !capable(CAP_IPC_LOCK)) {
2527                 ret = -EPERM;
2528                 goto unlock;
2529         }
2530
2531         WARN_ON(event->data);
2532
2533         data = perf_mmap_data_alloc(event, nr_pages);
2534         ret = -ENOMEM;
2535         if (!data)
2536                 goto unlock;
2537
2538         ret = 0;
2539         perf_mmap_data_init(event, data);
2540
2541         atomic_set(&event->mmap_count, 1);
2542         atomic_long_add(user_extra, &user->locked_vm);
2543         vma->vm_mm->locked_vm += extra;
2544         event->data->nr_locked = extra;
2545         if (vma->vm_flags & VM_WRITE)
2546                 event->data->writable = 1;
2547
2548 unlock:
2549         mutex_unlock(&event->mmap_mutex);
2550
2551         vma->vm_flags |= VM_RESERVED;
2552         vma->vm_ops = &perf_mmap_vmops;
2553
2554         return ret;
2555 }
2556
2557 static int perf_fasync(int fd, struct file *filp, int on)
2558 {
2559         struct inode *inode = filp->f_path.dentry->d_inode;
2560         struct perf_event *event = filp->private_data;
2561         int retval;
2562
2563         mutex_lock(&inode->i_mutex);
2564         retval = fasync_helper(fd, filp, on, &event->fasync);
2565         mutex_unlock(&inode->i_mutex);
2566
2567         if (retval < 0)
2568                 return retval;
2569
2570         return 0;
2571 }
2572
2573 static const struct file_operations perf_fops = {
2574         .release                = perf_release,
2575         .read                   = perf_read,
2576         .poll                   = perf_poll,
2577         .unlocked_ioctl         = perf_ioctl,
2578         .compat_ioctl           = perf_ioctl,
2579         .mmap                   = perf_mmap,
2580         .fasync                 = perf_fasync,
2581 };
2582
2583 /*
2584  * Perf event wakeup
2585  *
2586  * If there's data, ensure we set the poll() state and publish everything
2587  * to user-space before waking everybody up.
2588  */
2589
2590 void perf_event_wakeup(struct perf_event *event)
2591 {
2592         wake_up_all(&event->waitq);
2593
2594         if (event->pending_kill) {
2595                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2596                 event->pending_kill = 0;
2597         }
2598 }
2599
/*
 * Pending wakeups
 *
 * Handle the case where we need to wake up from NMI (or rq->lock) context.
 *
 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
 * singly linked list and use cmpxchg() to add entries locklessly.
 */
2608
2609 static void perf_pending_event(struct perf_pending_entry *entry)
2610 {
2611         struct perf_event *event = container_of(entry,
2612                         struct perf_event, pending);
2613
2614         if (event->pending_disable) {
2615                 event->pending_disable = 0;
2616                 __perf_event_disable(event);
2617         }
2618
2619         if (event->pending_wakeup) {
2620                 event->pending_wakeup = 0;
2621                 perf_event_wakeup(event);
2622         }
2623 }
2624
2625 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2626
2627 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2628         PENDING_TAIL,
2629 };
2630
2631 static void perf_pending_queue(struct perf_pending_entry *entry,
2632                                void (*func)(struct perf_pending_entry *))
2633 {
2634         struct perf_pending_entry **head;
2635
2636         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2637                 return;
2638
2639         entry->func = func;
2640
2641         head = &get_cpu_var(perf_pending_head);
2642
2643         do {
2644                 entry->next = *head;
2645         } while (cmpxchg(head, entry->next, entry) != entry->next);
2646
2647         set_perf_event_pending();
2648
2649         put_cpu_var(perf_pending_head);
2650 }
2651
2652 static int __perf_pending_run(void)
2653 {
2654         struct perf_pending_entry *list;
2655         int nr = 0;
2656
2657         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2658         while (list != PENDING_TAIL) {
2659                 void (*func)(struct perf_pending_entry *);
2660                 struct perf_pending_entry *entry = list;
2661
2662                 list = list->next;
2663
2664                 func = entry->func;
2665                 entry->next = NULL;
2666                 /*
2667                  * Ensure we observe the unqueue before we issue the wakeup,
2668                  * so that we won't be waiting forever.
2669                  * -- see perf_not_pending().
2670                  */
2671                 smp_wmb();
2672
2673                 func(entry);
2674                 nr++;
2675         }
2676
2677         return nr;
2678 }
2679
2680 static inline int perf_not_pending(struct perf_event *event)
2681 {
2682         /*
2683          * If we flush on whatever cpu we run, there is a chance we don't
2684          * need to wait.
2685          */
2686         get_cpu();
2687         __perf_pending_run();
2688         put_cpu();
2689
2690         /*
2691          * Ensure we see the proper queue state before going to sleep
2692          * so that we do not miss the wakeup. -- see perf_pending_handle()
2693          */
2694         smp_rmb();
2695         return event->pending.next == NULL;
2696 }
2697
2698 static void perf_pending_sync(struct perf_event *event)
2699 {
2700         wait_event(event->waitq, perf_not_pending(event));
2701 }
2702
/* Run this CPU's pending list; the processed count is not reported. */
void perf_event_do_pending(void)
{
	(void)__perf_pending_run();
}
2707
2708 /*
2709  * Callchain support -- arch specific
2710  */
2711
2712 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2713 {
2714         return NULL;
2715 }
2716
2717 /*
2718  * Output
2719  */
2720 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2721                               unsigned long offset, unsigned long head)
2722 {
2723         unsigned long mask;
2724
2725         if (!data->writable)
2726                 return true;
2727
2728         mask = perf_data_size(data) - 1;
2729
2730         offset = (offset - tail) & mask;
2731         head   = (head   - tail) & mask;
2732
2733         if ((int)(head - offset) < 0)
2734                 return false;
2735
2736         return true;
2737 }
2738
2739 static void perf_output_wakeup(struct perf_output_handle *handle)
2740 {
2741         atomic_set(&handle->data->poll, POLL_IN);
2742
2743         if (handle->nmi) {
2744                 handle->event->pending_wakeup = 1;
2745                 perf_pending_queue(&handle->event->pending,
2746                                    perf_pending_event);
2747         } else
2748                 perf_event_wakeup(handle->event);
2749 }
2750
2751 /*
2752  * Curious locking construct.
2753  *
2754  * We need to ensure a later event_id doesn't publish a head when a former
2755  * event_id isn't done writing. However since we need to deal with NMIs we
2756  * cannot fully serialize things.
2757  *
2758  * What we do is serialize between CPUs so we only have to deal with NMI
2759  * nesting on a single CPU.
2760  *
2761  * We only publish the head (and generate a wakeup) when the outer-most
2762  * event_id completes.
2763  */
2764 static void perf_output_lock(struct perf_output_handle *handle)
2765 {
2766         struct perf_mmap_data *data = handle->data;
2767         int cur, cpu = get_cpu();
2768
2769         handle->locked = 0;
2770
2771         for (;;) {
2772                 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2773                 if (cur == -1) {
2774                         handle->locked = 1;
2775                         break;
2776                 }
2777                 if (cur == cpu)
2778                         break;
2779
2780                 cpu_relax();
2781         }
2782 }
2783
2784 static void perf_output_unlock(struct perf_output_handle *handle)
2785 {
2786         struct perf_mmap_data *data = handle->data;
2787         unsigned long head;
2788         int cpu;
2789
2790         data->done_head = data->head;
2791
2792         if (!handle->locked)
2793                 goto out;
2794
2795 again:
2796         /*
2797          * The xchg implies a full barrier that ensures all writes are done
2798          * before we publish the new head, matched by a rmb() in userspace when
2799          * reading this position.
2800          */
2801         while ((head = atomic_long_xchg(&data->done_head, 0)))
2802                 data->user_page->data_head = head;
2803
2804         /*
2805          * NMI can happen here, which means we can miss a done_head update.
2806          */
2807
2808         cpu = atomic_xchg(&data->lock, -1);
2809         WARN_ON_ONCE(cpu != smp_processor_id());
2810
2811         /*
2812          * Therefore we have to validate we did not indeed do so.
2813          */
2814         if (unlikely(atomic_long_read(&data->done_head))) {
2815                 /*
2816                  * Since we had it locked, we can lock it again.
2817                  */
2818                 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2819                         cpu_relax();
2820
2821                 goto again;
2822         }
2823
2824         if (atomic_xchg(&data->wakeup, 0))
2825                 perf_output_wakeup(handle);
2826 out:
2827         put_cpu();
2828 }
2829
2830 void perf_output_copy(struct perf_output_handle *handle,
2831                       const void *buf, unsigned int len)
2832 {
2833         unsigned int pages_mask;
2834         unsigned long offset;
2835         unsigned int size;
2836         void **pages;
2837
2838         offset          = handle->offset;
2839         pages_mask      = handle->data->nr_pages - 1;
2840         pages           = handle->data->data_pages;
2841
2842         do {
2843                 unsigned long page_offset;
2844                 unsigned long page_size;
2845                 int nr;
2846
2847                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
2848                 page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
2849                 page_offset = offset & (page_size - 1);
2850                 size        = min_t(unsigned int, page_size - page_offset, len);
2851
2852                 memcpy(pages[nr] + page_offset, buf, size);
2853
2854                 len         -= size;
2855                 buf         += size;
2856                 offset      += size;
2857         } while (len);
2858
2859         handle->offset = offset;
2860
2861         /*
2862          * Check we didn't copy past our reservation window, taking the
2863          * possible unsigned int wrap into account.
2864          */
2865         WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2866 }
2867
2868 int perf_output_begin(struct perf_output_handle *handle,
2869                       struct perf_event *event, unsigned int size,
2870                       int nmi, int sample)
2871 {
2872         struct perf_event *output_event;
2873         struct perf_mmap_data *data;
2874         unsigned long tail, offset, head;
2875         int have_lost;
2876         struct {
2877                 struct perf_event_header header;
2878                 u64                      id;
2879                 u64                      lost;
2880         } lost_event;
2881
2882         rcu_read_lock();
2883         /*
2884          * For inherited events we send all the output towards the parent.
2885          */
2886         if (event->parent)
2887                 event = event->parent;
2888
2889         output_event = rcu_dereference(event->output);
2890         if (output_event)
2891                 event = output_event;
2892
2893         data = rcu_dereference(event->data);
2894         if (!data)
2895                 goto out;
2896
2897         handle->data    = data;
2898         handle->event   = event;
2899         handle->nmi     = nmi;
2900         handle->sample  = sample;
2901
2902         if (!data->nr_pages)
2903                 goto fail;
2904
2905         have_lost = atomic_read(&data->lost);
2906         if (have_lost)
2907                 size += sizeof(lost_event);
2908
2909         perf_output_lock(handle);
2910
2911         do {
2912                 /*
2913                  * Userspace could choose to issue a mb() before updating the
2914                  * tail pointer. So that all reads will be completed before the
2915                  * write is issued.
2916                  */
2917                 tail = ACCESS_ONCE(data->user_page->data_tail);
2918                 smp_rmb();
2919                 offset = head = atomic_long_read(&data->head);
2920                 head += size;
2921                 if (unlikely(!perf_output_space(data, tail, offset, head)))
2922                         goto fail;
2923         } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2924
2925         handle->offset  = offset;
2926         handle->head    = head;
2927
2928         if (head - tail > data->watermark)
2929                 atomic_set(&data->wakeup, 1);
2930
2931         if (have_lost) {
2932                 lost_event.header.type = PERF_RECORD_LOST;
2933                 lost_event.header.misc = 0;
2934                 lost_event.header.size = sizeof(lost_event);
2935                 lost_event.id          = event->id;
2936                 lost_event.lost        = atomic_xchg(&data->lost, 0);
2937
2938                 perf_output_put(handle, lost_event);
2939         }
2940
2941         return 0;
2942
2943 fail:
2944         atomic_inc(&data->lost);
2945         perf_output_unlock(handle);
2946 out:
2947         rcu_read_unlock();
2948
2949         return -ENOSPC;
2950 }
2951
2952 void perf_output_end(struct perf_output_handle *handle)
2953 {
2954         struct perf_event *event = handle->event;
2955         struct perf_mmap_data *data = handle->data;
2956
2957         int wakeup_events = event->attr.wakeup_events;
2958
2959         if (handle->sample && wakeup_events) {
2960                 int events = atomic_inc_return(&data->events);
2961                 if (events >= wakeup_events) {
2962                         atomic_sub(wakeup_events, &data->events);
2963                         atomic_set(&data->wakeup, 1);
2964                 }
2965         }
2966
2967         perf_output_unlock(handle);
2968         rcu_read_unlock();
2969 }
2970
2971 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2972 {
2973         /*
2974          * only top level events have the pid namespace they were created in
2975          */
2976         if (event->parent)
2977                 event = event->parent;
2978
2979         return task_tgid_nr_ns(p, event->ns);
2980 }
2981
2982 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2983 {
2984         /*
2985          * only top level events have the pid namespace they were created in
2986          */
2987         if (event->parent)
2988                 event = event->parent;
2989
2990         return task_pid_nr_ns(p, event->ns);
2991 }
2992
2993 static void perf_output_read_one(struct perf_output_handle *handle,
2994                                  struct perf_event *event)
2995 {
2996         u64 read_format = event->attr.read_format;
2997         u64 values[4];
2998         int n = 0;
2999
3000         values[n++] = atomic64_read(&event->count);
3001         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3002                 values[n++] = event->total_time_enabled +
3003                         atomic64_read(&event->child_total_time_enabled);
3004         }
3005         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3006                 values[n++] = event->total_time_running +
3007                         atomic64_read(&event->child_total_time_running);
3008         }
3009         if (read_format & PERF_FORMAT_ID)
3010                 values[n++] = primary_event_id(event);
3011
3012         perf_output_copy(handle, values, n * sizeof(u64));
3013 }
3014
3015 /*
3016  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3017  */
3018 static void perf_output_read_group(struct perf_output_handle *handle,
3019                             struct perf_event *event)
3020 {
3021         struct perf_event *leader = event->group_leader, *sub;
3022         u64 read_format = event->attr.read_format;
3023         u64 values[5];
3024         int n = 0;
3025
3026         values[n++] = 1 + leader->nr_siblings;
3027
3028         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3029                 values[n++] = leader->total_time_enabled;
3030
3031         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3032                 values[n++] = leader->total_time_running;
3033
3034         if (leader != event)
3035                 leader->pmu->read(leader);
3036
3037         values[n++] = atomic64_read(&leader->count);
3038         if (read_format & PERF_FORMAT_ID)
3039                 values[n++] = primary_event_id(leader);
3040
3041         perf_output_copy(handle, values, n * sizeof(u64));
3042
3043         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3044                 n = 0;
3045
3046                 if (sub != event)
3047                         sub->pmu->read(sub);
3048
3049                 values[n++] = atomic64_read(&sub->count);
3050                 if (read_format & PERF_FORMAT_ID)
3051                         values[n++] = primary_event_id(sub);
3052
3053                 perf_output_copy(handle, values, n * sizeof(u64));
3054         }
3055 }
3056
3057 static void perf_output_read(struct perf_output_handle *handle,
3058                              struct perf_event *event)
3059 {
3060         if (event->attr.read_format & PERF_FORMAT_GROUP)
3061                 perf_output_read_group(handle, event);
3062         else
3063                 perf_output_read_one(handle, event);
3064 }
3065
3066 void perf_output_sample(struct perf_output_handle *handle,
3067                         struct perf_event_header *header,
3068                         struct perf_sample_data *data,
3069                         struct perf_event *event)
3070 {
3071         u64 sample_type = data->type;
3072
3073         perf_output_put(handle, *header);
3074
3075         if (sample_type & PERF_SAMPLE_IP)
3076                 perf_output_put(handle, data->ip);
3077
3078         if (sample_type & PERF_SAMPLE_TID)
3079                 perf_output_put(handle, data->tid_entry);
3080
3081         if (sample_type & PERF_SAMPLE_TIME)
3082                 perf_output_put(handle, data->time);
3083
3084         if (sample_type & PERF_SAMPLE_ADDR)
3085                 perf_output_put(handle, data->addr);
3086
3087         if (sample_type & PERF_SAMPLE_ID)
3088                 perf_output_put(handle, data->id);
3089
3090         if (sample_type & PERF_SAMPLE_STREAM_ID)
3091                 perf_output_put(handle, data->stream_id);
3092
3093         if (sample_type & PERF_SAMPLE_CPU)
3094                 perf_output_put(handle, data->cpu_entry);
3095
3096         if (sample_type & PERF_SAMPLE_PERIOD)
3097                 perf_output_put(handle, data->period);
3098
3099         if (sample_type & PERF_SAMPLE_READ)
3100                 perf_output_read(handle, event);
3101
3102         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3103                 if (data->callchain) {
3104                         int size = 1;
3105
3106                         if (data->callchain)
3107                                 size += data->callchain->nr;
3108
3109                         size *= sizeof(u64);
3110
3111                         perf_output_copy(handle, data->callchain, size);
3112                 } else {
3113                         u64 nr = 0;
3114                         perf_output_put(handle, nr);
3115                 }
3116         }
3117
3118         if (sample_type & PERF_SAMPLE_RAW) {
3119                 if (data->raw) {
3120                         perf_output_put(handle, data->raw->size);
3121                         perf_output_copy(handle, data->raw->data,
3122                                          data->raw->size);
3123                 } else {
3124                         struct {
3125                                 u32     size;
3126                                 u32     data;
3127                         } raw = {
3128                                 .size = sizeof(u32),
3129                                 .data = 0,
3130                         };
3131                         perf_output_put(handle, raw);
3132                 }
3133         }
3134 }
3135
3136 void perf_prepare_sample(struct perf_event_header *header,
3137                          struct perf_sample_data *data,
3138                          struct perf_event *event,
3139                          struct pt_regs *regs)
3140 {
3141         u64 sample_type = event->attr.sample_type;
3142
3143         data->type = sample_type;
3144
3145         header->type = PERF_RECORD_SAMPLE;
3146         header->size = sizeof(*header);
3147
3148         header->misc = 0;
3149         header->misc |= perf_misc_flags(regs);
3150
3151         if (sample_type & PERF_SAMPLE_IP) {
3152                 data->ip = perf_instruction_pointer(regs);
3153
3154                 header->size += sizeof(data->ip);
3155         }
3156
3157         if (sample_type & PERF_SAMPLE_TID) {
3158                 /* namespace issues */
3159                 data->tid_entry.pid = perf_event_pid(event, current);
3160                 data->tid_entry.tid = perf_event_tid(event, current);
3161
3162                 header->size += sizeof(data->tid_entry);
3163         }
3164
3165         if (sample_type & PERF_SAMPLE_TIME) {
3166                 data->time = perf_clock();
3167
3168                 header->size += sizeof(data->time);
3169         }
3170
3171         if (sample_type & PERF_SAMPLE_ADDR)
3172                 header->size += sizeof(data->addr);
3173
3174         if (sample_type & PERF_SAMPLE_ID) {
3175                 data->id = primary_event_id(event);
3176
3177                 header->size += sizeof(data->id);
3178         }
3179
3180         if (sample_type & PERF_SAMPLE_STREAM_ID) {
3181                 data->stream_id = event->id;
3182
3183                 header->size += sizeof(data->stream_id);
3184         }
3185
3186         if (sample_type & PERF_SAMPLE_CPU) {
3187                 data->cpu_entry.cpu             = raw_smp_processor_id();
3188                 data->cpu_entry.reserved        = 0;
3189
3190                 header->size += sizeof(data->cpu_entry);
3191         }
3192
3193         if (sample_type & PERF_SAMPLE_PERIOD)
3194                 header->size += sizeof(data->period);
3195
3196         if (sample_type & PERF_SAMPLE_READ)
3197                 header->size += perf_event_read_size(event);
3198
3199         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3200                 int size = 1;
3201
3202                 data->callchain = perf_callchain(regs);
3203
3204                 if (data->callchain)
3205                         size += data->callchain->nr;
3206
3207                 header->size += size * sizeof(u64);
3208         }
3209
3210         if (sample_type & PERF_SAMPLE_RAW) {
3211                 int size = sizeof(u32);
3212
3213                 if (data->raw)
3214                         size += data->raw->size;
3215                 else
3216                         size += sizeof(u32);
3217
3218                 WARN_ON_ONCE(size & (sizeof(u64)-1));
3219                 header->size += size;
3220         }
3221 }
3222
3223 static void perf_event_output(struct perf_event *event, int nmi,
3224                                 struct perf_sample_data *data,
3225                                 struct pt_regs *regs)
3226 {
3227         struct perf_output_handle handle;
3228         struct perf_event_header header;
3229
3230         perf_prepare_sample(&header, data, event, regs);
3231
3232         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3233                 return;
3234
3235         perf_output_sample(&handle, &header, data, event);
3236
3237         perf_output_end(&handle);
3238 }
3239
3240 /*
3241  * read event_id
3242  */
3243
3244 struct perf_read_event {
3245         struct perf_event_header        header;
3246
3247         u32                             pid;
3248         u32                             tid;
3249 };
3250
3251 static void
3252 perf_event_read_event(struct perf_event *event,
3253                         struct task_struct *task)
3254 {
3255         struct perf_output_handle handle;
3256         struct perf_read_event read_event = {
3257                 .header = {
3258                         .type = PERF_RECORD_READ,
3259                         .misc = 0,
3260                         .size = sizeof(read_event) + perf_event_read_size(event),
3261                 },
3262                 .pid = perf_event_pid(event, task),
3263                 .tid = perf_event_tid(event, task),
3264         };
3265         int ret;
3266
3267         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3268         if (ret)
3269                 return;
3270
3271         perf_output_put(&handle, read_event);
3272         perf_output_read(&handle, event);
3273
3274         perf_output_end(&handle);
3275 }
3276
3277 /*
3278  * task tracking -- fork/exit
3279  *
3280  * enabled by: attr.comm | attr.mmap | attr.task
3281  */
3282
3283 struct perf_task_event {
3284         struct task_struct              *task;
3285         struct perf_event_context       *task_ctx;
3286
3287         struct {
3288                 struct perf_event_header        header;
3289
3290                 u32                             pid;
3291                 u32                             ppid;
3292                 u32                             tid;
3293                 u32                             ptid;
3294                 u64                             time;
3295         } event_id;
3296 };
3297
3298 static void perf_event_task_output(struct perf_event *event,
3299                                      struct perf_task_event *task_event)
3300 {
3301         struct perf_output_handle handle;
3302         int size;
3303         struct task_struct *task = task_event->task;
3304         int ret;
3305
3306         size  = task_event->event_id.header.size;
3307         ret = perf_output_begin(&handle, event, size, 0, 0);
3308
3309         if (ret)
3310                 return;
3311
3312         task_event->event_id.pid = perf_event_pid(event, task);
3313         task_event->event_id.ppid = perf_event_pid(event, current);
3314
3315         task_event->event_id.tid = perf_event_tid(event, task);
3316         task_event->event_id.ptid = perf_event_tid(event, current);
3317
3318         perf_output_put(&handle, task_event->event_id);
3319
3320         perf_output_end(&handle);
3321 }
3322
3323 static int perf_event_task_match(struct perf_event *event)
3324 {
3325         if (event->state < PERF_EVENT_STATE_INACTIVE)
3326                 return 0;
3327
3328         if (event->cpu != -1 && event->cpu != smp_processor_id())
3329                 return 0;
3330
3331         if (event->attr.comm || event->attr.mmap || event->attr.task)
3332                 return 1;
3333
3334         return 0;
3335 }
3336
3337 static void perf_event_task_ctx(struct perf_event_context *ctx,
3338                                   struct perf_task_event *task_event)
3339 {
3340         struct perf_event *event;
3341
3342         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3343                 if (perf_event_task_match(event))
3344                         perf_event_task_output(event, task_event);
3345         }
3346 }
3347
3348 static void perf_event_task_event(struct perf_task_event *task_event)
3349 {
3350         struct perf_cpu_context *cpuctx;
3351         struct perf_event_context *ctx = task_event->task_ctx;
3352
3353         rcu_read_lock();
3354         cpuctx = &get_cpu_var(perf_cpu_context);
3355         perf_event_task_ctx(&cpuctx->ctx, task_event);
3356         if (!ctx)
3357                 ctx = rcu_dereference(current->perf_event_ctxp);
3358         if (ctx)
3359                 perf_event_task_ctx(ctx, task_event);
3360         put_cpu_var(perf_cpu_context);
3361         rcu_read_unlock();
3362 }
3363
3364 static void perf_event_task(struct task_struct *task,
3365                               struct perf_event_context *task_ctx,
3366                               int new)
3367 {
3368         struct perf_task_event task_event;
3369
3370         if (!atomic_read(&nr_comm_events) &&
3371             !atomic_read(&nr_mmap_events) &&
3372             !atomic_read(&nr_task_events))
3373                 return;
3374
3375         task_event = (struct perf_task_event){
3376                 .task     = task,
3377                 .task_ctx = task_ctx,
3378                 .event_id    = {
3379                         .header = {
3380                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3381                                 .misc = 0,
3382                                 .size = sizeof(task_event.event_id),
3383                         },
3384                         /* .pid  */
3385                         /* .ppid */
3386                         /* .tid  */
3387                         /* .ptid */
3388                         .time = perf_clock(),
3389                 },
3390         };
3391
3392         perf_event_task_event(&task_event);
3393 }
3394
3395 void perf_event_fork(struct task_struct *task)
3396 {
3397         perf_event_task(task, NULL, 1);
3398 }
3399
3400 /*
3401  * comm tracking
3402  */
3403
3404 struct perf_comm_event {
3405         struct task_struct      *task;
3406         char                    *comm;
3407         int                     comm_size;
3408
3409         struct {
3410                 struct perf_event_header        header;
3411
3412                 u32                             pid;
3413                 u32                             tid;
3414         } event_id;
3415 };
3416
3417 static void perf_event_comm_output(struct perf_event *event,
3418                                      struct perf_comm_event *comm_event)
3419 {
3420         struct perf_output_handle handle;
3421         int size = comm_event->event_id.header.size;
3422         int ret = perf_output_begin(&handle, event, size, 0, 0);
3423
3424         if (ret)
3425                 return;
3426
3427         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3428         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3429
3430         perf_output_put(&handle, comm_event->event_id);
3431         perf_output_copy(&handle, comm_event->comm,
3432                                    comm_event->comm_size);
3433         perf_output_end(&handle);
3434 }
3435
3436 static int perf_event_comm_match(struct perf_event *event)
3437 {
3438         if (event->state < PERF_EVENT_STATE_INACTIVE)
3439                 return 0;
3440
3441         if (event->cpu != -1 && event->cpu != smp_processor_id())
3442                 return 0;
3443
3444         if (event->attr.comm)
3445                 return 1;
3446
3447         return 0;
3448 }
3449
3450 static void perf_event_comm_ctx(struct perf_event_context *ctx,
3451                                   struct perf_comm_event *comm_event)
3452 {
3453         struct perf_event *event;
3454
3455         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3456                 if (perf_event_comm_match(event))
3457                         perf_event_comm_output(event, comm_event);
3458         }
3459 }
3460
3461 static void perf_event_comm_event(struct perf_comm_event *comm_event)
3462 {
3463         struct perf_cpu_context *cpuctx;
3464         struct perf_event_context *ctx;
3465         unsigned int size;
3466         char comm[TASK_COMM_LEN];
3467
3468         memset(comm, 0, sizeof(comm));
3469         strlcpy(comm, comm_event->task->comm, sizeof(comm));
3470         size = ALIGN(strlen(comm)+1, sizeof(u64));
3471
3472         comm_event->comm = comm;
3473         comm_event->comm_size = size;
3474
3475         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3476
3477         rcu_read_lock();
3478         cpuctx = &get_cpu_var(perf_cpu_context);
3479         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3480         ctx = rcu_dereference(current->perf_event_ctxp);
3481         if (ctx)
3482                 perf_event_comm_ctx(ctx, comm_event);
3483         put_cpu_var(perf_cpu_context);
3484         rcu_read_unlock();
3485 }
3486
3487 void perf_event_comm(struct task_struct *task)
3488 {
3489         struct perf_comm_event comm_event;
3490
3491         if (task->perf_event_ctxp)
3492                 perf_event_enable_on_exec(task);
3493
3494         if (!atomic_read(&nr_comm_events))
3495                 return;
3496
3497         comm_event = (struct perf_comm_event){
3498                 .task   = task,
3499                 /* .comm      */
3500                 /* .comm_size */
3501                 .event_id  = {
3502                         .header = {
3503                                 .type = PERF_RECORD_COMM,
3504                                 .misc = 0,
3505                                 /* .size */
3506                         },
3507                         /* .pid */
3508                         /* .tid */
3509                 },
3510         };
3511
3512         perf_event_comm_event(&comm_event);
3513 }
3514
3515 /*
3516  * mmap tracking
3517  */
3518
3519 struct perf_mmap_event {
3520         struct vm_area_struct   *vma;
3521
3522         const char              *file_name;
3523         int                     file_size;
3524
3525         struct {
3526                 struct perf_event_header        header;
3527
3528                 u32                             pid;
3529                 u32                             tid;
3530                 u64                             start;
3531                 u64                             len;
3532                 u64                             pgoff;
3533         } event_id;
3534 };
3535
3536 static void perf_event_mmap_output(struct perf_event *event,
3537                                      struct perf_mmap_event *mmap_event)
3538 {
3539         struct perf_output_handle handle;
3540         int size = mmap_event->event_id.header.size;
3541         int ret = perf_output_begin(&handle, event, size, 0, 0);
3542
3543         if (ret)
3544                 return;
3545
3546         mmap_event->event_id.pid = perf_event_pid(event, current);
3547         mmap_event->event_id.tid = perf_event_tid(event, current);
3548
3549         perf_output_put(&handle, mmap_event->event_id);
3550         perf_output_copy(&handle, mmap_event->file_name,
3551                                    mmap_event->file_size);
3552         perf_output_end(&handle);
3553 }
3554
3555 static int perf_event_mmap_match(struct perf_event *event,
3556                                    struct perf_mmap_event *mmap_event)
3557 {
3558         if (event->state < PERF_EVENT_STATE_INACTIVE)
3559                 return 0;
3560
3561         if (event->cpu != -1 && event->cpu != smp_processor_id())
3562                 return 0;
3563
3564         if (event->attr.mmap)
3565                 return 1;
3566
3567         return 0;
3568 }
3569
3570 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3571                                   struct perf_mmap_event *mmap_event)
3572 {
3573         struct perf_event *event;
3574
3575         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3576                 if (perf_event_mmap_match(event, mmap_event))
3577                         perf_event_mmap_output(event, mmap_event);
3578         }
3579 }
3580
3581 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3582 {
3583         struct perf_cpu_context *cpuctx;
3584         struct perf_event_context *ctx;
3585         struct vm_area_struct *vma = mmap_event->vma;
3586         struct file *file = vma->vm_file;
3587         unsigned int size;
3588         char tmp[16];
3589         char *buf = NULL;
3590         const char *name;
3591
3592         memset(tmp, 0, sizeof(tmp));
3593
3594         if (file) {
3595                 /*
3596                  * d_path works from the end of the buffer backwards, so we
3597                  * need to add enough zero bytes after the string to handle
3598                  * the 64bit alignment we do later.
3599                  */
3600                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3601                 if (!buf) {
3602                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3603                         goto got_name;
3604                 }
3605                 name = d_path(&file->f_path, buf, PATH_MAX);
3606                 if (IS_ERR(name)) {
3607                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3608                         goto got_name;
3609                 }
3610         } else {
3611                 if (arch_vma_name(mmap_event->vma)) {
3612                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3613                                        sizeof(tmp));
3614                         goto got_name;
3615                 }
3616
3617                 if (!vma->vm_mm) {
3618                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3619                         goto got_name;
3620                 }
3621
3622                 name = strncpy(tmp, "//anon", sizeof(tmp));
3623                 goto got_name;
3624         }
3625
3626 got_name:
3627         size = ALIGN(strlen(name)+1, sizeof(u64));
3628
3629         mmap_event->file_name = name;
3630         mmap_event->file_size = size;
3631
3632         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3633
3634         rcu_read_lock();
3635         cpuctx = &get_cpu_var(perf_cpu_context);
3636         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3637         ctx = rcu_dereference(current->perf_event_ctxp);
3638         if (ctx)
3639                 perf_event_mmap_ctx(ctx, mmap_event);
3640         put_cpu_var(perf_cpu_context);
3641         rcu_read_unlock();
3642
3643         kfree(buf);
3644 }
3645
3646 void __perf_event_mmap(struct vm_area_struct *vma)
3647 {
3648         struct perf_mmap_event mmap_event;
3649
3650         if (!atomic_read(&nr_mmap_events))
3651                 return;
3652
3653         mmap_event = (struct perf_mmap_event){
3654                 .vma    = vma,
3655                 /* .file_name */
3656                 /* .file_size */
3657                 .event_id  = {
3658                         .header = {
3659                                 .type = PERF_RECORD_MMAP,
3660                                 .misc = 0,
3661                                 /* .size */
3662                         },
3663                         /* .pid */
3664                         /* .tid */
3665                         .start  = vma->vm_start,
3666                         .len    = vma->vm_end - vma->vm_start,
3667                         .pgoff  = vma->vm_pgoff,
3668                 },
3669         };
3670
3671         perf_event_mmap_event(&mmap_event);
3672 }
3673
3674 /*
3675  * IRQ throttle logging
3676  */
3677
3678 static void perf_log_throttle(struct perf_event *event, int enable)
3679 {
3680         struct perf_output_handle handle;
3681         int ret;
3682
3683         struct {
3684                 struct perf_event_header        header;
3685                 u64                             time;
3686                 u64                             id;
3687                 u64                             stream_id;
3688         } throttle_event = {
3689                 .header = {
3690                         .type = PERF_RECORD_THROTTLE,
3691                         .misc = 0,
3692                         .size = sizeof(throttle_event),
3693                 },
3694                 .time           = perf_clock(),
3695                 .id             = primary_event_id(event),
3696                 .stream_id      = event->id,
3697         };
3698
3699         if (enable)
3700                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3701
3702         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3703         if (ret)
3704                 return;
3705
3706         perf_output_put(&handle, throttle_event);
3707         perf_output_end(&handle);
3708 }
3709
3710 /*
3711  * Generic event overflow handling, sampling.
3712  */
3713
/*
 * Common overflow path for @event: account the interrupt, optionally apply
 * interrupt-rate throttling, re-tune the period of frequency based events,
 * honour event_limit and finally deliver the sample.
 *
 * @nmi:      passed on to the overflow handler / perf_event_output(); when
 *            set, disabling the event on event_limit expiry is deferred via
 *            perf_pending_queue() instead of calling perf_event_disable().
 * @throttle: non-zero to apply throttling; treated as zero when the event's
 *            pmu has no ->unthrottle callback.
 * @data:     sample data handed to the handler / output code.
 * @regs:     register state handed to the handler / output code.
 *
 * Returns 1 when the event is throttled or its event_limit just expired,
 * 0 otherwise.
 */
static int __perf_event_overflow(struct perf_event *event, int nmi,
                                   int throttle, struct perf_sample_data *data,
                                   struct pt_regs *regs)
{
        int events = atomic_read(&event->event_limit);
        struct hw_perf_event *hwc = &event->hw;
        int ret = 0;

        /* Throttling only makes sense if the pmu can be unthrottled again. */
        throttle = (throttle && event->pmu->unthrottle != NULL);

        if (!throttle) {
                hwc->interrupts++;
        } else {
                if (hwc->interrupts != MAX_INTERRUPTS) {
                        hwc->interrupts++;
                        /*
                         * Too many interrupts: pin the counter at
                         * MAX_INTERRUPTS and log a PERF_RECORD_THROTTLE.
                         */
                        if (HZ * hwc->interrupts >
                                        (u64)sysctl_perf_event_sample_rate) {
                                hwc->interrupts = MAX_INTERRUPTS;
                                perf_log_throttle(event, 0);
                                ret = 1;
                        }
                } else {
                        /*
                         * Keep re-disabling events even though on the previous
                         * pass we disabled it - just in case we raced with a
                         * sched-in and the event got enabled again:
                         */
                        ret = 1;
                }
        }

        if (event->attr.freq) {
                u64 now = perf_clock();
                s64 delta = now - hwc->freq_time_stamp;

                hwc->freq_time_stamp = now;

                /* Only adjust for intervals shorter than two ticks. */
                if (delta > 0 && delta < 2*TICK_NSEC)
                        perf_adjust_period(event, delta, hwc->last_period);
        }

        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
         */

        event->pending_kill = POLL_IN;
        /* events == 0 means no limit was set when we sampled it above. */
        if (events && atomic_dec_and_test(&event->event_limit)) {
                ret = 1;
                event->pending_kill = POLL_HUP;
                if (nmi) {
                        event->pending_disable = 1;
                        perf_pending_queue(&event->pending,
                                           perf_pending_event);
                } else
                        perf_event_disable(event);
        }

        /* A custom handler, when set, replaces the default output path. */
        if (event->overflow_handler)
                event->overflow_handler(event, nmi, data, regs);
        else
                perf_event_output(event, nmi, data, regs);

        return ret;
}
3779
/*
 * Overflow entry point with throttling requested; see
 * __perf_event_overflow() for the return value.
 */
int perf_event_overflow(struct perf_event *event, int nmi,
                          struct perf_sample_data *data,
                          struct pt_regs *regs)
{
        const int throttle = 1;

        return __perf_event_overflow(event, nmi, throttle, data, regs);
}
3786
3787 /*
3788  * Generic software event infrastructure
3789  */
3790
3791 /*
3792  * We directly increment event->count and keep a second value in
3793  * event->hw.period_left to count intervals. This period event
3794  * is kept in the range [-sample_period, 0] so that we can use the
3795  * sign as trigger.
3796  */
3797
3798 static u64 perf_swevent_set_period(struct perf_event *event)
3799 {
3800         struct hw_perf_event *hwc = &event->hw;
3801         u64 period = hwc->last_period;
3802         u64 nr, offset;
3803         s64 old, val;
3804
3805         hwc->last_period = hwc->sample_period;
3806
3807 again:
3808         old = val = atomic64_read(&hwc->period_left);
3809         if (val < 0)
3810                 return 0;
3811
3812         nr = div64_u64(period + val, period);
3813         offset = nr * period;
3814         val -= offset;
3815         if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3816                 goto again;
3817
3818         return nr;
3819 }
3820
3821 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3822                                     int nmi, struct perf_sample_data *data,
3823                                     struct pt_regs *regs)
3824 {
3825         struct hw_perf_event *hwc = &event->hw;
3826         int throttle = 0;
3827
3828         data->period = event->hw.last_period;
3829         if (!overflow)
3830                 overflow = perf_swevent_set_period(event);
3831
3832         if (hwc->interrupts == MAX_INTERRUPTS)
3833                 return;
3834
3835         for (; overflow; overflow--) {
3836                 if (__perf_event_overflow(event, nmi, throttle,
3837                                             data, regs)) {
3838                         /*
3839                          * We inhibit the overflow from happening when
3840                          * hwc->interrupts == MAX_INTERRUPTS.
3841                          */
3842                         break;
3843                 }
3844                 throttle = 1;
3845         }
3846 }
3847
/*
 * ->unthrottle callback of perf_ops_generic.  It is intentionally empty,
 * but it must exist: __perf_event_overflow() only throttles events whose
 * pmu provides an ->unthrottle callback.
 */
static void perf_swevent_unthrottle(struct perf_event *event)
{
        /*
         * Nothing to do, we already reset hwc->interrupts.
         */
}
3854
3855 static void perf_swevent_add(struct perf_event *event, u64 nr,
3856                                int nmi, struct perf_sample_data *data,
3857                                struct pt_regs *regs)
3858 {
3859         struct hw_perf_event *hwc = &event->hw;
3860
3861         atomic64_add(nr, &event->count);
3862
3863         if (!regs)
3864                 return;
3865
3866         if (!hwc->sample_period)
3867                 return;
3868
3869         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3870                 return perf_swevent_overflow(event, 1, nmi, data, regs);
3871
3872         if (atomic64_add_negative(nr, &hwc->period_left))
3873                 return;
3874
3875         perf_swevent_overflow(event, 0, nmi, data, regs);
3876 }
3877
3878 static int perf_swevent_is_counting(struct perf_event *event)
3879 {
3880         /*
3881          * The event is active, we're good!
3882          */
3883         if (event->state == PERF_EVENT_STATE_ACTIVE)
3884                 return 1;
3885
3886         /*
3887          * The event is off/error, not counting.
3888          */
3889         if (event->state != PERF_EVENT_STATE_INACTIVE)
3890                 return 0;
3891
3892         /*
3893          * The event is inactive, if the context is active
3894          * we're part of a group that didn't make it on the 'pmu',
3895          * not counting.
3896          */
3897         if (event->ctx->is_active)
3898                 return 0;
3899
3900         /*
3901          * We're inactive and the context is too, this means the
3902          * task is scheduled out, we're counting events that happen
3903          * to us, like migration events.
3904          */
3905         return 1;
3906 }
3907
3908 static int perf_tp_event_match(struct perf_event *event,
3909                                 struct perf_sample_data *data);
3910
3911 static int perf_exclude_event(struct perf_event *event,
3912                               struct pt_regs *regs)
3913 {
3914         if (regs) {
3915                 if (event->attr.exclude_user && user_mode(regs))
3916                         return 1;
3917
3918                 if (event->attr.exclude_kernel && !user_mode(regs))
3919                         return 1;
3920         }
3921
3922         return 0;
3923 }
3924
3925 static int perf_swevent_match(struct perf_event *event,
3926                                 enum perf_type_id type,
3927                                 u32 event_id,
3928                                 struct perf_sample_data *data,
3929                                 struct pt_regs *regs)
3930 {
3931         if (event->cpu != -1 && event->cpu != smp_processor_id())
3932                 return 0;
3933
3934         if (!perf_swevent_is_counting(event))
3935                 return 0;
3936
3937         if (event->attr.type != type)
3938                 return 0;
3939
3940         if (event->attr.config != event_id)
3941                 return 0;
3942
3943         if (perf_exclude_event(event, regs))
3944                 return 0;
3945
3946         if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3947             !perf_tp_event_match(event, data))
3948                 return 0;
3949
3950         return 1;
3951 }
3952
3953 static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3954                                      enum perf_type_id type,
3955                                      u32 event_id, u64 nr, int nmi,
3956                                      struct perf_sample_data *data,
3957                                      struct pt_regs *regs)
3958 {
3959         struct perf_event *event;
3960
3961         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3962                 if (perf_swevent_match(event, type, event_id, data, regs))
3963                         perf_swevent_add(event, nr, nmi, data, regs);
3964         }
3965 }
3966
3967 int perf_swevent_get_recursion_context(void)
3968 {
3969         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3970         int rctx;
3971
3972         if (in_nmi())
3973                 rctx = 3;
3974         else if (in_irq())
3975                 rctx = 2;
3976         else if (in_softirq())
3977                 rctx = 1;
3978         else
3979                 rctx = 0;
3980
3981         if (cpuctx->recursion[rctx]) {
3982                 put_cpu_var(perf_cpu_context);
3983                 return -1;
3984         }
3985
3986         cpuctx->recursion[rctx]++;
3987         barrier();
3988
3989         return rctx;
3990 }
3991 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3992
/*
 * Release the recursion slot @rctx obtained from
 * perf_swevent_get_recursion_context() and drop the per-cpu variable
 * that call left held.
 */
void perf_swevent_put_recursion_context(int rctx)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        /* Compiler barrier before the slot is marked free again. */
        barrier();
        cpuctx->recursion[rctx]--;
        put_cpu_var(perf_cpu_context);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4001
4002 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4003                                     u64 nr, int nmi,
4004                                     struct perf_sample_data *data,
4005                                     struct pt_regs *regs)
4006 {
4007         struct perf_cpu_context *cpuctx;
4008         struct perf_event_context *ctx;
4009
4010         cpuctx = &__get_cpu_var(perf_cpu_context);
4011         rcu_read_lock();
4012         perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4013                                  nr, nmi, data, regs);
4014         /*
4015          * doesn't really matter which of the child contexts the
4016          * events ends up in.
4017          */
4018         ctx = rcu_dereference(current->perf_event_ctxp);
4019         if (ctx)
4020                 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4021         rcu_read_unlock();
4022 }
4023
4024 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4025                             struct pt_regs *regs, u64 addr)
4026 {
4027         struct perf_sample_data data;
4028         int rctx;
4029
4030         rctx = perf_swevent_get_recursion_context();
4031         if (rctx < 0)
4032                 return;
4033
4034         perf_sample_data_init(&data, addr);
4035
4036         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4037
4038         perf_swevent_put_recursion_context(rctx);
4039 }
4040
/*
 * ->read callback of perf_ops_generic.  Nothing to do here:
 * perf_swevent_add() adds directly to event->count.
 */
static void perf_swevent_read(struct perf_event *event)
{
}
4044
4045 static int perf_swevent_enable(struct perf_event *event)
4046 {
4047         struct hw_perf_event *hwc = &event->hw;
4048
4049         if (hwc->sample_period) {
4050                 hwc->last_period = hwc->sample_period;
4051                 perf_swevent_set_period(event);
4052         }
4053         return 0;
4054 }
4055
/*
 * ->disable callback of perf_ops_generic; intentionally empty.
 */
static void perf_swevent_disable(struct perf_event *event)
{
}
4059
/*
 * pmu operations for the generic software events; handed out by
 * sw_perf_event_init() and tp_perf_event_init().
 */
static const struct pmu perf_ops_generic = {
        .enable         = perf_swevent_enable,
        .disable        = perf_swevent_disable,
        .read           = perf_swevent_read,
        .unthrottle     = perf_swevent_unthrottle,
};
4066
4067 /*
4068  * hrtimer based swevent callback
4069  */
4070
4071 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4072 {
4073         enum hrtimer_restart ret = HRTIMER_RESTART;
4074         struct perf_sample_data data;
4075         struct pt_regs *regs;
4076         struct perf_event *event;
4077         u64 period;
4078
4079         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4080         event->pmu->read(event);
4081
4082         perf_sample_data_init(&data, 0);
4083         data.period = event->hw.last_period;
4084         regs = get_irq_regs();
4085         /*
4086          * In case we exclude kernel IPs or are somehow not in interrupt
4087          * context, provide the next best thing, the user IP.
4088          */
4089         if ((event->attr.exclude_kernel || !regs) &&
4090                         !event->attr.exclude_user)
4091                 regs = task_pt_regs(current);
4092
4093         if (regs) {
4094                 if (!(event->attr.exclude_idle && current->pid == 0))
4095                         if (perf_event_overflow(event, 0, &data, regs))
4096                                 ret = HRTIMER_NORESTART;
4097         }
4098
4099         period = max_t(u64, 10000, event->hw.sample_period);
4100         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4101
4102         return ret;
4103 }
4104
4105 static void perf_swevent_start_hrtimer(struct perf_event *event)
4106 {
4107         struct hw_perf_event *hwc = &event->hw;
4108
4109         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4110         hwc->hrtimer.function = perf_swevent_hrtimer;
4111         if (hwc->sample_period) {
4112                 u64 period;
4113
4114                 if (hwc->remaining) {
4115                         if (hwc->remaining < 0)
4116                                 period = 10000;
4117                         else
4118                                 period = hwc->remaining;
4119                         hwc->remaining = 0;
4120                 } else {
4121                         period = max_t(u64, 10000, hwc->sample_period);
4122                 }
4123                 __hrtimer_start_range_ns(&hwc->hrtimer,
4124                                 ns_to_ktime(period), 0,
4125                                 HRTIMER_MODE_REL, 0);
4126         }
4127 }
4128
4129 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4130 {
4131         struct hw_perf_event *hwc = &event->hw;
4132
4133         if (hwc->sample_period) {
4134                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4135                 hwc->remaining = ktime_to_ns(remaining);
4136
4137                 hrtimer_cancel(&hwc->hrtimer);
4138         }
4139 }
4140
4141 /*
4142  * Software event: cpu wall time clock
4143  */
4144
4145 static void cpu_clock_perf_event_update(struct perf_event *event)
4146 {
4147         int cpu = raw_smp_processor_id();
4148         s64 prev;
4149         u64 now;
4150
4151         now = cpu_clock(cpu);
4152         prev = atomic64_xchg(&event->hw.prev_count, now);
4153         atomic64_add(now - prev, &event->count);
4154 }
4155
4156 static int cpu_clock_perf_event_enable(struct perf_event *event)
4157 {
4158         struct hw_perf_event *hwc = &event->hw;
4159         int cpu = raw_smp_processor_id();
4160
4161         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4162         perf_swevent_start_hrtimer(event);
4163
4164         return 0;
4165 }
4166
/*
 * ->disable callback of perf_ops_cpu_clock: stop the sampling hrtimer,
 * then account the time elapsed since the last update.
 */
static void cpu_clock_perf_event_disable(struct perf_event *event)
{
        perf_swevent_cancel_hrtimer(event);
        cpu_clock_perf_event_update(event);
}
4172
/*
 * ->read callback of perf_ops_cpu_clock: bring event->count up to date.
 */
static void cpu_clock_perf_event_read(struct perf_event *event)
{
        cpu_clock_perf_event_update(event);
}
4177
/*
 * pmu operations for the cpu clock software event.  There is no
 * ->unthrottle callback, so __perf_event_overflow() never throttles
 * events using these operations.
 */
static const struct pmu perf_ops_cpu_clock = {
        .enable         = cpu_clock_perf_event_enable,
        .disable        = cpu_clock_perf_event_disable,
        .read           = cpu_clock_perf_event_read,
};
4183
4184 /*
4185  * Software event: task time clock
4186  */
4187
4188 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4189 {
4190         u64 prev;
4191         s64 delta;
4192
4193         prev = atomic64_xchg(&event->hw.prev_count, now);
4194         delta = now - prev;
4195         atomic64_add(delta, &event->count);
4196 }
4197
4198 static int task_clock_perf_event_enable(struct perf_event *event)
4199 {
4200         struct hw_perf_event *hwc = &event->hw;
4201         u64 now;
4202
4203         now = event->ctx->time;
4204
4205         atomic64_set(&hwc->prev_count, now);
4206
4207         perf_swevent_start_hrtimer(event);
4208
4209         return 0;
4210 }
4211
/*
 * ->disable callback of perf_ops_task_clock: stop the sampling hrtimer
 * and account the time up to the current context time.
 */
static void task_clock_perf_event_disable(struct perf_event *event)
{
        perf_swevent_cancel_hrtimer(event);
        task_clock_perf_event_update(event, event->ctx->time);

}
4218
4219 static void task_clock_perf_event_read(struct perf_event *event)
4220 {
4221         u64 time;
4222
4223         if (!in_nmi()) {
4224                 update_context_time(event->ctx);
4225                 time = event->ctx->time;
4226         } else {
4227                 u64 now = perf_clock();
4228                 u64 delta = now - event->ctx->timestamp;
4229                 time = event->ctx->time + delta;
4230         }
4231
4232         task_clock_perf_event_update(event, time);
4233 }
4234
/*
 * pmu operations for the task clock software event; sw_perf_event_init()
 * uses them for PERF_COUNT_SW_TASK_CLOCK events whose context has a task.
 */
static const struct pmu perf_ops_task_clock = {
        .enable         = task_clock_perf_event_enable,
        .disable        = task_clock_perf_event_disable,
        .read           = task_clock_perf_event_read,
};
4240
4241 #ifdef CONFIG_EVENT_PROFILE
4242
4243 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4244                           int entry_size)
4245 {
4246         struct pt_regs *regs = get_irq_regs();
4247         struct perf_sample_data data;
4248         struct perf_raw_record raw = {
4249                 .size = entry_size,
4250                 .data = record,
4251         };
4252
4253         perf_sample_data_init(&data, addr);
4254         data.raw = &raw;
4255
4256         if (!regs)
4257                 regs = task_pt_regs(current);
4258
4259         /* Trace events already protected against recursion */
4260         do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4261                                 &data, regs);
4262 }
4263 EXPORT_SYMBOL_GPL(perf_tp_event);
4264
4265 static int perf_tp_event_match(struct perf_event *event,
4266                                 struct perf_sample_data *data)
4267 {
4268         void *record = data->raw->data;
4269
4270         if (likely(!event->filter) || filter_match_preds(event->filter, record))
4271                 return 1;
4272         return 0;
4273 }
4274
/*
 * ->destroy hook installed by tp_perf_event_init(): undoes the
 * ftrace_profile_enable() done there for this tracepoint id.
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
        ftrace_profile_disable(event->attr.config);
}
4279
4280 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4281 {
4282         /*
4283          * Raw tracepoint data is a severe data leak, only allow root to
4284          * have these.
4285          */
4286         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4287                         perf_paranoid_tracepoint_raw() &&
4288                         !capable(CAP_SYS_ADMIN))
4289                 return ERR_PTR(-EPERM);
4290
4291         if (ftrace_profile_enable(event->attr.config))
4292                 return NULL;
4293
4294         event->destroy = tp_perf_event_destroy;
4295
4296         return &perf_ops_generic;
4297 }
4298
4299 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4300 {
4301         char *filter_str;
4302         int ret;
4303
4304         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4305                 return -EINVAL;
4306
4307         filter_str = strndup_user(arg, PAGE_SIZE);
4308         if (IS_ERR(filter_str))
4309                 return PTR_ERR(filter_str);
4310
4311         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4312
4313         kfree(filter_str);
4314         return ret;
4315 }
4316
/*
 * Release the filter attached to @event; delegates to
 * ftrace_profile_free_filter().
 */
static void perf_event_free_filter(struct perf_event *event)
{
        ftrace_profile_free_filter(event);
}
4321
4322 #else
4323
/*
 * !CONFIG_EVENT_PROFILE stub: there are no filters, everything matches.
 */
static int perf_tp_event_match(struct perf_event *event,
                                struct perf_sample_data *data)
{
        return 1;
}
4329
/*
 * !CONFIG_EVENT_PROFILE stub: no pmu is available for tracepoint events;
 * perf_event_alloc() turns the NULL into -EINVAL.
 */
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
        return NULL;
}
4334
/*
 * !CONFIG_EVENT_PROFILE stub: filters are not supported, always -ENOENT.
 */
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
        return -ENOENT;
}
4339
/*
 * !CONFIG_EVENT_PROFILE stub: no filter can exist, nothing to free.
 */
static void perf_event_free_filter(struct perf_event *event)
{
}
4343
4344 #endif /* CONFIG_EVENT_PROFILE */
4345
4346 #ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
 * ->destroy hook installed by bp_perf_event_init(): releases the
 * breakpoint slot of @event via release_bp_slot().
 */
static void bp_perf_event_destroy(struct perf_event *event)
{
        release_bp_slot(event);
}
4351
4352 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4353 {
4354         int err;
4355
4356         err = register_perf_hw_breakpoint(bp);
4357         if (err)
4358                 return ERR_PTR(err);
4359
4360         bp->destroy = bp_perf_event_destroy;
4361
4362         return &perf_ops_bp;
4363 }
4364
4365 void perf_bp_event(struct perf_event *bp, void *data)
4366 {
4367         struct perf_sample_data sample;
4368         struct pt_regs *regs = data;
4369
4370         perf_sample_data_init(&sample, bp->attr.bp_addr);
4371
4372         if (!perf_exclude_event(bp, regs))
4373                 perf_swevent_add(bp, 1, 1, &sample, regs);
4374 }
4375 #else
/*
 * !CONFIG_HAVE_HW_BREAKPOINT stub: no pmu is available for breakpoint
 * events; perf_event_alloc() turns the NULL into -EINVAL.
 */
static const struct pmu *bp_perf_event_init(struct perf_event *bp)
{
        return NULL;
}
4380
/*
 * !CONFIG_HAVE_HW_BREAKPOINT stub: nothing to account.
 */
void perf_bp_event(struct perf_event *bp, void *regs)
{
}
4384 #endif
4385
/*
 * Per software event id counter: incremented by sw_perf_event_init() for
 * each non-inherited generic software event, decremented again by
 * sw_perf_event_destroy().
 */
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4387
4388 static void sw_perf_event_destroy(struct perf_event *event)
4389 {
4390         u64 event_id = event->attr.config;
4391
4392         WARN_ON(event->parent);
4393
4394         atomic_dec(&perf_swevent_enabled[event_id]);
4395 }
4396
4397 static const struct pmu *sw_perf_event_init(struct perf_event *event)
4398 {
4399         const struct pmu *pmu = NULL;
4400         u64 event_id = event->attr.config;
4401
4402         /*
4403          * Software events (currently) can't in general distinguish
4404          * between user, kernel and hypervisor events.
4405          * However, context switches and cpu migrations are considered
4406          * to be kernel events, and page faults are never hypervisor
4407          * events.
4408          */
4409         switch (event_id) {
4410         case PERF_COUNT_SW_CPU_CLOCK:
4411                 pmu = &perf_ops_cpu_clock;
4412
4413                 break;
4414         case PERF_COUNT_SW_TASK_CLOCK:
4415                 /*
4416                  * If the user instantiates this as a per-cpu event,
4417                  * use the cpu_clock event instead.
4418                  */
4419                 if (event->ctx->task)
4420                         pmu = &perf_ops_task_clock;
4421                 else
4422                         pmu = &perf_ops_cpu_clock;
4423
4424                 break;
4425         case PERF_COUNT_SW_PAGE_FAULTS:
4426         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4427         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4428         case PERF_COUNT_SW_CONTEXT_SWITCHES:
4429         case PERF_COUNT_SW_CPU_MIGRATIONS:
4430         case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4431         case PERF_COUNT_SW_EMULATION_FAULTS:
4432                 if (!event->parent) {
4433                         atomic_inc(&perf_swevent_enabled[event_id]);
4434                         event->destroy = sw_perf_event_destroy;
4435                 }
4436                 pmu = &perf_ops_generic;
4437                 break;
4438         }
4439
4440         return pmu;
4441 }
4442
/*
 * Allocate and initialize an event structure.
 *
 * @attr:             event attributes, copied by value into the new event
 * @cpu:              stored in event->cpu
 * @ctx:              context the event belongs to, stored in event->ctx
 * @group_leader:     leader of the event's group, or NULL to make the
 *                    event its own leader
 * @parent_event:     parent event, or NULL
 * @overflow_handler: overflow callback; when NULL the parent's handler
 *                    (if there is a parent) is used instead
 * @gfpflags:         allocation flags for kzalloc()
 *
 * Returns the new event, or an ERR_PTR(): -ENOMEM when the allocation
 * fails, -EINVAL when no pmu could be found for the event (including the
 * unsupported combination of attr->inherit with PERF_FORMAT_GROUP), or
 * the error reported by the type specific init function.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr,
                   int cpu,
                   struct perf_event_context *ctx,
                   struct perf_event *group_leader,
                   struct perf_event *parent_event,
                   perf_overflow_handler_t overflow_handler,
                   gfp_t gfpflags)
{
        const struct pmu *pmu;
        struct perf_event *event;
        struct hw_perf_event *hwc;
        long err;

        event = kzalloc(sizeof(*event), gfpflags);
        if (!event)
                return ERR_PTR(-ENOMEM);

        /*
         * Single events are their own group leaders, with an
         * empty sibling list:
         */
        if (!group_leader)
                group_leader = event;

        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->group_entry);
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        init_waitqueue_head(&event->waitq);

        mutex_init(&event->mmap_mutex);

        event->cpu              = cpu;
        event->attr             = *attr;
        event->group_leader     = group_leader;
        event->pmu              = NULL;
        event->ctx              = ctx;
        event->oncpu            = -1;

        event->parent           = parent_event;

        /* Takes a pid namespace reference; dropped on the error path below. */
        event->ns               = get_pid_ns(current->nsproxy->pid_ns);
        event->id               = atomic64_inc_return(&perf_event_id);

        event->state            = PERF_EVENT_STATE_INACTIVE;

        if (!overflow_handler && parent_event)
                overflow_handler = parent_event->overflow_handler;

        event->overflow_handler = overflow_handler;

        if (attr->disabled)
                event->state = PERF_EVENT_STATE_OFF;

        pmu = NULL;

        hwc = &event->hw;
        hwc->sample_period = attr->sample_period;
        /* Frequency based sampling starts out with a period of 1. */
        if (attr->freq && attr->sample_freq)
                hwc->sample_period = 1;
        hwc->last_period = hwc->sample_period;

        atomic64_set(&hwc->period_left, hwc->sample_period);

        /*
         * we currently do not support PERF_FORMAT_GROUP on inherited events
         */
        if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                goto done;

        /* Let the type specific code pick (and set up) the pmu. */
        switch (attr->type) {
        case PERF_TYPE_RAW:
        case PERF_TYPE_HARDWARE:
        case PERF_TYPE_HW_CACHE:
                pmu = hw_perf_event_init(event);
                break;

        case PERF_TYPE_SOFTWARE:
                pmu = sw_perf_event_init(event);
                break;

        case PERF_TYPE_TRACEPOINT:
                pmu = tp_perf_event_init(event);
                break;

        case PERF_TYPE_BREAKPOINT:
                pmu = bp_perf_event_init(event);
                break;


        default:
                break;
        }
done:
        /* A NULL pmu means "not supported", an ERR_PTR carries its own code. */
        err = 0;
        if (!pmu)
                err = -EINVAL;
        else if (IS_ERR(pmu))
                err = PTR_ERR(pmu);

        if (err) {
                if (event->ns)
                        put_pid_ns(event->ns);
                kfree(event);
                return ERR_PTR(err);
        }

        event->pmu = pmu;

        /* The global counters only track events without a parent. */
        if (!event->parent) {
                atomic_inc(&nr_events);
                if (event->attr.mmap)
                        atomic_inc(&nr_mmap_events);
                if (event->attr.comm)
                        atomic_inc(&nr_comm_events);
                if (event->attr.task)
                        atomic_inc(&nr_task_events);
        }

        return event;
}
4570
/*
 * Copy a perf_event_attr from user space into @attr and sanity check it.
 *
 * The size the caller claims (uattr->size) may be smaller or larger than
 * the kernel's structure: a smaller one is zero extended, a larger one is
 * accepted only when all bytes beyond sizeof(*attr) are zero.
 *
 * Returns 0 on success, -EFAULT when user memory cannot be accessed,
 * -E2BIG for an unacceptable size (sizeof(*attr) is then written back to
 * uattr->size), -EINVAL for an unknown type or unknown/reserved bits, or
 * the error returned by get_user().
 */
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr)
{
        u32 size;
        int ret;

        /* Writable, because the err_size path stores into uattr->size. */
        if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
                return -EFAULT;

        /*
         * zero the full structure, so that a short copy will be nice.
         */
        memset(attr, 0, sizeof(*attr));

        ret = get_user(size, &uattr->size);
        if (ret)
                return ret;

        if (size > PAGE_SIZE)   /* silly large */
                goto err_size;

        if (!size)              /* abi compat */
                size = PERF_ATTR_SIZE_VER0;

        if (size < PERF_ATTR_SIZE_VER0)
                goto err_size;

        /*
         * If we're handed a bigger struct than we know of,
         * ensure all the unknown bits are 0 - i.e. new
         * user-space does not rely on any kernel feature
         * extensions we dont know about yet.
         */
        if (size > sizeof(*attr)) {
                unsigned char __user *addr;
                unsigned char __user *end;
                unsigned char val;

                addr = (void __user *)uattr + sizeof(*attr);
                end  = (void __user *)uattr + size;

                for (; addr < end; addr++) {
                        ret = get_user(val, addr);
                        if (ret)
                                return ret;
                        if (val)
                                goto err_size;
                }
                /* Only the part we know about gets copied below. */
                size = sizeof(*attr);
        }

        ret = copy_from_user(attr, uattr, size);
        if (ret)
                return -EFAULT;

        /*
         * If the type exists, the corresponding creation will verify
         * the attr->config.
         */
        if (attr->type >= PERF_TYPE_MAX)
                return -EINVAL;

        if (attr->__reserved_1)
                return -EINVAL;

        /* Reject sample_type / read_format bits we do not know. */
        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
                return -EINVAL;

        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
                return -EINVAL;

out:
        return ret;

err_size:
        /* Best effort: tell user space which size we do support. */
        put_user(sizeof(*attr), &uattr->size);
        ret = -E2BIG;
        goto out;
}
4650
4651 static int perf_event_set_output(struct perf_event *event, int output_fd)
4652 {
4653         struct perf_event *output_event = NULL;
4654         struct file *output_file = NULL;
4655         struct perf_event *old_output;
4656         int fput_needed = 0;
4657         int ret = -EINVAL;
4658
4659         if (!output_fd)
4660                 goto set;
4661
4662         output_file = fget_light(output_fd, &fput_needed);
4663         if (!output_file)
4664                 return -EBADF;
4665
4666         if (output_file->f_op != &perf_fops)
4667                 goto out;
4668
4669         output_event = output_file->private_data;
4670
4671         /* Don't chain output fds */
4672         if (output_event->output)
4673                 goto out;
4674
4675         /* Don't set an output fd when we already have an output channel */
4676         if (event->data)
4677                 goto out;
4678
4679         atomic_long_inc(&output_file->f_count);
4680
4681 set:
4682         mutex_lock(&event->mmap_mutex);
4683         old_output = event->output;
4684         rcu_assign_pointer(event->output, output_event);
4685         mutex_unlock(&event->mmap_mutex);
4686
4687         if (old_output) {
4688                 /*
4689                  * we need to make sure no existing perf_output_*()
4690                  * is still referencing this event.
4691                  */
4692                 synchronize_rcu();
4693                 fput(old_output->filp);
4694         }
4695
4696         ret = 0;
4697 out:
4698         fput_light(output_file, fput_needed);
4699         return ret;
4700 }
4701
/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 * @flags:		PERF_FLAG_FD_NO_GROUP and/or PERF_FLAG_FD_OUTPUT; any
 *			other bit is rejected with -EINVAL
 *
 * Returns the file descriptor of the new event on success, or a
 * negative errno on failure.
 */
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *event, *group_leader;
	struct perf_event_attr attr;
	struct perf_event_context *ctx;
	struct file *event_file = NULL;
	struct file *group_file = NULL;
	int event_fd;
	int fput_needed = 0;
	int err;

	/* for future expandability... */
	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
		return -EINVAL;

	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;

	/*
	 * Events that do not exclude the kernel need CAP_SYS_ADMIN
	 * whenever perf_paranoid_kernel() is in effect.
	 */
	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	/* In frequency mode the requested rate is capped by the sysctl */
	if (attr.freq) {
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
			return -EINVAL;
	}

	/* Reserve the fd number now; it is only installed at the very end */
	event_fd = get_unused_fd_flags(O_RDWR);
	if (event_fd < 0)
		return event_fd;

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto err_fd;
	}

	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	group_leader = NULL;
	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
		/* Every failed check below bails out with -EINVAL */
		err = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (attr.exclusive || attr.pinned)
			goto err_put_context;
	}

	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
				     NULL, NULL, GFP_KERNEL);
	/* err is set unconditionally; it is only used if IS_ERR() holds */
	err = PTR_ERR(event);
	if (IS_ERR(event))
		goto err_put_context;

	event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
	if (IS_ERR(event_file)) {
		err = PTR_ERR(event_file);
		goto err_free_put_context;
	}

	/* With PERF_FLAG_FD_OUTPUT, group_fd is also used as the output target */
	if (flags & PERF_FLAG_FD_OUTPUT) {
		err = perf_event_set_output(event, group_fd);
		if (err)
			goto err_fput_free_put_context;
	}

	event->filp = event_file;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	/*
	 * Record the calling task as the owner (holding a task reference)
	 * and link the event into that task's perf_event_list.
	 */
	event->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

	/* Drop the temporary group fd reference, then install the new fd */
	fput_light(group_file, fput_needed);
	fd_install(event_fd, event_file);
	return event_fd;

	/*
	 * Error unwinding: each label releases what had been acquired by
	 * the corresponding failure point and falls through to the next.
	 */
err_fput_free_put_context:
	fput(event_file);
err_free_put_context:
	free_event(event);
err_put_context:
	fput_light(group_file, fput_needed);
	put_ctx(ctx);
err_fd:
	put_unused_fd(event_fd);
	return err;
}
4832
4833 /**
4834  * perf_event_create_kernel_counter
4835  *
4836  * @attr: attributes of the counter to create
4837  * @cpu: cpu in which the counter is bound
4838  * @pid: task to profile
4839  */
4840 struct perf_event *
4841 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4842                                  pid_t pid,
4843                                  perf_overflow_handler_t overflow_handler)
4844 {
4845         struct perf_event *event;
4846         struct perf_event_context *ctx;
4847         int err;
4848
4849         /*
4850          * Get the target context (task or percpu):
4851          */
4852
4853         ctx = find_get_context(pid, cpu);
4854         if (IS_ERR(ctx)) {
4855                 err = PTR_ERR(ctx);
4856                 goto err_exit;
4857         }
4858
4859         event = perf_event_alloc(attr, cpu, ctx, NULL,
4860                                  NULL, overflow_handler, GFP_KERNEL);
4861         if (IS_ERR(event)) {
4862                 err = PTR_ERR(event);
4863                 goto err_put_context;
4864         }
4865
4866         event->filp = NULL;
4867         WARN_ON_ONCE(ctx->parent_ctx);
4868         mutex_lock(&ctx->mutex);
4869         perf_install_in_context(ctx, event, cpu);
4870         ++ctx->generation;
4871         mutex_unlock(&ctx->mutex);
4872
4873         event->owner = current;
4874         get_task_struct(current);
4875         mutex_lock(&current->perf_event_mutex);
4876         list_add_tail(&event->owner_entry, &current->perf_event_list);
4877         mutex_unlock(&current->perf_event_mutex);
4878
4879         return event;
4880
4881  err_put_context:
4882         put_ctx(ctx);
4883  err_exit:
4884         return ERR_PTR(err);
4885 }
4886 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4887
4888 /*
4889  * inherit a event from parent task to child task:
4890  */
4891 static struct perf_event *
4892 inherit_event(struct perf_event *parent_event,
4893               struct task_struct *parent,
4894               struct perf_event_context *parent_ctx,
4895               struct task_struct *child,
4896               struct perf_event *group_leader,
4897               struct perf_event_context *child_ctx)
4898 {
4899         struct perf_event *child_event;
4900
4901         /*
4902          * Instead of creating recursive hierarchies of events,
4903          * we link inherited events back to the original parent,
4904          * which has a filp for sure, which we use as the reference
4905          * count:
4906          */
4907         if (parent_event->parent)
4908                 parent_event = parent_event->parent;
4909
4910         child_event = perf_event_alloc(&parent_event->attr,
4911                                            parent_event->cpu, child_ctx,
4912                                            group_leader, parent_event,
4913                                            NULL, GFP_KERNEL);
4914         if (IS_ERR(child_event))
4915                 return child_event;
4916         get_ctx(child_ctx);
4917
4918         /*
4919          * Make the child state follow the state of the parent event,
4920          * not its attr.disabled bit.  We hold the parent's mutex,
4921          * so we won't race with perf_event_{en, dis}able_family.
4922          */
4923         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4924                 child_event->state = PERF_EVENT_STATE_INACTIVE;
4925         else
4926                 child_event->state = PERF_EVENT_STATE_OFF;
4927
4928         if (parent_event->attr.freq)
4929                 child_event->hw.sample_period = parent_event->hw.sample_period;
4930
4931         child_event->overflow_handler = parent_event->overflow_handler;
4932
4933         /*
4934          * Link it up in the child's context:
4935          */
4936         add_event_to_ctx(child_event, child_ctx);
4937
4938         /*
4939          * Get a reference to the parent filp - we will fput it
4940          * when the child event exits. This is safe to do because
4941          * we are in the parent and we know that the filp still
4942          * exists and has a nonzero count:
4943          */
4944         atomic_long_inc(&parent_event->filp->f_count);
4945
4946         /*
4947          * Link this into the parent event's child list
4948          */
4949         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4950         mutex_lock(&parent_event->child_mutex);
4951         list_add_tail(&child_event->child_list, &parent_event->child_list);
4952         mutex_unlock(&parent_event->child_mutex);
4953
4954         return child_event;
4955 }
4956
4957 static int inherit_group(struct perf_event *parent_event,
4958               struct task_struct *parent,
4959               struct perf_event_context *parent_ctx,
4960               struct task_struct *child,
4961               struct perf_event_context *child_ctx)
4962 {
4963         struct perf_event *leader;
4964         struct perf_event *sub;
4965         struct perf_event *child_ctr;
4966
4967         leader = inherit_event(parent_event, parent, parent_ctx,
4968                                  child, NULL, child_ctx);
4969         if (IS_ERR(leader))
4970                 return PTR_ERR(leader);
4971         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4972                 child_ctr = inherit_event(sub, parent, parent_ctx,
4973                                             child, leader, child_ctx);
4974                 if (IS_ERR(child_ctr))
4975                         return PTR_ERR(child_ctr);
4976         }
4977         return 0;
4978 }
4979
4980 static void sync_child_event(struct perf_event *child_event,
4981                                struct task_struct *child)
4982 {
4983         struct perf_event *parent_event = child_event->parent;
4984         u64 child_val;
4985
4986         if (child_event->attr.inherit_stat)
4987                 perf_event_read_event(child_event, child);
4988
4989         child_val = atomic64_read(&child_event->count);
4990
4991         /*
4992          * Add back the child's count to the parent's count:
4993          */
4994         atomic64_add(child_val, &parent_event->count);
4995         atomic64_add(child_event->total_time_enabled,
4996                      &parent_event->child_total_time_enabled);
4997         atomic64_add(child_event->total_time_running,
4998                      &parent_event->child_total_time_running);
4999
5000         /*
5001          * Remove this event from the parent's list
5002          */
5003         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5004         mutex_lock(&parent_event->child_mutex);
5005         list_del_init(&child_event->child_list);
5006         mutex_unlock(&parent_event->child_mutex);
5007
5008         /*
5009          * Release the parent event, if this was the last
5010          * reference to it.
5011          */
5012         fput(parent_event->filp);
5013 }
5014
5015 static void
5016 __perf_event_exit_task(struct perf_event *child_event,
5017                          struct perf_event_context *child_ctx,
5018                          struct task_struct *child)
5019 {
5020         struct perf_event *parent_event;
5021
5022         perf_event_remove_from_context(child_event);
5023
5024         parent_event = child_event->parent;
5025         /*
5026          * It can happen that parent exits first, and has events
5027          * that are still around due to the child reference. These
5028          * events need to be zapped - but otherwise linger.
5029          */
5030         if (parent_event) {
5031                 sync_child_event(child_event, child);
5032                 free_event(child_event);
5033         }
5034 }
5035
/*
 * When a child task exits, feed back event values to parent events.
 *
 * Detaches the task's perf event context, reports the task exit via
 * perf_event_task(), removes every event from the detached context and
 * finally drops the task's reference on that context.  A task without a
 * context only gets the perf_event_task() notification.
 */
void perf_event_exit_task(struct task_struct *child)
{
	struct perf_event *child_event, *tmp;
	struct perf_event_context *child_ctx;
	unsigned long flags;

	/* Common case: no context, so there is nothing to tear down */
	if (likely(!child->perf_event_ctxp)) {
		perf_event_task(child, NULL, 0);
		return;
	}

	local_irq_save(flags);
	/*
	 * We can't reschedule here because interrupts are disabled,
	 * and either child is current or it is a task that can't be
	 * scheduled, so we are now safe from rescheduling changing
	 * our context.
	 */
	child_ctx = child->perf_event_ctxp;
	__perf_event_task_sched_out(child_ctx);

	/*
	 * Take the context lock here so that if find_get_context is
	 * reading child->perf_event_ctxp, we wait until it has
	 * incremented the context's refcount before we do put_ctx below.
	 */
	raw_spin_lock(&child_ctx->lock);
	child->perf_event_ctxp = NULL;
	/*
	 * If this context is a clone; unclone it so it can't get
	 * swapped to another process while we're removing all
	 * the events from it.
	 */
	unclone_ctx(child_ctx);
	update_context_time(child_ctx);
	/* Also restores the interrupt state saved by local_irq_save() above */
	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	/*
	 * We can recurse on the same lock type through:
	 *
	 *   __perf_event_exit_task()
	 *     sync_child_event()
	 *       fput(parent_event->filp)
	 *         perf_release()
	 *           mutex_lock(&ctx->mutex)
	 *
	 * But since it's the parent context it won't be the same instance.
	 */
	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);

again:
	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	/*
	 * If the last event was a group event, it will have appended all
	 * its siblings to the list, but we obtained 'tmp' before that which
	 * will still point to the list head terminating the iteration.
	 */
	if (!list_empty(&child_ctx->group_list))
		goto again;

	mutex_unlock(&child_ctx->mutex);

	/* Drop the reference that child->perf_event_ctxp used to hold */
	put_ctx(child_ctx);
}
5113
5114 /*
5115  * free an unexposed, unused context as created by inheritance by
5116  * init_task below, used by fork() in case of fail.
5117  */
5118 void perf_event_free_task(struct task_struct *task)
5119 {
5120         struct perf_event_context *ctx = task->perf_event_ctxp;
5121         struct perf_event *event, *tmp;
5122
5123         if (!ctx)
5124                 return;
5125
5126         mutex_lock(&ctx->mutex);
5127 again:
5128         list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
5129                 struct perf_event *parent = event->parent;
5130
5131                 if (WARN_ON_ONCE(!parent))
5132                         continue;
5133
5134                 mutex_lock(&parent->child_mutex);
5135                 list_del_init(&event->child_list);
5136                 mutex_unlock(&parent->child_mutex);
5137
5138                 fput(parent->filp);
5139
5140                 list_del_event(event, ctx);
5141                 free_event(event);
5142         }
5143
5144         if (!list_empty(&ctx->group_list))
5145                 goto again;
5146
5147         mutex_unlock(&ctx->mutex);
5148
5149         put_ctx(ctx);
5150 }
5151
/*
 * Initialize the perf_event context in task_struct
 *
 * Runs in the parent (current) on behalf of the new task @child.  Every
 * event group in the parent's context that has attr.inherit set is
 * inherited into a context that is allocated for the child on first
 * use.
 *
 * Returns 0 on success, -ENOMEM if the child context cannot be
 * allocated, or the error reported by inherit_group().
 */
int perf_event_init_task(struct task_struct *child)
{
	struct perf_event_context *child_ctx = NULL, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	int ret = 0;

	child->perf_event_ctxp = NULL;

	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	/* Common case: the parent has no events, nothing to inherit */
	if (likely(!parent->perf_event_ctxp))
		return 0;

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent);

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(event, &parent_ctx->group_list, group_entry) {

		/* A skipped group means the child is not a full clone */
		if (!event->attr.inherit) {
			inherited_all = 0;
			continue;
		}

		if (!child->perf_event_ctxp) {
			/*
			 * This is executed from the parent task context, so
			 * inherit events that have been marked for cloning.
			 * First allocate and initialize a context for the
			 * child.
			 */

			child_ctx = kzalloc(sizeof(struct perf_event_context),
					    GFP_KERNEL);
			if (!child_ctx) {
				ret = -ENOMEM;
				break;
			}

			__perf_event_init_context(child_ctx, child);
			child->perf_event_ctxp = child_ctx;
			get_task_struct(child);
		}

		ret = inherit_group(event, parent, parent_ctx,
					     child, child_ctx);
		if (ret) {
			inherited_all = 0;
			break;
		}
	}

	if (child_ctx && inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 * Note that if the parent is a clone, it could get
		 * uncloned at any point, but that doesn't matter
		 * because the list of events and the generation
		 * count can't have changed since we took the mutex.
		 */
		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		/* The child holds a reference on the context it clones */
		get_ctx(child_ctx->parent_ctx);
	}

	mutex_unlock(&parent_ctx->mutex);

	/* Undo the pin taken by perf_pin_task_context() above */
	perf_unpin_context(parent_ctx);

	return ret;
}
5256
5257 static void __init perf_event_init_all_cpus(void)
5258 {
5259         int cpu;
5260         struct perf_cpu_context *cpuctx;
5261
5262         for_each_possible_cpu(cpu) {
5263                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5264                 __perf_event_init_context(&cpuctx->ctx, NULL);
5265         }
5266 }
5267
5268 static void __cpuinit perf_event_init_cpu(int cpu)
5269 {
5270         struct perf_cpu_context *cpuctx;
5271
5272         cpuctx = &per_cpu(perf_cpu_context, cpu);
5273
5274         spin_lock(&perf_resource_lock);
5275         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5276         spin_unlock(&perf_resource_lock);
5277
5278         hw_perf_event_setup(cpu);
5279 }
5280
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Runs on the CPU that is going away (dispatched through
 * smp_call_function_single() below): remove every event group from
 * that CPU's own context.  @info is unused.
 */
static void __perf_event_exit_cpu(void *info)
{
	struct perf_event_context *ctx = &__get_cpu_var(perf_cpu_context).ctx;
	struct perf_event *pos, *next;

	list_for_each_entry_safe(pos, next, &ctx->group_list, group_entry)
		__perf_event_remove_from_context(pos);
}

/*
 * Strip all events from @cpu's context.  The removal itself is executed
 * on @cpu and waited for, with the context mutex held throughout.
 */
static void perf_event_exit_cpu(int cpu)
{
	struct perf_event_context *ctx = &per_cpu(perf_cpu_context, cpu).ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
/* Without CPU hotplug there is nothing to tear down. */
static inline void perf_event_exit_cpu(int cpu) { }
#endif
5303
5304 static int __cpuinit
5305 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5306 {
5307         unsigned int cpu = (long)hcpu;
5308
5309         switch (action) {
5310
5311         case CPU_UP_PREPARE:
5312         case CPU_UP_PREPARE_FROZEN:
5313                 perf_event_init_cpu(cpu);
5314                 break;
5315
5316         case CPU_ONLINE:
5317         case CPU_ONLINE_FROZEN:
5318                 hw_perf_event_setup_online(cpu);
5319                 break;
5320
5321         case CPU_DOWN_PREPARE:
5322         case CPU_DOWN_PREPARE_FROZEN:
5323                 perf_event_exit_cpu(cpu);
5324                 break;
5325
5326         default:
5327                 break;
5328         }
5329
5330         return NOTIFY_OK;
5331 }
5332
/*
 * CPU notifier block that routes hotplug events to perf_cpu_notify().
 *
 * This has to have a higher priority than migration_notifier in sched.c.
 */
static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
	.priority		= 20,
};
5340
5341 void __init perf_event_init(void)
5342 {
5343         perf_event_init_all_cpus();
5344         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5345                         (void *)(long)smp_processor_id());
5346         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5347                         (void *)(long)smp_processor_id());
5348         register_cpu_notifier(&perf_cpu_nb);
5349 }
5350
5351 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5352 {
5353         return sprintf(buf, "%d\n", perf_reserved_percpu);
5354 }
5355
5356 static ssize_t
5357 perf_set_reserve_percpu(struct sysdev_class *class,
5358                         const char *buf,
5359                         size_t count)
5360 {
5361         struct perf_cpu_context *cpuctx;
5362         unsigned long val;
5363         int err, cpu, mpt;
5364
5365         err = strict_strtoul(buf, 10, &val);
5366         if (err)
5367                 return err;
5368         if (val > perf_max_events)
5369                 return -EINVAL;
5370
5371         spin_lock(&perf_resource_lock);
5372         perf_reserved_percpu = val;
5373         for_each_online_cpu(cpu) {
5374                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5375                 raw_spin_lock_irq(&cpuctx->ctx.lock);
5376                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5377                           perf_max_events - perf_reserved_percpu);
5378                 cpuctx->max_pertask = mpt;
5379                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5380         }
5381         spin_unlock(&perf_resource_lock);
5382
5383         return count;
5384 }
5385
5386 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5387 {
5388         return sprintf(buf, "%d\n", perf_overcommit);
5389 }
5390
5391 static ssize_t
5392 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5393 {
5394         unsigned long val;
5395         int err;
5396
5397         err = strict_strtoul(buf, 10, &val);
5398         if (err)
5399                 return err;
5400         if (val > 1)
5401                 return -EINVAL;
5402
5403         spin_lock(&perf_resource_lock);
5404         perf_overcommit = val;
5405         spin_unlock(&perf_resource_lock);
5406
5407         return count;
5408 }
5409
/*
 * "reserve_percpu" class attribute (mode 0644), read through
 * perf_show_reserve_percpu() and written through
 * perf_set_reserve_percpu().
 */
static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);
5416
/*
 * "overcommit" class attribute (mode 0644), read through
 * perf_show_overcommit() and written through perf_set_overcommit().
 */
static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);
5423
/* NULL-terminated list of the attributes that make up the perf group */
static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};
5429
/*
 * Attribute group named "perf_events" bundling the attributes above;
 * registered by perf_event_sysfs_init().
 */
static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_events",
};
5434
5435 static int __init perf_event_sysfs_init(void)
5436 {
5437         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5438                                   &perfclass_attr_group);
5439 }
5440 device_initcall(perf_event_sysfs_init);