kernel/perf_event.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/sysfs.h>
  19 #include <linux/dcache.h>
  20 #include <linux/percpu.h>
  21 #include <linux/ptrace.h>
  22 #include <linux/vmstat.h>
  23 #include <linux/hardirq.h>
  24 #include <linux/rculist.h>
  25 #include <linux/uaccess.h>
  26 #include <linux/syscalls.h>
  27 #include <linux/anon_inodes.h>
  28 #include <linux/kernel_stat.h>
  29 #include <linux/perf_event.h>
  30
  31 #include <asm/irq_regs.h>
  32
/*
 * Each CPU has a list of per CPU events:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

/*
 * Event-count limits.  perf_max_events and perf_reserved_percpu are the
 * inputs to the cpuctx->max_pertask recomputation done in
 * __perf_event_remove_from_context().
 */
int perf_max_events __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
/* NOTE(review): not referenced in the part of the file reviewed here. */
static int perf_overcommit __read_mostly = 1;

/*
 * Global event counters.
 * NOTE(review): presumably the number of live events overall and of events
 * asking for mmap/comm/task records -- confirm against their users.
 */
static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
  46
/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 *
 * The levels are cumulative: the perf_paranoid_*() helpers compare with
 * '>', so a given level also implies every restriction listed for the
 * lower levels.  With the default of 1, raw tracepoint access and cpu
 * events are restricted while kernel profiling is not.
 */
int sysctl_perf_event_paranoid __read_mostly = 1;
  55
  56 static inline bool perf_paranoid_tracepoint_raw(void)
  57 {
  58         return sysctl_perf_event_paranoid > -1;
  59 }
  60
  61 static inline bool perf_paranoid_cpu(void)
  62 {
  63         return sysctl_perf_event_paranoid > 0;
  64 }
  65
  66 static inline bool perf_paranoid_kernel(void)
  67 {
  68         return sysctl_perf_event_paranoid > 1;
  69 }
  70
int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;

/*
 * NOTE(review): presumably the generator for unique event ids -- confirm
 * at the place where events are allocated.
 */
static atomic64_t perf_event_id;

/*
 * Lock for (sysadmin-configurable) event reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);
  84
  85 /*
  86  * Architecture provided APIs - weak aliases:
  87  */
  88 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
  89 {
  90         return NULL;
  91 }
  92
  93 void __weak hw_perf_disable(void)               { barrier(); }
  94 void __weak hw_perf_enable(void)                { barrier(); }
  95
  96 void __weak hw_perf_event_setup(int cpu)        { barrier(); }
  97 void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
  98
  99 int __weak
 100 hw_perf_group_sched_in(struct perf_event *group_leader,
 101                struct perf_cpu_context *cpuctx,
 102                struct perf_event_context *ctx, int cpu)
 103 {
 104         return 0;
 105 }
 106
 107 void __weak perf_event_print_debug(void)        { }
 108
/*
 * Per-cpu nesting depth of perf_disable(): incremented by __perf_disable(),
 * decremented by __perf_enable(); perf_enable() only touches the hardware
 * again once the count is back at zero.
 */
static DEFINE_PER_CPU(int, perf_disable_count);
 110
 111 void __perf_disable(void)
 112 {
 113         __get_cpu_var(perf_disable_count)++;
 114 }
 115
 116 bool __perf_enable(void)
 117 {
 118         return !--__get_cpu_var(perf_disable_count);
 119 }
 120
 121 void perf_disable(void)
 122 {
 123         __perf_disable();
 124         hw_perf_disable();
 125 }
 126
 127 void perf_enable(void)
 128 {
 129         if (__perf_enable())
 130                 hw_perf_enable();
 131 }
 132
 133 static void get_ctx(struct perf_event_context *ctx)
 134 {
 135         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 136 }
 137
 138 static void free_ctx(struct rcu_head *head)
 139 {
 140         struct perf_event_context *ctx;
 141
 142         ctx = container_of(head, struct perf_event_context, rcu_head);
 143         kfree(ctx);
 144 }
 145
 146 static void put_ctx(struct perf_event_context *ctx)
 147 {
 148         if (atomic_dec_and_test(&ctx->refcount)) {
 149                 if (ctx->parent_ctx)
 150                         put_ctx(ctx->parent_ctx);
 151                 if (ctx->task)
 152                         put_task_struct(ctx->task);
 153                 call_rcu(&ctx->rcu_head, free_ctx);
 154         }
 155 }
 156
 157 static void unclone_ctx(struct perf_event_context *ctx)
 158 {
 159         if (ctx->parent_ctx) {
 160                 put_ctx(ctx->parent_ctx);
 161                 ctx->parent_ctx = NULL;
 162         }
 163 }
 164
 165 /*
 166  * If we inherit events we want to return the parent event id
 167  * to userspace.
 168  */
 169 static u64 primary_event_id(struct perf_event *event)
 170 {
 171         u64 id = event->id;
 172
 173         if (event->parent)
 174                 id = event->parent->id;
 175
 176         return id;
 177 }
 178
 179 /*
 180  * Get the perf_event_context for a task and lock it.
 181  * This has to cope with with the fact that until it is locked,
 182  * the context could get moved to another task.
 183  */
 184 static struct perf_event_context *
 185 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 186 {
 187         struct perf_event_context *ctx;
 188
 189         rcu_read_lock();
 190  retry:
 191         ctx = rcu_dereference(task->perf_event_ctxp);
 192         if (ctx) {
 193                 /*
 194                  * If this context is a clone of another, it might
 195                  * get swapped for another underneath us by
 196                  * perf_event_task_sched_out, though the
 197                  * rcu_read_lock() protects us from any context
 198                  * getting freed.  Lock the context and check if it
 199                  * got swapped before we could get the lock, and retry
 200                  * if so.  If we locked the right context, then it
 201                  * can't get swapped on us any more.
 202                  */
 203                 spin_lock_irqsave(&ctx->lock, *flags);
 204                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
 205                         spin_unlock_irqrestore(&ctx->lock, *flags);
 206                         goto retry;
 207                 }
 208
 209                 if (!atomic_inc_not_zero(&ctx->refcount)) {
 210                         spin_unlock_irqrestore(&ctx->lock, *flags);
 211                         ctx = NULL;
 212                 }
 213         }
 214         rcu_read_unlock();
 215         return ctx;
 216 }
 217
 218 /*
 219  * Get the context for a task and increment its pin_count so it
 220  * can't get swapped to another task.  This also increments its
 221  * reference count so that the context can't get freed.
 222  */
 223 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
 224 {
 225         struct perf_event_context *ctx;
 226         unsigned long flags;
 227
 228         ctx = perf_lock_task_context(task, &flags);
 229         if (ctx) {
 230                 ++ctx->pin_count;
 231                 spin_unlock_irqrestore(&ctx->lock, flags);
 232         }
 233         return ctx;
 234 }
 235
 236 static void perf_unpin_context(struct perf_event_context *ctx)
 237 {
 238         unsigned long flags;
 239
 240         spin_lock_irqsave(&ctx->lock, flags);
 241         --ctx->pin_count;
 242         spin_unlock_irqrestore(&ctx->lock, flags);
 243         put_ctx(ctx);
 244 }
 245
 246 /*
 247  * Add a event from the lists for its context.
 248  * Must be called with ctx->mutex and ctx->lock held.
 249  */
 250 static void
 251 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 252 {
 253         struct perf_event *group_leader = event->group_leader;
 254
 255         /*
 256          * Depending on whether it is a standalone or sibling event,
 257          * add it straight to the context's event list, or to the group
 258          * leader's sibling list:
 259          */
 260         if (group_leader == event)
 261                 list_add_tail(&event->group_entry, &ctx->group_list);
 262         else {
 263                 list_add_tail(&event->group_entry, &group_leader->sibling_list);
 264                 group_leader->nr_siblings++;
 265         }
 266
 267         list_add_rcu(&event->event_entry, &ctx->event_list);
 268         ctx->nr_events++;
 269         if (event->attr.inherit_stat)
 270                 ctx->nr_stat++;
 271 }
 272
 273 /*
 274  * Remove a event from the lists for its context.
 275  * Must be called with ctx->mutex and ctx->lock held.
 276  */
 277 static void
 278 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 279 {
 280         struct perf_event *sibling, *tmp;
 281
 282         if (list_empty(&event->group_entry))
 283                 return;
 284         ctx->nr_events--;
 285         if (event->attr.inherit_stat)
 286                 ctx->nr_stat--;
 287
 288         list_del_init(&event->group_entry);
 289         list_del_rcu(&event->event_entry);
 290
 291         if (event->group_leader != event)
 292                 event->group_leader->nr_siblings--;
 293
 294         /*
 295          * If this was a group event with sibling events then
 296          * upgrade the siblings to singleton events by adding them
 297          * to the context list directly:
 298          */
 299         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 300
 301                 list_move_tail(&sibling->group_entry, &ctx->group_list);
 302                 sibling->group_leader = sibling;
 303         }
 304 }
 305
 306 static void
 307 event_sched_out(struct perf_event *event,
 308                   struct perf_cpu_context *cpuctx,
 309                   struct perf_event_context *ctx)
 310 {
 311         if (event->state != PERF_EVENT_STATE_ACTIVE)
 312                 return;
 313
 314         event->state = PERF_EVENT_STATE_INACTIVE;
 315         if (event->pending_disable) {
 316                 event->pending_disable = 0;
 317                 event->state = PERF_EVENT_STATE_OFF;
 318         }
 319         event->tstamp_stopped = ctx->time;
 320         event->pmu->disable(event);
 321         event->oncpu = -1;
 322
 323         if (!is_software_event(event))
 324                 cpuctx->active_oncpu--;
 325         ctx->nr_active--;
 326         if (event->attr.exclusive || !cpuctx->active_oncpu)
 327                 cpuctx->exclusive = 0;
 328 }
 329
 330 static void
 331 group_sched_out(struct perf_event *group_event,
 332                 struct perf_cpu_context *cpuctx,
 333                 struct perf_event_context *ctx)
 334 {
 335         struct perf_event *event;
 336
 337         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
 338                 return;
 339
 340         event_sched_out(group_event, cpuctx, ctx);
 341
 342         /*
 343          * Schedule out siblings (if any):
 344          */
 345         list_for_each_entry(event, &group_event->sibling_list, group_entry)
 346                 event_sched_out(event, cpuctx, ctx);
 347
 348         if (group_event->attr.exclusive)
 349                 cpuctx->exclusive = 0;
 350 }
 351
 352 /*
 353  * Cross CPU call to remove a performance event
 354  *
 355  * We disable the event on the hardware level first. After that we
 356  * remove it from the context list.
 357  */
 358 static void __perf_event_remove_from_context(void *info)
 359 {
 360         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 361         struct perf_event *event = info;
 362         struct perf_event_context *ctx = event->ctx;
 363
 364         /*
 365          * If this is a task context, we need to check whether it is
 366          * the current task context of this cpu. If not it has been
 367          * scheduled out before the smp call arrived.
 368          */
 369         if (ctx->task && cpuctx->task_ctx != ctx)
 370                 return;
 371
 372         spin_lock(&ctx->lock);
 373         /*
 374          * Protect the list operation against NMI by disabling the
 375          * events on a global level.
 376          */
 377         perf_disable();
 378
 379         event_sched_out(event, cpuctx, ctx);
 380
 381         list_del_event(event, ctx);
 382
 383         if (!ctx->task) {
 384                 /*
 385                  * Allow more per task events with respect to the
 386                  * reservation:
 387                  */
 388                 cpuctx->max_pertask =
 389                         min(perf_max_events - ctx->nr_events,
 390                             perf_max_events - perf_reserved_percpu);
 391         }
 392
 393         perf_enable();
 394         spin_unlock(&ctx->lock);
 395 }
 396
 397
 398 /*
 399  * Remove the event from a task's (or a CPU's) list of events.
 400  *
 401  * Must be called with ctx->mutex held.
 402  *
 403  * CPU events are removed with a smp call. For task events we only
 404  * call when the task is on a CPU.
 405  *
 406  * If event->ctx is a cloned context, callers must make sure that
 407  * every task struct that event->ctx->task could possibly point to
 408  * remains valid.  This is OK when called from perf_release since
 409  * that only calls us on the top-level context, which can't be a clone.
 410  * When called from perf_event_exit_task, it's OK because the
 411  * context has been detached from its task.
 412  */
 413 static void perf_event_remove_from_context(struct perf_event *event)
 414 {
 415         struct perf_event_context *ctx = event->ctx;
 416         struct task_struct *task = ctx->task;
 417
 418         if (!task) {
 419                 /*
 420                  * Per cpu events are removed via an smp call and
 421                  * the removal is always sucessful.
 422                  */
 423                 smp_call_function_single(event->cpu,
 424                                          __perf_event_remove_from_context,
 425                                          event, 1);
 426                 return;
 427         }
 428
 429 retry:
 430         task_oncpu_function_call(task, __perf_event_remove_from_context,
 431                                  event);
 432
 433         spin_lock_irq(&ctx->lock);
 434         /*
 435          * If the context is active we need to retry the smp call.
 436          */
 437         if (ctx->nr_active && !list_empty(&event->group_entry)) {
 438                 spin_unlock_irq(&ctx->lock);
 439                 goto retry;
 440         }
 441
 442         /*
 443          * The lock prevents that this context is scheduled in so we
 444          * can remove the event safely, if the call above did not
 445          * succeed.
 446          */
 447         if (!list_empty(&event->group_entry)) {
 448                 list_del_event(event, ctx);
 449         }
 450         spin_unlock_irq(&ctx->lock);
 451 }
 452
 453 static inline u64 perf_clock(void)
 454 {
 455         return cpu_clock(smp_processor_id());
 456 }
 457
 458 /*
 459  * Update the record of the current time in a context.
 460  */
 461 static void update_context_time(struct perf_event_context *ctx)
 462 {
 463         u64 now = perf_clock();
 464
 465         ctx->time += now - ctx->timestamp;
 466         ctx->timestamp = now;
 467 }
 468
 469 /*
 470  * Update the total_time_enabled and total_time_running fields for a event.
 471  */
 472 static void update_event_times(struct perf_event *event)
 473 {
 474         struct perf_event_context *ctx = event->ctx;
 475         u64 run_end;
 476
 477         if (event->state < PERF_EVENT_STATE_INACTIVE ||
 478             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 479                 return;
 480
 481         event->total_time_enabled = ctx->time - event->tstamp_enabled;
 482
 483         if (event->state == PERF_EVENT_STATE_INACTIVE)
 484                 run_end = event->tstamp_stopped;
 485         else
 486                 run_end = ctx->time;
 487
 488         event->total_time_running = run_end - event->tstamp_running;
 489 }
 490
 491 /*
 492  * Update total_time_enabled and total_time_running for all events in a group.
 493  */
 494 static void update_group_times(struct perf_event *leader)
 495 {
 496         struct perf_event *event;
 497
 498         update_event_times(leader);
 499         list_for_each_entry(event, &leader->sibling_list, group_entry)
 500                 update_event_times(event);
 501 }
 502
 503 /*
 504  * Cross CPU call to disable a performance event
 505  */
 506 static void __perf_event_disable(void *info)
 507 {
 508         struct perf_event *event = info;
 509         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 510         struct perf_event_context *ctx = event->ctx;
 511
 512         /*
 513          * If this is a per-task event, need to check whether this
 514          * event's task is the current task on this cpu.
 515          */
 516         if (ctx->task && cpuctx->task_ctx != ctx)
 517                 return;
 518
 519         spin_lock(&ctx->lock);
 520
 521         /*
 522          * If the event is on, turn it off.
 523          * If it is in error state, leave it in error state.
 524          */
 525         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 526                 update_context_time(ctx);
 527                 update_group_times(event);
 528                 if (event == event->group_leader)
 529                         group_sched_out(event, cpuctx, ctx);
 530                 else
 531                         event_sched_out(event, cpuctx, ctx);
 532                 event->state = PERF_EVENT_STATE_OFF;
 533         }
 534
 535         spin_unlock(&ctx->lock);
 536 }
 537
 538 /*
 539  * Disable a event.
 540  *
 541  * If event->ctx is a cloned context, callers must make sure that
 542  * every task struct that event->ctx->task could possibly point to
 543  * remains valid.  This condition is satisifed when called through
 544  * perf_event_for_each_child or perf_event_for_each because they
 545  * hold the top-level event's child_mutex, so any descendant that
 546  * goes to exit will block in sync_child_event.
 547  * When called from perf_pending_event it's OK because event->ctx
 548  * is the current context on this CPU and preemption is disabled,
 549  * hence we can't get into perf_event_task_sched_out for this context.
 550  */
 551 static void perf_event_disable(struct perf_event *event)
 552 {
 553         struct perf_event_context *ctx = event->ctx;
 554         struct task_struct *task = ctx->task;
 555
 556         if (!task) {
 557                 /*
 558                  * Disable the event on the cpu that it's on
 559                  */
 560                 smp_call_function_single(event->cpu, __perf_event_disable,
 561                                          event, 1);
 562                 return;
 563         }
 564
 565  retry:
 566         task_oncpu_function_call(task, __perf_event_disable, event);
 567
 568         spin_lock_irq(&ctx->lock);
 569         /*
 570          * If the event is still active, we need to retry the cross-call.
 571          */
 572         if (event->state == PERF_EVENT_STATE_ACTIVE) {
 573                 spin_unlock_irq(&ctx->lock);
 574                 goto retry;
 575         }
 576
 577         /*
 578          * Since we have the lock this context can't be scheduled
 579          * in, so we can change the state safely.
 580          */
 581         if (event->state == PERF_EVENT_STATE_INACTIVE) {
 582                 update_group_times(event);
 583                 event->state = PERF_EVENT_STATE_OFF;
 584         }
 585
 586         spin_unlock_irq(&ctx->lock);
 587 }
 588
 589 static int
 590 event_sched_in(struct perf_event *event,
 591                  struct perf_cpu_context *cpuctx,
 592                  struct perf_event_context *ctx,
 593                  int cpu)
 594 {
 595         if (event->state <= PERF_EVENT_STATE_OFF)
 596                 return 0;
 597
 598         event->state = PERF_EVENT_STATE_ACTIVE;
 599         event->oncpu = cpu;     /* TODO: put 'cpu' into cpuctx->cpu */
 600         /*
 601          * The new state must be visible before we turn it on in the hardware:
 602          */
 603         smp_wmb();
 604
 605         if (event->pmu->enable(event)) {
 606                 event->state = PERF_EVENT_STATE_INACTIVE;
 607                 event->oncpu = -1;
 608                 return -EAGAIN;
 609         }
 610
 611         event->tstamp_running += ctx->time - event->tstamp_stopped;
 612
 613         if (!is_software_event(event))
 614                 cpuctx->active_oncpu++;
 615         ctx->nr_active++;
 616
 617         if (event->attr.exclusive)
 618                 cpuctx->exclusive = 1;
 619
 620         return 0;
 621 }
 622
 623 static int
 624 group_sched_in(struct perf_event *group_event,
 625                struct perf_cpu_context *cpuctx,
 626                struct perf_event_context *ctx,
 627                int cpu)
 628 {
 629         struct perf_event *event, *partial_group;
 630         int ret;
 631
 632         if (group_event->state == PERF_EVENT_STATE_OFF)
 633                 return 0;
 634
 635         ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
 636         if (ret)
 637                 return ret < 0 ? ret : 0;
 638
 639         if (event_sched_in(group_event, cpuctx, ctx, cpu))
 640                 return -EAGAIN;
 641
 642         /*
 643          * Schedule in siblings as one group (if any):
 644          */
 645         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 646                 if (event_sched_in(event, cpuctx, ctx, cpu)) {
 647                         partial_group = event;
 648                         goto group_error;
 649                 }
 650         }
 651
 652         return 0;
 653
 654 group_error:
 655         /*
 656          * Groups can be scheduled in as one unit only, so undo any
 657          * partial group before returning:
 658          */
 659         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 660                 if (event == partial_group)
 661                         break;
 662                 event_sched_out(event, cpuctx, ctx);
 663         }
 664         event_sched_out(group_event, cpuctx, ctx);
 665
 666         return -EAGAIN;
 667 }
 668
 669 /*
 670  * Return 1 for a group consisting entirely of software events,
 671  * 0 if the group contains any hardware events.
 672  */
 673 static int is_software_only_group(struct perf_event *leader)
 674 {
 675         struct perf_event *event;
 676
 677         if (!is_software_event(leader))
 678                 return 0;
 679
 680         list_for_each_entry(event, &leader->sibling_list, group_entry)
 681                 if (!is_software_event(event))
 682                         return 0;
 683
 684         return 1;
 685 }
 686
 687 /*
 688  * Work out whether we can put this event group on the CPU now.
 689  */
 690 static int group_can_go_on(struct perf_event *event,
 691                            struct perf_cpu_context *cpuctx,
 692                            int can_add_hw)
 693 {
 694         /*
 695          * Groups consisting entirely of software events can always go on.
 696          */
 697         if (is_software_only_group(event))
 698                 return 1;
 699         /*
 700          * If an exclusive group is already on, no other hardware
 701          * events can go on.
 702          */
 703         if (cpuctx->exclusive)
 704                 return 0;
 705         /*
 706          * If this group is exclusive and there are already
 707          * events on the CPU, it can't go on.
 708          */
 709         if (event->attr.exclusive && cpuctx->active_oncpu)
 710                 return 0;
 711         /*
 712          * Otherwise, try to add it if all previous groups were able
 713          * to go on.
 714          */
 715         return can_add_hw;
 716 }
 717
 718 static void add_event_to_ctx(struct perf_event *event,
 719                                struct perf_event_context *ctx)
 720 {
 721         list_add_event(event, ctx);
 722         event->tstamp_enabled = ctx->time;
 723         event->tstamp_running = ctx->time;
 724         event->tstamp_stopped = ctx->time;
 725 }
 726
 727 /*
 728  * Cross CPU call to install and enable a performance event
 729  *
 730  * Must be called with ctx->mutex held
 731  */
 732 static void __perf_install_in_context(void *info)
 733 {
 734         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 735         struct perf_event *event = info;
 736         struct perf_event_context *ctx = event->ctx;
 737         struct perf_event *leader = event->group_leader;
 738         int cpu = smp_processor_id();
 739         int err;
 740
 741         /*
 742          * If this is a task context, we need to check whether it is
 743          * the current task context of this cpu. If not it has been
 744          * scheduled out before the smp call arrived.
 745          * Or possibly this is the right context but it isn't
 746          * on this cpu because it had no events.
 747          */
 748         if (ctx->task && cpuctx->task_ctx != ctx) {
 749                 if (cpuctx->task_ctx || ctx->task != current)
 750                         return;
 751                 cpuctx->task_ctx = ctx;
 752         }
 753
 754         spin_lock(&ctx->lock);
 755         ctx->is_active = 1;
 756         update_context_time(ctx);
 757
 758         /*
 759          * Protect the list operation against NMI by disabling the
 760          * events on a global level. NOP for non NMI based events.
 761          */
 762         perf_disable();
 763
 764         add_event_to_ctx(event, ctx);
 765
 766         /*
 767          * Don't put the event on if it is disabled or if
 768          * it is in a group and the group isn't on.
 769          */
 770         if (event->state != PERF_EVENT_STATE_INACTIVE ||
 771             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
 772                 goto unlock;
 773
 774         /*
 775          * An exclusive event can't go on if there are already active
 776          * hardware events, and no hardware event can go on if there
 777          * is already an exclusive event on.
 778          */
 779         if (!group_can_go_on(event, cpuctx, 1))
 780                 err = -EEXIST;
 781         else
 782                 err = event_sched_in(event, cpuctx, ctx, cpu);
 783
 784         if (err) {
 785                 /*
 786                  * This event couldn't go on.  If it is in a group
 787                  * then we have to pull the whole group off.
 788                  * If the event group is pinned then put it in error state.
 789                  */
 790                 if (leader != event)
 791                         group_sched_out(leader, cpuctx, ctx);
 792                 if (leader->attr.pinned) {
 793                         update_group_times(leader);
 794                         leader->state = PERF_EVENT_STATE_ERROR;
 795                 }
 796         }
 797
 798         if (!err && !ctx->task && cpuctx->max_pertask)
 799                 cpuctx->max_pertask--;
 800
 801  unlock:
 802         perf_enable();
 803
 804         spin_unlock(&ctx->lock);
 805 }
 806
 807 /*
 808  * Attach a performance event to a context
 809  *
 810  * First we add the event to the list with the hardware enable bit
 811  * in event->hw_config cleared.
 812  *
 813  * If the event is attached to a task which is on a CPU we use a smp
 814  * call to enable it in the task context. The task might have been
 815  * scheduled away, but we check this in the smp call again.
 816  *
 817  * Must be called with ctx->mutex held.
 818  */
 819 static void
 820 perf_install_in_context(struct perf_event_context *ctx,
 821                         struct perf_event *event,
 822                         int cpu)
 823 {
 824         struct task_struct *task = ctx->task;
 825
 826         if (!task) {
 827                 /*
 828                  * Per cpu events are installed via an smp call and
 829                  * the install is always sucessful.
 830                  */
 831                 smp_call_function_single(cpu, __perf_install_in_context,
 832                                          event, 1);
 833                 return;
 834         }
 835
 836 retry:
 837         task_oncpu_function_call(task, __perf_install_in_context,
 838                                  event);
 839
 840         spin_lock_irq(&ctx->lock);
 841         /*
 842          * we need to retry the smp call.
 843          */
 844         if (ctx->is_active && list_empty(&event->group_entry)) {
 845                 spin_unlock_irq(&ctx->lock);
 846                 goto retry;
 847         }
 848
 849         /*
 850          * The lock prevents that this context is scheduled in so we
 851          * can add the event safely, if it the call above did not
 852          * succeed.
 853          */
 854         if (list_empty(&event->group_entry))
 855                 add_event_to_ctx(event, ctx);
 856         spin_unlock_irq(&ctx->lock);
 857 }
 858
 859 /*
 860  * Put a event into inactive state and update time fields.
 861  * Enabling the leader of a group effectively enables all
 862  * the group members that aren't explicitly disabled, so we
 863  * have to update their ->tstamp_enabled also.
 864  * Note: this works for group members as well as group leaders
 865  * since the non-leader members' sibling_lists will be empty.
 866  */
 867 static void __perf_event_mark_enabled(struct perf_event *event,
 868                                         struct perf_event_context *ctx)
 869 {
 870         struct perf_event *sub;
 871
 872         event->state = PERF_EVENT_STATE_INACTIVE;
 873         event->tstamp_enabled = ctx->time - event->total_time_enabled;
 874         list_for_each_entry(sub, &event->sibling_list, group_entry)
 875                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 876                         sub->tstamp_enabled =
 877                                 ctx->time - sub->total_time_enabled;
 878 }
 879
 880 /*
 881  * Cross CPU call to enable a performance event
 882  */
 883 static void __perf_event_enable(void *info)
 884 {
 885         struct perf_event *event = info;
 886         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 887         struct perf_event_context *ctx = event->ctx;
 888         struct perf_event *leader = event->group_leader;
 889         int err;
 890
 891         /*
 892          * If this is a per-task event, need to check whether this
 893          * event's task is the current task on this cpu.
 894          */
 895         if (ctx->task && cpuctx->task_ctx != ctx) {
 896                 if (cpuctx->task_ctx || ctx->task != current)
 897                         return;
 898                 cpuctx->task_ctx = ctx;
 899         }
 900
 901         spin_lock(&ctx->lock);
 902         ctx->is_active = 1;
 903         update_context_time(ctx);
 904
 905         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 906                 goto unlock;
 907         __perf_event_mark_enabled(event, ctx);
 908
 909         /*
 910          * If the event is in a group and isn't the group leader,
 911          * then don't put it on unless the group is on.
 912          */
 913         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 914                 goto unlock;
 915
 916         if (!group_can_go_on(event, cpuctx, 1)) {
 917                 err = -EEXIST;
 918         } else {
 919                 perf_disable();
 920                 if (event == leader)
 921                         err = group_sched_in(event, cpuctx, ctx,
 922                                              smp_processor_id());
 923                 else
 924                         err = event_sched_in(event, cpuctx, ctx,
 925                                                smp_processor_id());
 926                 perf_enable();
 927         }
 928
 929         if (err) {
 930                 /*
 931                  * If this event can't go on and it's part of a
 932                  * group, then the whole group has to come off.
 933                  */
 934                 if (leader != event)
 935                         group_sched_out(leader, cpuctx, ctx);
 936                 if (leader->attr.pinned) {
 937                         update_group_times(leader);
 938                         leader->state = PERF_EVENT_STATE_ERROR;
 939                 }
 940         }
 941
 942  unlock:
 943         spin_unlock(&ctx->lock);
 944 }
 945
 946 /*
 947  * Enable a event.
 948  *
 949  * If event->ctx is a cloned context, callers must make sure that
 950  * every task struct that event->ctx->task could possibly point to
 951  * remains valid.  This condition is satisfied when called through
 952  * perf_event_for_each_child or perf_event_for_each as described
 953  * for perf_event_disable.
 954  */
 955 static void perf_event_enable(struct perf_event *event)
 956 {
 957         struct perf_event_context *ctx = event->ctx;
 958         struct task_struct *task = ctx->task;
 959
 960         if (!task) {
 961                 /*
 962                  * Enable the event on the cpu that it's on
 963                  */
 964                 smp_call_function_single(event->cpu, __perf_event_enable,
 965                                          event, 1);
 966                 return;
 967         }
 968
 969         spin_lock_irq(&ctx->lock);
 970         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 971                 goto out;
 972
 973         /*
 974          * If the event is in error state, clear that first.
 975          * That way, if we see the event in error state below, we
 976          * know that it has gone back into error state, as distinct
 977          * from the task having been scheduled away before the
 978          * cross-call arrived.
 979          */
 980         if (event->state == PERF_EVENT_STATE_ERROR)
 981                 event->state = PERF_EVENT_STATE_OFF;
 982
 983  retry:
 984         spin_unlock_irq(&ctx->lock);
 985         task_oncpu_function_call(task, __perf_event_enable, event);
 986
 987         spin_lock_irq(&ctx->lock);
 988
 989         /*
 990          * If the context is active and the event is still off,
 991          * we need to retry the cross-call.
 992          */
 993         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
 994                 goto retry;
 995
 996         /*
 997          * Since we have the lock this context can't be scheduled
 998          * in, so we can change the state safely.
 999          */
1000         if (event->state == PERF_EVENT_STATE_OFF)
1001                 __perf_event_mark_enabled(event, ctx);
1002
1003  out:
1004         spin_unlock_irq(&ctx->lock);
1005 }
1006
1007 static int perf_event_refresh(struct perf_event *event, int refresh)
1008 {
1009         /*
1010          * not supported on inherited events
1011          */
1012         if (event->attr.inherit)
1013                 return -EINVAL;
1014
1015         atomic_add(refresh, &event->event_limit);
1016         perf_event_enable(event);
1017
1018         return 0;
1019 }
1020
1021 void __perf_event_sched_out(struct perf_event_context *ctx,
1022                               struct perf_cpu_context *cpuctx)
1023 {
1024         struct perf_event *event;
1025
1026         spin_lock(&ctx->lock);
1027         ctx->is_active = 0;
1028         if (likely(!ctx->nr_events))
1029                 goto out;
1030         update_context_time(ctx);
1031
1032         perf_disable();
1033         if (ctx->nr_active)
1034                 list_for_each_entry(event, &ctx->group_list, group_entry)
1035                         group_sched_out(event, cpuctx, ctx);
1036
1037         perf_enable();
1038  out:
1039         spin_unlock(&ctx->lock);
1040 }
1041
1042 /*
1043  * Test whether two contexts are equivalent, i.e. whether they
1044  * have both been cloned from the same version of the same context
1045  * and they both have the same number of enabled events.
1046  * If the number of enabled events is the same, then the set
1047  * of enabled events should be the same, because these are both
1048  * inherited contexts, therefore we can't access individual events
1049  * in them directly with an fd; we can only enable/disable all
1050  * events via prctl, or enable/disable all events in a family
1051  * via ioctl, which will have the same effect on both contexts.
1052  */
1053 static int context_equiv(struct perf_event_context *ctx1,
1054                          struct perf_event_context *ctx2)
1055 {
1056         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1057                 && ctx1->parent_gen == ctx2->parent_gen
1058                 && !ctx1->pin_count && !ctx2->pin_count;
1059 }
1060
1061 static void __perf_event_read(void *event);
1062
1063 static void __perf_event_sync_stat(struct perf_event *event,
1064                                      struct perf_event *next_event)
1065 {
1066         u64 value;
1067
1068         if (!event->attr.inherit_stat)
1069                 return;
1070
1071         /*
1072          * Update the event value, we cannot use perf_event_read()
1073          * because we're in the middle of a context switch and have IRQs
1074          * disabled, which upsets smp_call_function_single(), however
1075          * we know the event must be on the current CPU, therefore we
1076          * don't need to use it.
1077          */
1078         switch (event->state) {
1079         case PERF_EVENT_STATE_ACTIVE:
1080                 __perf_event_read(event);
1081                 break;
1082
1083         case PERF_EVENT_STATE_INACTIVE:
1084                 update_event_times(event);
1085                 break;
1086
1087         default:
1088                 break;
1089         }
1090
1091         /*
1092          * In order to keep per-task stats reliable we need to flip the event
1093          * values when we flip the contexts.
1094          */
1095         value = atomic64_read(&next_event->count);
1096         value = atomic64_xchg(&event->count, value);
1097         atomic64_set(&next_event->count, value);
1098
1099         swap(event->total_time_enabled, next_event->total_time_enabled);
1100         swap(event->total_time_running, next_event->total_time_running);
1101
1102         /*
1103          * Since we swizzled the values, update the user visible data too.
1104          */
1105         perf_event_update_userpage(event);
1106         perf_event_update_userpage(next_event);
1107 }
1108
/*
 * Return the entry following @pos on the list it is linked into through
 * its @member field.
 *
 * @pos is parenthesized at every use so that any pointer-valued
 * expression (e.g. "&obj" or "p + 1") can be passed, not only a plain
 * identifier.  The macro does not check whether the next node is the
 * list head; callers have to test for that themselves.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1111
1112 static void perf_event_sync_stat(struct perf_event_context *ctx,
1113                                    struct perf_event_context *next_ctx)
1114 {
1115         struct perf_event *event, *next_event;
1116
1117         if (!ctx->nr_stat)
1118                 return;
1119
1120         event = list_first_entry(&ctx->event_list,
1121                                    struct perf_event, event_entry);
1122
1123         next_event = list_first_entry(&next_ctx->event_list,
1124                                         struct perf_event, event_entry);
1125
1126         while (&event->event_entry != &ctx->event_list &&
1127                &next_event->event_entry != &next_ctx->event_list) {
1128
1129                 __perf_event_sync_stat(event, next_event);
1130
1131                 event = list_next_entry(event, event_entry);
1132                 next_event = list_next_entry(next_event, event_entry);
1133         }
1134 }
1135
1136 /*
1137  * Called from scheduler to remove the events of the current task,
1138  * with interrupts disabled.
1139  *
1140  * We stop each event and update the event value in event->count.
1141  *
1142  * This does not protect us against NMI, but disable()
1143  * sets the disabled bit in the control field of event _before_
1144  * accessing the event control register. If a NMI hits, then it will
1145  * not restart the event.
1146  */
1147 void perf_event_task_sched_out(struct task_struct *task,
1148                                  struct task_struct *next, int cpu)
1149 {
1150         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1151         struct perf_event_context *ctx = task->perf_event_ctxp;
1152         struct perf_event_context *next_ctx;
1153         struct perf_event_context *parent;
1154         struct pt_regs *regs;
1155         int do_switch = 1;
1156
1157         regs = task_pt_regs(task);
1158         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1159
1160         if (likely(!ctx || !cpuctx->task_ctx))
1161                 return;
1162
1163         update_context_time(ctx);
1164
1165         rcu_read_lock();
1166         parent = rcu_dereference(ctx->parent_ctx);
1167         next_ctx = next->perf_event_ctxp;
1168         if (parent && next_ctx &&
1169             rcu_dereference(next_ctx->parent_ctx) == parent) {
1170                 /*
1171                  * Looks like the two contexts are clones, so we might be
1172                  * able to optimize the context switch.  We lock both
1173                  * contexts and check that they are clones under the
1174                  * lock (including re-checking that neither has been
1175                  * uncloned in the meantime).  It doesn't matter which
1176                  * order we take the locks because no other cpu could
1177                  * be trying to lock both of these tasks.
1178                  */
1179                 spin_lock(&ctx->lock);
1180                 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1181                 if (context_equiv(ctx, next_ctx)) {
1182                         /*
1183                          * XXX do we need a memory barrier of sorts
1184                          * wrt to rcu_dereference() of perf_event_ctxp
1185                          */
1186                         task->perf_event_ctxp = next_ctx;
1187                         next->perf_event_ctxp = ctx;
1188                         ctx->task = next;
1189                         next_ctx->task = task;
1190                         do_switch = 0;
1191
1192                         perf_event_sync_stat(ctx, next_ctx);
1193                 }
1194                 spin_unlock(&next_ctx->lock);
1195                 spin_unlock(&ctx->lock);
1196         }
1197         rcu_read_unlock();
1198
1199         if (do_switch) {
1200                 __perf_event_sched_out(ctx, cpuctx);
1201                 cpuctx->task_ctx = NULL;
1202         }
1203 }
1204
1205 /*
1206  * Called with IRQs disabled
1207  */
1208 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1209 {
1210         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1211
1212         if (!cpuctx->task_ctx)
1213                 return;
1214
1215         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1216                 return;
1217
1218         __perf_event_sched_out(ctx, cpuctx);
1219         cpuctx->task_ctx = NULL;
1220 }
1221
1222 /*
1223  * Called with IRQs disabled
1224  */
1225 static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1226 {
1227         __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1228 }
1229
1230 static void
1231 __perf_event_sched_in(struct perf_event_context *ctx,
1232                         struct perf_cpu_context *cpuctx, int cpu)
1233 {
1234         struct perf_event *event;
1235         int can_add_hw = 1;
1236
1237         spin_lock(&ctx->lock);
1238         ctx->is_active = 1;
1239         if (likely(!ctx->nr_events))
1240                 goto out;
1241
1242         ctx->timestamp = perf_clock();
1243
1244         perf_disable();
1245
1246         /*
1247          * First go through the list and put on any pinned groups
1248          * in order to give them the best chance of going on.
1249          */
1250         list_for_each_entry(event, &ctx->group_list, group_entry) {
1251                 if (event->state <= PERF_EVENT_STATE_OFF ||
1252                     !event->attr.pinned)
1253                         continue;
1254                 if (event->cpu != -1 && event->cpu != cpu)
1255                         continue;
1256
1257                 if (group_can_go_on(event, cpuctx, 1))
1258                         group_sched_in(event, cpuctx, ctx, cpu);
1259
1260                 /*
1261                  * If this pinned group hasn't been scheduled,
1262                  * put it in error state.
1263                  */
1264                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1265                         update_group_times(event);
1266                         event->state = PERF_EVENT_STATE_ERROR;
1267                 }
1268         }
1269
1270         list_for_each_entry(event, &ctx->group_list, group_entry) {
1271                 /*
1272                  * Ignore events in OFF or ERROR state, and
1273                  * ignore pinned events since we did them already.
1274                  */
1275                 if (event->state <= PERF_EVENT_STATE_OFF ||
1276                     event->attr.pinned)
1277                         continue;
1278
1279                 /*
1280                  * Listen to the 'cpu' scheduling filter constraint
1281                  * of events:
1282                  */
1283                 if (event->cpu != -1 && event->cpu != cpu)
1284                         continue;
1285
1286                 if (group_can_go_on(event, cpuctx, can_add_hw))
1287                         if (group_sched_in(event, cpuctx, ctx, cpu))
1288                                 can_add_hw = 0;
1289         }
1290         perf_enable();
1291  out:
1292         spin_unlock(&ctx->lock);
1293 }
1294
1295 /*
1296  * Called from scheduler to add the events of the current task
1297  * with interrupts disabled.
1298  *
1299  * We restore the event value and then enable it.
1300  *
1301  * This does not protect us against NMI, but enable()
1302  * sets the enabled bit in the control field of event _before_
1303  * accessing the event control register. If a NMI hits, then it will
1304  * keep the event running.
1305  */
1306 void perf_event_task_sched_in(struct task_struct *task, int cpu)
1307 {
1308         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1309         struct perf_event_context *ctx = task->perf_event_ctxp;
1310
1311         if (likely(!ctx))
1312                 return;
1313         if (cpuctx->task_ctx == ctx)
1314                 return;
1315         __perf_event_sched_in(ctx, cpuctx, cpu);
1316         cpuctx->task_ctx = ctx;
1317 }
1318
1319 static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1320 {
1321         struct perf_event_context *ctx = &cpuctx->ctx;
1322
1323         __perf_event_sched_in(ctx, cpuctx, cpu);
1324 }
1325
1326 #define MAX_INTERRUPTS (~0ULL)
1327
1328 static void perf_log_throttle(struct perf_event *event, int enable);
1329
1330 static void perf_adjust_period(struct perf_event *event, u64 events)
1331 {
1332         struct hw_perf_event *hwc = &event->hw;
1333         u64 period, sample_period;
1334         s64 delta;
1335
1336         events *= hwc->sample_period;
1337         period = div64_u64(events, event->attr.sample_freq);
1338
1339         delta = (s64)(period - hwc->sample_period);
1340         delta = (delta + 7) / 8; /* low pass filter */
1341
1342         sample_period = hwc->sample_period + delta;
1343
1344         if (!sample_period)
1345                 sample_period = 1;
1346
1347         hwc->sample_period = sample_period;
1348 }
1349
1350 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1351 {
1352         struct perf_event *event;
1353         struct hw_perf_event *hwc;
1354         u64 interrupts, freq;
1355
1356         spin_lock(&ctx->lock);
1357         list_for_each_entry(event, &ctx->group_list, group_entry) {
1358                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1359                         continue;
1360
1361                 hwc = &event->hw;
1362
1363                 interrupts = hwc->interrupts;
1364                 hwc->interrupts = 0;
1365
1366                 /*
1367                  * unthrottle events on the tick
1368                  */
1369                 if (interrupts == MAX_INTERRUPTS) {
1370                         perf_log_throttle(event, 1);
1371                         event->pmu->unthrottle(event);
1372                         interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1373                 }
1374
1375                 if (!event->attr.freq || !event->attr.sample_freq)
1376                         continue;
1377
1378                 /*
1379                  * if the specified freq < HZ then we need to skip ticks
1380                  */
1381                 if (event->attr.sample_freq < HZ) {
1382                         freq = event->attr.sample_freq;
1383
1384                         hwc->freq_count += freq;
1385                         hwc->freq_interrupts += interrupts;
1386
1387                         if (hwc->freq_count < HZ)
1388                                 continue;
1389
1390                         interrupts = hwc->freq_interrupts;
1391                         hwc->freq_interrupts = 0;
1392                         hwc->freq_count -= HZ;
1393                 } else
1394                         freq = HZ;
1395
1396                 perf_adjust_period(event, freq * interrupts);
1397
1398                 /*
1399                  * In order to avoid being stalled by an (accidental) huge
1400                  * sample period, force reset the sample period if we didn't
1401                  * get any events in this freq period.
1402                  */
1403                 if (!interrupts) {
1404                         perf_disable();
1405                         event->pmu->disable(event);
1406                         atomic64_set(&hwc->period_left, 0);
1407                         event->pmu->enable(event);
1408                         perf_enable();
1409                 }
1410         }
1411         spin_unlock(&ctx->lock);
1412 }
1413
1414 /*
1415  * Round-robin a context's events:
1416  */
1417 static void rotate_ctx(struct perf_event_context *ctx)
1418 {
1419         struct perf_event *event;
1420
1421         if (!ctx->nr_events)
1422                 return;
1423
1424         spin_lock(&ctx->lock);
1425         /*
1426          * Rotate the first entry last (works just fine for group events too):
1427          */
1428         perf_disable();
1429         list_for_each_entry(event, &ctx->group_list, group_entry) {
1430                 list_move_tail(&event->group_entry, &ctx->group_list);
1431                 break;
1432         }
1433         perf_enable();
1434
1435         spin_unlock(&ctx->lock);
1436 }
1437
1438 void perf_event_task_tick(struct task_struct *curr, int cpu)
1439 {
1440         struct perf_cpu_context *cpuctx;
1441         struct perf_event_context *ctx;
1442
1443         if (!atomic_read(&nr_events))
1444                 return;
1445
1446         cpuctx = &per_cpu(perf_cpu_context, cpu);
1447         ctx = curr->perf_event_ctxp;
1448
1449         perf_ctx_adjust_freq(&cpuctx->ctx);
1450         if (ctx)
1451                 perf_ctx_adjust_freq(ctx);
1452
1453         perf_event_cpu_sched_out(cpuctx);
1454         if (ctx)
1455                 __perf_event_task_sched_out(ctx);
1456
1457         rotate_ctx(&cpuctx->ctx);
1458         if (ctx)
1459                 rotate_ctx(ctx);
1460
1461         perf_event_cpu_sched_in(cpuctx, cpu);
1462         if (ctx)
1463                 perf_event_task_sched_in(curr, cpu);
1464 }
1465
1466 /*
1467  * Enable all of a task's events that have been marked enable-on-exec.
1468  * This expects task == current.
1469  */
1470 static void perf_event_enable_on_exec(struct task_struct *task)
1471 {
1472         struct perf_event_context *ctx;
1473         struct perf_event *event;
1474         unsigned long flags;
1475         int enabled = 0;
1476
1477         local_irq_save(flags);
1478         ctx = task->perf_event_ctxp;
1479         if (!ctx || !ctx->nr_events)
1480                 goto out;
1481
1482         __perf_event_task_sched_out(ctx);
1483
1484         spin_lock(&ctx->lock);
1485
1486         list_for_each_entry(event, &ctx->group_list, group_entry) {
1487                 if (!event->attr.enable_on_exec)
1488                         continue;
1489                 event->attr.enable_on_exec = 0;
1490                 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1491                         continue;
1492                 __perf_event_mark_enabled(event, ctx);
1493                 enabled = 1;
1494         }
1495
1496         /*
1497          * Unclone this context if we enabled any event.
1498          */
1499         if (enabled)
1500                 unclone_ctx(ctx);
1501
1502         spin_unlock(&ctx->lock);
1503
1504         perf_event_task_sched_in(task, smp_processor_id());
1505  out:
1506         local_irq_restore(flags);
1507 }
1508
1509 /*
1510  * Cross CPU call to read the hardware event
1511  */
1512 static void __perf_event_read(void *info)
1513 {
1514         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1515         struct perf_event *event = info;
1516         struct perf_event_context *ctx = event->ctx;
1517         unsigned long flags;
1518
1519         /*
1520          * If this is a task context, we need to check whether it is
1521          * the current task context of this cpu.  If not it has been
1522          * scheduled out before the smp call arrived.  In that case
1523          * event->count would have been updated to a recent sample
1524          * when the event was scheduled out.
1525          */
1526         if (ctx->task && cpuctx->task_ctx != ctx)
1527                 return;
1528
1529         local_irq_save(flags);
1530         if (ctx->is_active)
1531                 update_context_time(ctx);
1532         event->pmu->read(event);
1533         update_event_times(event);
1534         local_irq_restore(flags);
1535 }
1536
1537 static u64 perf_event_read(struct perf_event *event)
1538 {
1539         /*
1540          * If event is enabled and currently active on a CPU, update the
1541          * value in the event structure:
1542          */
1543         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1544                 smp_call_function_single(event->oncpu,
1545                                          __perf_event_read, event, 1);
1546         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1547                 update_event_times(event);
1548         }
1549
1550         return atomic64_read(&event->count);
1551 }
1552
1553 /*
1554  * Initialize the perf_event context in a task_struct:
1555  */
1556 static void
1557 __perf_event_init_context(struct perf_event_context *ctx,
1558                             struct task_struct *task)
1559 {
1560         memset(ctx, 0, sizeof(*ctx));
1561         spin_lock_init(&ctx->lock);
1562         mutex_init(&ctx->mutex);
1563         INIT_LIST_HEAD(&ctx->group_list);
1564         INIT_LIST_HEAD(&ctx->event_list);
1565         atomic_set(&ctx->refcount, 1);
1566         ctx->task = task;
1567 }
1568
1569 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1570 {
1571         struct perf_event_context *ctx;
1572         struct perf_cpu_context *cpuctx;
1573         struct task_struct *task;
1574         unsigned long flags;
1575         int err;
1576
1577         /*
1578          * If cpu is not a wildcard then this is a percpu event:
1579          */
1580         if (cpu != -1) {
1581                 /* Must be root to operate on a CPU event: */
1582                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1583                         return ERR_PTR(-EACCES);
1584
1585                 if (cpu < 0 || cpu > num_possible_cpus())
1586                         return ERR_PTR(-EINVAL);
1587
1588                 /*
1589                  * We could be clever and allow to attach a event to an
1590                  * offline CPU and activate it when the CPU comes up, but
1591                  * that's for later.
1592                  */
1593                 if (!cpu_isset(cpu, cpu_online_map))
1594                         return ERR_PTR(-ENODEV);
1595
1596                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1597                 ctx = &cpuctx->ctx;
1598                 get_ctx(ctx);
1599
1600                 return ctx;
1601         }
1602
1603         rcu_read_lock();
1604         if (!pid)
1605                 task = current;
1606         else
1607                 task = find_task_by_vpid(pid);
1608         if (task)
1609                 get_task_struct(task);
1610         rcu_read_unlock();
1611
1612         if (!task)
1613                 return ERR_PTR(-ESRCH);
1614
1615         /*
1616          * Can't attach events to a dying task.
1617          */
1618         err = -ESRCH;
1619         if (task->flags & PF_EXITING)
1620                 goto errout;
1621
1622         /* Reuse ptrace permission checks for now. */
1623         err = -EACCES;
1624         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1625                 goto errout;
1626
1627  retry:
1628         ctx = perf_lock_task_context(task, &flags);
1629         if (ctx) {
1630                 unclone_ctx(ctx);
1631                 spin_unlock_irqrestore(&ctx->lock, flags);
1632         }
1633
1634         if (!ctx) {
1635                 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1636                 err = -ENOMEM;
1637                 if (!ctx)
1638                         goto errout;
1639                 __perf_event_init_context(ctx, task);
1640                 get_ctx(ctx);
1641                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1642                         /*
1643                          * We raced with some other task; use
1644                          * the context they set.
1645                          */
1646                         kfree(ctx);
1647                         goto retry;
1648                 }
1649                 get_task_struct(task);
1650         }
1651
1652         put_task_struct(task);
1653         return ctx;
1654
1655  errout:
1656         put_task_struct(task);
1657         return ERR_PTR(err);
1658 }
1659
1660 static void free_event_rcu(struct rcu_head *head)
1661 {
1662         struct perf_event *event;
1663
1664         event = container_of(head, struct perf_event, rcu_head);
1665         if (event->ns)
1666                 put_pid_ns(event->ns);
1667         kfree(event);
1668 }
1669
1670 static void perf_pending_sync(struct perf_event *event);
1671
1672 static void free_event(struct perf_event *event)
1673 {
1674         perf_pending_sync(event);
1675
1676         if (!event->parent) {
1677                 atomic_dec(&nr_events);
1678                 if (event->attr.mmap)
1679                         atomic_dec(&nr_mmap_events);
1680                 if (event->attr.comm)
1681                         atomic_dec(&nr_comm_events);
1682                 if (event->attr.task)
1683                         atomic_dec(&nr_task_events);
1684         }
1685
1686         if (event->output) {
1687                 fput(event->output->filp);
1688                 event->output = NULL;
1689         }
1690
1691         if (event->destroy)
1692                 event->destroy(event);
1693
1694         put_ctx(event->ctx);
1695         call_rcu(&event->rcu_head, free_event_rcu);
1696 }
1697
1698 /*
1699  * Called when the last reference to the file is gone.
1700  */
1701 static int perf_release(struct inode *inode, struct file *file)
1702 {
1703         struct perf_event *event = file->private_data;
1704         struct perf_event_context *ctx = event->ctx;
1705
1706         file->private_data = NULL;
1707
1708         WARN_ON_ONCE(ctx->parent_ctx);
1709         mutex_lock(&ctx->mutex);
1710         perf_event_remove_from_context(event);
1711         mutex_unlock(&ctx->mutex);
1712
1713         mutex_lock(&event->owner->perf_event_mutex);
1714         list_del_init(&event->owner_entry);
1715         mutex_unlock(&event->owner->perf_event_mutex);
1716         put_task_struct(event->owner);
1717
1718         free_event(event);
1719
1720         return 0;
1721 }
1722
1723 static int perf_event_read_size(struct perf_event *event)
1724 {
1725         int entry = sizeof(u64); /* value */
1726         int size = 0;
1727         int nr = 1;
1728
1729         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1730                 size += sizeof(u64);
1731
1732         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1733                 size += sizeof(u64);
1734
1735         if (event->attr.read_format & PERF_FORMAT_ID)
1736                 entry += sizeof(u64);
1737
1738         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1739                 nr += event->group_leader->nr_siblings;
1740                 size += sizeof(u64);
1741         }
1742
1743         size += entry * nr;
1744
1745         return size;
1746 }
1747
1748 static u64 perf_event_read_value(struct perf_event *event)
1749 {
1750         struct perf_event *child;
1751         u64 total = 0;
1752
1753         total += perf_event_read(event);
1754         list_for_each_entry(child, &event->child_list, child_list)
1755                 total += perf_event_read(child);
1756
1757         return total;
1758 }
1759
1760 static int perf_event_read_entry(struct perf_event *event,
1761                                    u64 read_format, char __user *buf)
1762 {
1763         int n = 0, count = 0;
1764         u64 values[2];
1765
1766         values[n++] = perf_event_read_value(event);
1767         if (read_format & PERF_FORMAT_ID)
1768                 values[n++] = primary_event_id(event);
1769
1770         count = n * sizeof(u64);
1771
1772         if (copy_to_user(buf, values, count))
1773                 return -EFAULT;
1774
1775         return count;
1776 }
1777
1778 static int perf_event_read_group(struct perf_event *event,
1779                                    u64 read_format, char __user *buf)
1780 {
1781         struct perf_event *leader = event->group_leader, *sub;
1782         int n = 0, size = 0, err = -EFAULT;
1783         u64 values[3];
1784
1785         values[n++] = 1 + leader->nr_siblings;
1786         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1787                 values[n++] = leader->total_time_enabled +
1788                         atomic64_read(&leader->child_total_time_enabled);
1789         }
1790         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1791                 values[n++] = leader->total_time_running +
1792                         atomic64_read(&leader->child_total_time_running);
1793         }
1794
1795         size = n * sizeof(u64);
1796
1797         if (copy_to_user(buf, values, size))
1798                 return -EFAULT;
1799
1800         err = perf_event_read_entry(leader, read_format, buf + size);
1801         if (err < 0)
1802                 return err;
1803
1804         size += err;
1805
1806         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1807                 err = perf_event_read_entry(sub, read_format,
1808                                 buf + size);
1809                 if (err < 0)
1810                         return err;
1811
1812                 size += err;
1813         }
1814
1815         return size;
1816 }
1817
1818 static int perf_event_read_one(struct perf_event *event,
1819                                  u64 read_format, char __user *buf)
1820 {
1821         u64 values[4];
1822         int n = 0;
1823
1824         values[n++] = perf_event_read_value(event);
1825         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1826                 values[n++] = event->total_time_enabled +
1827                         atomic64_read(&event->child_total_time_enabled);
1828         }
1829         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1830                 values[n++] = event->total_time_running +
1831                         atomic64_read(&event->child_total_time_running);
1832         }
1833         if (read_format & PERF_FORMAT_ID)
1834                 values[n++] = primary_event_id(event);
1835
1836         if (copy_to_user(buf, values, n * sizeof(u64)))
1837                 return -EFAULT;
1838
1839         return n * sizeof(u64);
1840 }
1841
1842 /*
1843  * Read the performance event - simple non blocking version for now
1844  */
1845 static ssize_t
1846 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1847 {
1848         u64 read_format = event->attr.read_format;
1849         int ret;
1850
1851         /*
1852          * Return end-of-file for a read on a event that is in
1853          * error state (i.e. because it was pinned but it couldn't be
1854          * scheduled on to the CPU at some point).
1855          */
1856         if (event->state == PERF_EVENT_STATE_ERROR)
1857                 return 0;
1858
1859         if (count < perf_event_read_size(event))
1860                 return -ENOSPC;
1861
1862         WARN_ON_ONCE(event->ctx->parent_ctx);
1863         mutex_lock(&event->child_mutex);
1864         if (read_format & PERF_FORMAT_GROUP)
1865                 ret = perf_event_read_group(event, read_format, buf);
1866         else
1867                 ret = perf_event_read_one(event, read_format, buf);
1868         mutex_unlock(&event->child_mutex);
1869
1870         return ret;
1871 }
1872
1873 static ssize_t
1874 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1875 {
1876         struct perf_event *event = file->private_data;
1877
1878         return perf_read_hw(event, buf, count);
1879 }
1880
1881 static unsigned int perf_poll(struct file *file, poll_table *wait)
1882 {
1883         struct perf_event *event = file->private_data;
1884         struct perf_mmap_data *data;
1885         unsigned int events = POLL_HUP;
1886
1887         rcu_read_lock();
1888         data = rcu_dereference(event->data);
1889         if (data)
1890                 events = atomic_xchg(&data->poll, 0);
1891         rcu_read_unlock();
1892
1893         poll_wait(file, &event->waitq, wait);
1894
1895         return events;
1896 }
1897
1898 static void perf_event_reset(struct perf_event *event)
1899 {
1900         (void)perf_event_read(event);
1901         atomic64_set(&event->count, 0);
1902         perf_event_update_userpage(event);
1903 }
1904
1905 /*
1906  * Holding the top-level event's child_mutex means that any
1907  * descendant process that has inherited this event will block
1908  * in sync_child_event if it goes to exit, thus satisfying the
1909  * task existence requirements of perf_event_enable/disable.
1910  */
1911 static void perf_event_for_each_child(struct perf_event *event,
1912                                         void (*func)(struct perf_event *))
1913 {
1914         struct perf_event *child;
1915
1916         WARN_ON_ONCE(event->ctx->parent_ctx);
1917         mutex_lock(&event->child_mutex);
1918         func(event);
1919         list_for_each_entry(child, &event->child_list, child_list)
1920                 func(child);
1921         mutex_unlock(&event->child_mutex);
1922 }
1923
1924 static void perf_event_for_each(struct perf_event *event,
1925                                   void (*func)(struct perf_event *))
1926 {
1927         struct perf_event_context *ctx = event->ctx;
1928         struct perf_event *sibling;
1929
1930         WARN_ON_ONCE(ctx->parent_ctx);
1931         mutex_lock(&ctx->mutex);
1932         event = event->group_leader;
1933
1934         perf_event_for_each_child(event, func);
1935         func(event);
1936         list_for_each_entry(sibling, &event->sibling_list, group_entry)
1937                 perf_event_for_each_child(event, func);
1938         mutex_unlock(&ctx->mutex);
1939 }
1940
1941 static int perf_event_period(struct perf_event *event, u64 __user *arg)
1942 {
1943         struct perf_event_context *ctx = event->ctx;
1944         unsigned long size;
1945         int ret = 0;
1946         u64 value;
1947
1948         if (!event->attr.sample_period)
1949                 return -EINVAL;
1950
1951         size = copy_from_user(&value, arg, sizeof(value));
1952         if (size != sizeof(value))
1953                 return -EFAULT;
1954
1955         if (!value)
1956                 return -EINVAL;
1957
1958         spin_lock_irq(&ctx->lock);
1959         if (event->attr.freq) {
1960                 if (value > sysctl_perf_event_sample_rate) {
1961                         ret = -EINVAL;
1962                         goto unlock;
1963                 }
1964
1965                 event->attr.sample_freq = value;
1966         } else {
1967                 event->attr.sample_period = value;
1968                 event->hw.sample_period = value;
1969         }
1970 unlock:
1971         spin_unlock_irq(&ctx->lock);
1972
1973         return ret;
1974 }
1975
1976 int perf_event_set_output(struct perf_event *event, int output_fd);
1977
1978 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1979 {
1980         struct perf_event *event = file->private_data;
1981         void (*func)(struct perf_event *);
1982         u32 flags = arg;
1983
1984         switch (cmd) {
1985         case PERF_EVENT_IOC_ENABLE:
1986                 func = perf_event_enable;
1987                 break;
1988         case PERF_EVENT_IOC_DISABLE:
1989                 func = perf_event_disable;
1990                 break;
1991         case PERF_EVENT_IOC_RESET:
1992                 func = perf_event_reset;
1993                 break;
1994
1995         case PERF_EVENT_IOC_REFRESH:
1996                 return perf_event_refresh(event, arg);
1997
1998         case PERF_EVENT_IOC_PERIOD:
1999                 return perf_event_period(event, (u64 __user *)arg);
2000
2001         case PERF_EVENT_IOC_SET_OUTPUT:
2002                 return perf_event_set_output(event, arg);
2003
2004         default:
2005                 return -ENOTTY;
2006         }
2007
2008         if (flags & PERF_IOC_FLAG_GROUP)
2009                 perf_event_for_each(event, func);
2010         else
2011                 perf_event_for_each_child(event, func);
2012
2013         return 0;
2014 }
2015
2016 int perf_event_task_enable(void)
2017 {
2018         struct perf_event *event;
2019
2020         mutex_lock(&current->perf_event_mutex);
2021         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2022                 perf_event_for_each_child(event, perf_event_enable);
2023         mutex_unlock(&current->perf_event_mutex);
2024
2025         return 0;
2026 }
2027
2028 int perf_event_task_disable(void)
2029 {
2030         struct perf_event *event;
2031
2032         mutex_lock(&current->perf_event_mutex);
2033         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2034                 perf_event_for_each_child(event, perf_event_disable);
2035         mutex_unlock(&current->perf_event_mutex);
2036
2037         return 0;
2038 }
2039
2040 #ifndef PERF_EVENT_INDEX_OFFSET
2041 # define PERF_EVENT_INDEX_OFFSET 0
2042 #endif
2043
2044 static int perf_event_index(struct perf_event *event)
2045 {
2046         if (event->state != PERF_EVENT_STATE_ACTIVE)
2047                 return 0;
2048
2049         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2050 }
2051
2052 /*
2053  * Callers need to ensure there can be no nesting of this function, otherwise
2054  * the seqlock logic goes bad. We can not serialize this because the arch
2055  * code calls this from NMI context.
2056  */
2057 void perf_event_update_userpage(struct perf_event *event)
2058 {
2059         struct perf_event_mmap_page *userpg;
2060         struct perf_mmap_data *data;
2061
2062         rcu_read_lock();
2063         data = rcu_dereference(event->data);
2064         if (!data)
2065                 goto unlock;
2066
2067         userpg = data->user_page;
2068
2069         /*
2070          * Disable preemption so as to not let the corresponding user-space
2071          * spin too long if we get preempted.
2072          */
2073         preempt_disable();
2074         ++userpg->lock;
2075         barrier();
2076         userpg->index = perf_event_index(event);
2077         userpg->offset = atomic64_read(&event->count);
2078         if (event->state == PERF_EVENT_STATE_ACTIVE)
2079                 userpg->offset -= atomic64_read(&event->hw.prev_count);
2080
2081         userpg->time_enabled = event->total_time_enabled +
2082                         atomic64_read(&event->child_total_time_enabled);
2083
2084         userpg->time_running = event->total_time_running +
2085                         atomic64_read(&event->child_total_time_running);
2086
2087         barrier();
2088         ++userpg->lock;
2089         preempt_enable();
2090 unlock:
2091         rcu_read_unlock();
2092 }
2093
2094 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2095 {
2096         struct perf_event *event = vma->vm_file->private_data;
2097         struct perf_mmap_data *data;
2098         int ret = VM_FAULT_SIGBUS;
2099
2100         if (vmf->flags & FAULT_FLAG_MKWRITE) {
2101                 if (vmf->pgoff == 0)
2102                         ret = 0;
2103                 return ret;
2104         }
2105
2106         rcu_read_lock();
2107         data = rcu_dereference(event->data);
2108         if (!data)
2109                 goto unlock;
2110
2111         if (vmf->pgoff == 0) {
2112                 vmf->page = virt_to_page(data->user_page);
2113         } else {
2114                 int nr = vmf->pgoff - 1;
2115
2116                 if ((unsigned)nr > data->nr_pages)
2117                         goto unlock;
2118
2119                 if (vmf->flags & FAULT_FLAG_WRITE)
2120                         goto unlock;
2121
2122                 vmf->page = virt_to_page(data->data_pages[nr]);
2123         }
2124
2125         get_page(vmf->page);
2126         vmf->page->mapping = vma->vm_file->f_mapping;
2127         vmf->page->index   = vmf->pgoff;
2128
2129         ret = 0;
2130 unlock:
2131         rcu_read_unlock();
2132
2133         return ret;
2134 }
2135
2136 static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2137 {
2138         struct perf_mmap_data *data;
2139         unsigned long size;
2140         int i;
2141
2142         WARN_ON(atomic_read(&event->mmap_count));
2143
2144         size = sizeof(struct perf_mmap_data);
2145         size += nr_pages * sizeof(void *);
2146
2147         data = kzalloc(size, GFP_KERNEL);
2148         if (!data)
2149                 goto fail;
2150
2151         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2152         if (!data->user_page)
2153                 goto fail_user_page;
2154
2155         for (i = 0; i < nr_pages; i++) {
2156                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2157                 if (!data->data_pages[i])
2158                         goto fail_data_pages;
2159         }
2160
2161         data->nr_pages = nr_pages;
2162         atomic_set(&data->lock, -1);
2163
2164         if (event->attr.watermark) {
2165                 data->watermark = min_t(long, PAGE_SIZE * nr_pages,
2166                                       event->attr.wakeup_watermark);
2167         }
2168         if (!data->watermark)
2169                 data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
2170
2171         rcu_assign_pointer(event->data, data);
2172
2173         return 0;
2174
2175 fail_data_pages:
2176         for (i--; i >= 0; i--)
2177                 free_page((unsigned long)data->data_pages[i]);
2178
2179         free_page((unsigned long)data->user_page);
2180
2181 fail_user_page:
2182         kfree(data);
2183
2184 fail:
2185         return -ENOMEM;
2186 }
2187
2188 static void perf_mmap_free_page(unsigned long addr)
2189 {
2190         struct page *page = virt_to_page((void *)addr);
2191
2192         page->mapping = NULL;
2193         __free_page(page);
2194 }
2195
2196 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2197 {
2198         struct perf_mmap_data *data;
2199         int i;
2200
2201         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2202
2203         perf_mmap_free_page((unsigned long)data->user_page);
2204         for (i = 0; i < data->nr_pages; i++)
2205                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2206
2207         kfree(data);
2208 }
2209
2210 static void perf_mmap_data_free(struct perf_event *event)
2211 {
2212         struct perf_mmap_data *data = event->data;
2213
2214         WARN_ON(atomic_read(&event->mmap_count));
2215
2216         rcu_assign_pointer(event->data, NULL);
2217         call_rcu(&data->rcu_head, __perf_mmap_data_free);
2218 }
2219
2220 static void perf_mmap_open(struct vm_area_struct *vma)
2221 {
2222         struct perf_event *event = vma->vm_file->private_data;
2223
2224         atomic_inc(&event->mmap_count);
2225 }
2226
2227 static void perf_mmap_close(struct vm_area_struct *vma)
2228 {
2229         struct perf_event *event = vma->vm_file->private_data;
2230
2231         WARN_ON_ONCE(event->ctx->parent_ctx);
2232         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2233                 struct user_struct *user = current_user();
2234
2235                 atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm);
2236                 vma->vm_mm->locked_vm -= event->data->nr_locked;
2237                 perf_mmap_data_free(event);
2238                 mutex_unlock(&event->mmap_mutex);
2239         }
2240 }
2241
2242 static const struct vm_operations_struct perf_mmap_vmops = {
2243         .open           = perf_mmap_open,
2244         .close          = perf_mmap_close,
2245         .fault          = perf_mmap_fault,
2246         .page_mkwrite   = perf_mmap_fault,
2247 };
2248
2249 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2250 {
2251         struct perf_event *event = file->private_data;
2252         unsigned long user_locked, user_lock_limit;
2253         struct user_struct *user = current_user();
2254         unsigned long locked, lock_limit;
2255         unsigned long vma_size;
2256         unsigned long nr_pages;
2257         long user_extra, extra;
2258         int ret = 0;
2259
2260         if (!(vma->vm_flags & VM_SHARED))
2261                 return -EINVAL;
2262
2263         vma_size = vma->vm_end - vma->vm_start;
2264         nr_pages = (vma_size / PAGE_SIZE) - 1;
2265
2266         /*
2267          * If we have data pages ensure they're a power-of-two number, so we
2268          * can do bitmasks instead of modulo.
2269          */
2270         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2271                 return -EINVAL;
2272
2273         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2274                 return -EINVAL;
2275
2276         if (vma->vm_pgoff != 0)
2277                 return -EINVAL;
2278
2279         WARN_ON_ONCE(event->ctx->parent_ctx);
2280         mutex_lock(&event->mmap_mutex);
2281         if (event->output) {
2282                 ret = -EINVAL;
2283                 goto unlock;
2284         }
2285
2286         if (atomic_inc_not_zero(&event->mmap_count)) {
2287                 if (nr_pages != event->data->nr_pages)
2288                         ret = -EINVAL;
2289                 goto unlock;
2290         }
2291
2292         user_extra = nr_pages + 1;
2293         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2294
2295         /*
2296          * Increase the limit linearly with more CPUs:
2297          */
2298         user_lock_limit *= num_online_cpus();
2299
2300         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2301
2302         extra = 0;
2303         if (user_locked > user_lock_limit)
2304                 extra = user_locked - user_lock_limit;
2305
2306         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2307         lock_limit >>= PAGE_SHIFT;
2308         locked = vma->vm_mm->locked_vm + extra;
2309
2310         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2311                 !capable(CAP_IPC_LOCK)) {
2312                 ret = -EPERM;
2313                 goto unlock;
2314         }
2315
2316         WARN_ON(event->data);
2317         ret = perf_mmap_data_alloc(event, nr_pages);
2318         if (ret)
2319                 goto unlock;
2320
2321         atomic_set(&event->mmap_count, 1);
2322         atomic_long_add(user_extra, &user->locked_vm);
2323         vma->vm_mm->locked_vm += extra;
2324         event->data->nr_locked = extra;
2325         if (vma->vm_flags & VM_WRITE)
2326                 event->data->writable = 1;
2327
2328 unlock:
2329         mutex_unlock(&event->mmap_mutex);
2330
2331         vma->vm_flags |= VM_RESERVED;
2332         vma->vm_ops = &perf_mmap_vmops;
2333
2334         return ret;
2335 }
2336
2337 static int perf_fasync(int fd, struct file *filp, int on)
2338 {
2339         struct inode *inode = filp->f_path.dentry->d_inode;
2340         struct perf_event *event = filp->private_data;
2341         int retval;
2342
2343         mutex_lock(&inode->i_mutex);
2344         retval = fasync_helper(fd, filp, on, &event->fasync);
2345         mutex_unlock(&inode->i_mutex);
2346
2347         if (retval < 0)
2348                 return retval;
2349
2350         return 0;
2351 }
2352
2353 static const struct file_operations perf_fops = {
2354         .release                = perf_release,
2355         .read                   = perf_read,
2356         .poll                   = perf_poll,
2357         .unlocked_ioctl         = perf_ioctl,
2358         .compat_ioctl           = perf_ioctl,
2359         .mmap                   = perf_mmap,
2360         .fasync                 = perf_fasync,
2361 };
2362
2363 /*
2364  * Perf event wakeup
2365  *
2366  * If there's data, ensure we set the poll() state and publish everything
2367  * to user-space before waking everybody up.
2368  */
2369
2370 void perf_event_wakeup(struct perf_event *event)
2371 {
2372         wake_up_all(&event->waitq);
2373
2374         if (event->pending_kill) {
2375                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2376                 event->pending_kill = 0;
2377         }
2378 }
2379
2380 /*
2381  * Pending wakeups
2382  *
2383  * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2384  *
2385  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2386  * single linked list and use cmpxchg() to add entries lockless.
2387  */
2388
2389 static void perf_pending_event(struct perf_pending_entry *entry)
2390 {
2391         struct perf_event *event = container_of(entry,
2392                         struct perf_event, pending);
2393
2394         if (event->pending_disable) {
2395                 event->pending_disable = 0;
2396                 __perf_event_disable(event);
2397         }
2398
2399         if (event->pending_wakeup) {
2400                 event->pending_wakeup = 0;
2401                 perf_event_wakeup(event);
2402         }
2403 }
2404
2405 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2406
2407 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2408         PENDING_TAIL,
2409 };
2410
2411 static void perf_pending_queue(struct perf_pending_entry *entry,
2412                                void (*func)(struct perf_pending_entry *))
2413 {
2414         struct perf_pending_entry **head;
2415
2416         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2417                 return;
2418
2419         entry->func = func;
2420
2421         head = &get_cpu_var(perf_pending_head);
2422
2423         do {
2424                 entry->next = *head;
2425         } while (cmpxchg(head, entry->next, entry) != entry->next);
2426
2427         set_perf_event_pending();
2428
2429         put_cpu_var(perf_pending_head);
2430 }
2431
2432 static int __perf_pending_run(void)
2433 {
2434         struct perf_pending_entry *list;
2435         int nr = 0;
2436
2437         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2438         while (list != PENDING_TAIL) {
2439                 void (*func)(struct perf_pending_entry *);
2440                 struct perf_pending_entry *entry = list;
2441
2442                 list = list->next;
2443
2444                 func = entry->func;
2445                 entry->next = NULL;
2446                 /*
2447                  * Ensure we observe the unqueue before we issue the wakeup,
2448                  * so that we won't be waiting forever.
2449                  * -- see perf_not_pending().
2450                  */
2451                 smp_wmb();
2452
2453                 func(entry);
2454                 nr++;
2455         }
2456
2457         return nr;
2458 }
2459
2460 static inline int perf_not_pending(struct perf_event *event)
2461 {
2462         /*
2463          * If we flush on whatever cpu we run, there is a chance we don't
2464          * need to wait.
2465          */
2466         get_cpu();
2467         __perf_pending_run();
2468         put_cpu();
2469
2470         /*
2471          * Ensure we see the proper queue state before going to sleep
2472          * so that we do not miss the wakeup. -- see perf_pending_handle()
2473          */
2474         smp_rmb();
2475         return event->pending.next == NULL;
2476 }
2477
2478 static void perf_pending_sync(struct perf_event *event)
2479 {
2480         wait_event(event->waitq, perf_not_pending(event));
2481 }
2482
/*
 * Run the current CPU's pending perf work.  The count returned by
 * __perf_pending_run() is not used.
 */
void perf_event_do_pending(void)
{
        (void)__perf_pending_run();
}
2487
2488 /*
2489  * Callchain support -- arch specific
2490  */
2491
2492 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2493 {
2494         return NULL;
2495 }
2496
2497 /*
2498  * Output
2499  */
2500 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2501                               unsigned long offset, unsigned long head)
2502 {
2503         unsigned long mask;
2504
2505         if (!data->writable)
2506                 return true;
2507
2508         mask = (data->nr_pages << PAGE_SHIFT) - 1;
2509
2510         offset = (offset - tail) & mask;
2511         head   = (head   - tail) & mask;
2512
2513         if ((int)(head - offset) < 0)
2514                 return false;
2515
2516         return true;
2517 }
2518
2519 static void perf_output_wakeup(struct perf_output_handle *handle)
2520 {
2521         atomic_set(&handle->data->poll, POLL_IN);
2522
2523         if (handle->nmi) {
2524                 handle->event->pending_wakeup = 1;
2525                 perf_pending_queue(&handle->event->pending,
2526                                    perf_pending_event);
2527         } else
2528                 perf_event_wakeup(handle->event);
2529 }
2530
2531 /*
2532  * Curious locking construct.
2533  *
2534  * We need to ensure a later event_id doesn't publish a head when a former
2535  * event_id isn't done writing. However since we need to deal with NMIs we
2536  * cannot fully serialize things.
2537  *
2538  * What we do is serialize between CPUs so we only have to deal with NMI
2539  * nesting on a single CPU.
2540  *
2541  * We only publish the head (and generate a wakeup) when the outer-most
2542  * event_id completes.
2543  */
2544 static void perf_output_lock(struct perf_output_handle *handle)
2545 {
2546         struct perf_mmap_data *data = handle->data;
2547         int cpu;
2548
2549         handle->locked = 0;
2550
2551         local_irq_save(handle->flags);
2552         cpu = smp_processor_id();
2553
2554         if (in_nmi() && atomic_read(&data->lock) == cpu)
2555                 return;
2556
2557         while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2558                 cpu_relax();
2559
2560         handle->locked = 1;
2561 }
2562
2563 static void perf_output_unlock(struct perf_output_handle *handle)
2564 {
2565         struct perf_mmap_data *data = handle->data;
2566         unsigned long head;
2567         int cpu;
2568
2569         data->done_head = data->head;
2570
2571         if (!handle->locked)
2572                 goto out;
2573
2574 again:
2575         /*
2576          * The xchg implies a full barrier that ensures all writes are done
2577          * before we publish the new head, matched by a rmb() in userspace when
2578          * reading this position.
2579          */
2580         while ((head = atomic_long_xchg(&data->done_head, 0)))
2581                 data->user_page->data_head = head;
2582
2583         /*
2584          * NMI can happen here, which means we can miss a done_head update.
2585          */
2586
2587         cpu = atomic_xchg(&data->lock, -1);
2588         WARN_ON_ONCE(cpu != smp_processor_id());
2589
2590         /*
2591          * Therefore we have to validate we did not indeed do so.
2592          */
2593         if (unlikely(atomic_long_read(&data->done_head))) {
2594                 /*
2595                  * Since we had it locked, we can lock it again.
2596                  */
2597                 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2598                         cpu_relax();
2599
2600                 goto again;
2601         }
2602
2603         if (atomic_xchg(&data->wakeup, 0))
2604                 perf_output_wakeup(handle);
2605 out:
2606         local_irq_restore(handle->flags);
2607 }
2608
2609 void perf_output_copy(struct perf_output_handle *handle,
2610                       const void *buf, unsigned int len)
2611 {
2612         unsigned int pages_mask;
2613         unsigned int offset;
2614         unsigned int size;
2615         void **pages;
2616
2617         offset          = handle->offset;
2618         pages_mask      = handle->data->nr_pages - 1;
2619         pages           = handle->data->data_pages;
2620
2621         do {
2622                 unsigned int page_offset;
2623                 int nr;
2624
2625                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
2626                 page_offset = offset & (PAGE_SIZE - 1);
2627                 size        = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2628
2629                 memcpy(pages[nr] + page_offset, buf, size);
2630
2631                 len         -= size;
2632                 buf         += size;
2633                 offset      += size;
2634         } while (len);
2635
2636         handle->offset = offset;
2637
2638         /*
2639          * Check we didn't copy past our reservation window, taking the
2640          * possible unsigned int wrap into account.
2641          */
2642         WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2643 }
2644
2645 int perf_output_begin(struct perf_output_handle *handle,
2646                       struct perf_event *event, unsigned int size,
2647                       int nmi, int sample)
2648 {
2649         struct perf_event *output_event;
2650         struct perf_mmap_data *data;
2651         unsigned long tail, offset, head;
2652         int have_lost;
2653         struct {
2654                 struct perf_event_header header;
2655                 u64                      id;
2656                 u64                      lost;
2657         } lost_event;
2658
2659         rcu_read_lock();
2660         /*
2661          * For inherited events we send all the output towards the parent.
2662          */
2663         if (event->parent)
2664                 event = event->parent;
2665
2666         output_event = rcu_dereference(event->output);
2667         if (output_event)
2668                 event = output_event;
2669
2670         data = rcu_dereference(event->data);
2671         if (!data)
2672                 goto out;
2673
2674         handle->data    = data;
2675         handle->event   = event;
2676         handle->nmi     = nmi;
2677         handle->sample  = sample;
2678
2679         if (!data->nr_pages)
2680                 goto fail;
2681
2682         have_lost = atomic_read(&data->lost);
2683         if (have_lost)
2684                 size += sizeof(lost_event);
2685
2686         perf_output_lock(handle);
2687
2688         do {
2689                 /*
2690                  * Userspace could choose to issue a mb() before updating the
2691                  * tail pointer. So that all reads will be completed before the
2692                  * write is issued.
2693                  */
2694                 tail = ACCESS_ONCE(data->user_page->data_tail);
2695                 smp_rmb();
2696                 offset = head = atomic_long_read(&data->head);
2697                 head += size;
2698                 if (unlikely(!perf_output_space(data, tail, offset, head)))
2699                         goto fail;
2700         } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2701
2702         handle->offset  = offset;
2703         handle->head    = head;
2704
2705         if (head - tail > data->watermark)
2706                 atomic_set(&data->wakeup, 1);
2707
2708         if (have_lost) {
2709                 lost_event.header.type = PERF_RECORD_LOST;
2710                 lost_event.header.misc = 0;
2711                 lost_event.header.size = sizeof(lost_event);
2712                 lost_event.id          = event->id;
2713                 lost_event.lost        = atomic_xchg(&data->lost, 0);
2714
2715                 perf_output_put(handle, lost_event);
2716         }
2717
2718         return 0;
2719
2720 fail:
2721         atomic_inc(&data->lost);
2722         perf_output_unlock(handle);
2723 out:
2724         rcu_read_unlock();
2725
2726         return -ENOSPC;
2727 }
2728
2729 void perf_output_end(struct perf_output_handle *handle)
2730 {
2731         struct perf_event *event = handle->event;
2732         struct perf_mmap_data *data = handle->data;
2733
2734         int wakeup_events = event->attr.wakeup_events;
2735
2736         if (handle->sample && wakeup_events) {
2737                 int events = atomic_inc_return(&data->events);
2738                 if (events >= wakeup_events) {
2739                         atomic_sub(wakeup_events, &data->events);
2740                         atomic_set(&data->wakeup, 1);
2741                 }
2742         }
2743
2744         perf_output_unlock(handle);
2745         rcu_read_unlock();
2746 }
2747
2748 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2749 {
2750         /*
2751          * only top level events have the pid namespace they were created in
2752          */
2753         if (event->parent)
2754                 event = event->parent;
2755
2756         return task_tgid_nr_ns(p, event->ns);
2757 }
2758
2759 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2760 {
2761         /*
2762          * only top level events have the pid namespace they were created in
2763          */
2764         if (event->parent)
2765                 event = event->parent;
2766
2767         return task_pid_nr_ns(p, event->ns);
2768 }
2769
2770 static void perf_output_read_one(struct perf_output_handle *handle,
2771                                  struct perf_event *event)
2772 {
2773         u64 read_format = event->attr.read_format;
2774         u64 values[4];
2775         int n = 0;
2776
2777         values[n++] = atomic64_read(&event->count);
2778         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2779                 values[n++] = event->total_time_enabled +
2780                         atomic64_read(&event->child_total_time_enabled);
2781         }
2782         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2783                 values[n++] = event->total_time_running +
2784                         atomic64_read(&event->child_total_time_running);
2785         }
2786         if (read_format & PERF_FORMAT_ID)
2787                 values[n++] = primary_event_id(event);
2788
2789         perf_output_copy(handle, values, n * sizeof(u64));
2790 }
2791
2792 /*
2793  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2794  */
2795 static void perf_output_read_group(struct perf_output_handle *handle,
2796                             struct perf_event *event)
2797 {
2798         struct perf_event *leader = event->group_leader, *sub;
2799         u64 read_format = event->attr.read_format;
2800         u64 values[5];
2801         int n = 0;
2802
2803         values[n++] = 1 + leader->nr_siblings;
2804
2805         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2806                 values[n++] = leader->total_time_enabled;
2807
2808         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2809                 values[n++] = leader->total_time_running;
2810
2811         if (leader != event)
2812                 leader->pmu->read(leader);
2813
2814         values[n++] = atomic64_read(&leader->count);
2815         if (read_format & PERF_FORMAT_ID)
2816                 values[n++] = primary_event_id(leader);
2817
2818         perf_output_copy(handle, values, n * sizeof(u64));
2819
2820         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2821                 n = 0;
2822
2823                 if (sub != event)
2824                         sub->pmu->read(sub);
2825
2826                 values[n++] = atomic64_read(&sub->count);
2827                 if (read_format & PERF_FORMAT_ID)
2828                         values[n++] = primary_event_id(sub);
2829
2830                 perf_output_copy(handle, values, n * sizeof(u64));
2831         }
2832 }
2833
2834 static void perf_output_read(struct perf_output_handle *handle,
2835                              struct perf_event *event)
2836 {
2837         if (event->attr.read_format & PERF_FORMAT_GROUP)
2838                 perf_output_read_group(handle, event);
2839         else
2840                 perf_output_read_one(handle, event);
2841 }
2842
2843 void perf_output_sample(struct perf_output_handle *handle,
2844                         struct perf_event_header *header,
2845                         struct perf_sample_data *data,
2846                         struct perf_event *event)
2847 {
2848         u64 sample_type = data->type;
2849
2850         perf_output_put(handle, *header);
2851
2852         if (sample_type & PERF_SAMPLE_IP)
2853                 perf_output_put(handle, data->ip);
2854
2855         if (sample_type & PERF_SAMPLE_TID)
2856                 perf_output_put(handle, data->tid_entry);
2857
2858         if (sample_type & PERF_SAMPLE_TIME)
2859                 perf_output_put(handle, data->time);
2860
2861         if (sample_type & PERF_SAMPLE_ADDR)
2862                 perf_output_put(handle, data->addr);
2863
2864         if (sample_type & PERF_SAMPLE_ID)
2865                 perf_output_put(handle, data->id);
2866
2867         if (sample_type & PERF_SAMPLE_STREAM_ID)
2868                 perf_output_put(handle, data->stream_id);
2869
2870         if (sample_type & PERF_SAMPLE_CPU)
2871                 perf_output_put(handle, data->cpu_entry);
2872
2873         if (sample_type & PERF_SAMPLE_PERIOD)
2874                 perf_output_put(handle, data->period);
2875
2876         if (sample_type & PERF_SAMPLE_READ)
2877                 perf_output_read(handle, event);
2878
2879         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2880                 if (data->callchain) {
2881                         int size = 1;
2882
2883                         if (data->callchain)
2884                                 size += data->callchain->nr;
2885
2886                         size *= sizeof(u64);
2887
2888                         perf_output_copy(handle, data->callchain, size);
2889                 } else {
2890                         u64 nr = 0;
2891                         perf_output_put(handle, nr);
2892                 }
2893         }
2894
2895         if (sample_type & PERF_SAMPLE_RAW) {
2896                 if (data->raw) {
2897                         perf_output_put(handle, data->raw->size);
2898                         perf_output_copy(handle, data->raw->data,
2899                                          data->raw->size);
2900                 } else {
2901                         struct {
2902                                 u32     size;
2903                                 u32     data;
2904                         } raw = {
2905                                 .size = sizeof(u32),
2906                                 .data = 0,
2907                         };
2908                         perf_output_put(handle, raw);
2909                 }
2910         }
2911 }
2912
2913 void perf_prepare_sample(struct perf_event_header *header,
2914                          struct perf_sample_data *data,
2915                          struct perf_event *event,
2916                          struct pt_regs *regs)
2917 {
2918         u64 sample_type = event->attr.sample_type;
2919
2920         data->type = sample_type;
2921
2922         header->type = PERF_RECORD_SAMPLE;
2923         header->size = sizeof(*header);
2924
2925         header->misc = 0;
2926         header->misc |= perf_misc_flags(regs);
2927
2928         if (sample_type & PERF_SAMPLE_IP) {
2929                 data->ip = perf_instruction_pointer(regs);
2930
2931                 header->size += sizeof(data->ip);
2932         }
2933
2934         if (sample_type & PERF_SAMPLE_TID) {
2935                 /* namespace issues */
2936                 data->tid_entry.pid = perf_event_pid(event, current);
2937                 data->tid_entry.tid = perf_event_tid(event, current);
2938
2939                 header->size += sizeof(data->tid_entry);
2940         }
2941
2942         if (sample_type & PERF_SAMPLE_TIME) {
2943                 data->time = perf_clock();
2944
2945                 header->size += sizeof(data->time);
2946         }
2947
2948         if (sample_type & PERF_SAMPLE_ADDR)
2949                 header->size += sizeof(data->addr);
2950
2951         if (sample_type & PERF_SAMPLE_ID) {
2952                 data->id = primary_event_id(event);
2953
2954                 header->size += sizeof(data->id);
2955         }
2956
2957         if (sample_type & PERF_SAMPLE_STREAM_ID) {
2958                 data->stream_id = event->id;
2959
2960                 header->size += sizeof(data->stream_id);
2961         }
2962
2963         if (sample_type & PERF_SAMPLE_CPU) {
2964                 data->cpu_entry.cpu             = raw_smp_processor_id();
2965                 data->cpu_entry.reserved        = 0;
2966
2967                 header->size += sizeof(data->cpu_entry);
2968         }
2969
2970         if (sample_type & PERF_SAMPLE_PERIOD)
2971                 header->size += sizeof(data->period);
2972
2973         if (sample_type & PERF_SAMPLE_READ)
2974                 header->size += perf_event_read_size(event);
2975
2976         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2977                 int size = 1;
2978
2979                 data->callchain = perf_callchain(regs);
2980
2981                 if (data->callchain)
2982                         size += data->callchain->nr;
2983
2984                 header->size += size * sizeof(u64);
2985         }
2986
2987         if (sample_type & PERF_SAMPLE_RAW) {
2988                 int size = sizeof(u32);
2989
2990                 if (data->raw)
2991                         size += data->raw->size;
2992                 else
2993                         size += sizeof(u32);
2994
2995                 WARN_ON_ONCE(size & (sizeof(u64)-1));
2996                 header->size += size;
2997         }
2998 }
2999
3000 static void perf_event_output(struct perf_event *event, int nmi,
3001                                 struct perf_sample_data *data,
3002                                 struct pt_regs *regs)
3003 {
3004         struct perf_output_handle handle;
3005         struct perf_event_header header;
3006
3007         perf_prepare_sample(&header, data, event, regs);
3008
3009         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3010                 return;
3011
3012         perf_output_sample(&handle, &header, data, event);
3013
3014         perf_output_end(&handle);
3015 }
3016
3017 /*
3018  * read event_id
3019  */
3020
/*
 * Fixed leading part of a PERF_RECORD_READ record as emitted by
 * perf_event_read_event(); the read values follow it in the buffer.
 */
struct perf_read_event {
        struct perf_event_header        header;

        u32                             pid;    /* from perf_event_pid() */
        u32                             tid;    /* from perf_event_tid() */
};
3027
3028 static void
3029 perf_event_read_event(struct perf_event *event,
3030                         struct task_struct *task)
3031 {
3032         struct perf_output_handle handle;
3033         struct perf_read_event read_event = {
3034                 .header = {
3035                         .type = PERF_RECORD_READ,
3036                         .misc = 0,
3037                         .size = sizeof(read_event) + perf_event_read_size(event),
3038                 },
3039                 .pid = perf_event_pid(event, task),
3040                 .tid = perf_event_tid(event, task),
3041         };
3042         int ret;
3043
3044         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3045         if (ret)
3046                 return;
3047
3048         perf_output_put(&handle, read_event);
3049         perf_output_read(&handle, event);
3050
3051         perf_output_end(&handle);
3052 }
3053
3054 /*
3055  * task tracking -- fork/exit
3056  *
3057  * enabled by: attr.comm | attr.mmap | attr.task
3058  */
3059
/*
 * Working state for one fork/exit notification.  Only event_id is
 * copied into the output buffer; task and task_ctx steer delivery.
 */
struct perf_task_event {
        struct task_struct              *task;          /* task the record is about */
        struct perf_event_context       *task_ctx;      /* context to deliver to; NULL means look up task->perf_event_ctxp */

        /* the record as written; ids and time are filled per target event */
        struct {
                struct perf_event_header        header;

                u32                             pid;    /* tgid of task */
                u32                             ppid;   /* tgid of current at emission time */
                u32                             tid;    /* pid of task */
                u32                             ptid;   /* pid of current at emission time */
                u64                             time;   /* perf_clock() at emission time */
        } event_id;
};
3074
3075 static void perf_event_task_output(struct perf_event *event,
3076                                      struct perf_task_event *task_event)
3077 {
3078         struct perf_output_handle handle;
3079         int size;
3080         struct task_struct *task = task_event->task;
3081         int ret;
3082
3083         size  = task_event->event_id.header.size;
3084         ret = perf_output_begin(&handle, event, size, 0, 0);
3085
3086         if (ret)
3087                 return;
3088
3089         task_event->event_id.pid = perf_event_pid(event, task);
3090         task_event->event_id.ppid = perf_event_pid(event, current);
3091
3092         task_event->event_id.tid = perf_event_tid(event, task);
3093         task_event->event_id.ptid = perf_event_tid(event, current);
3094
3095         task_event->event_id.time = perf_clock();
3096
3097         perf_output_put(&handle, task_event->event_id);
3098
3099         perf_output_end(&handle);
3100 }
3101
3102 static int perf_event_task_match(struct perf_event *event)
3103 {
3104         if (event->attr.comm || event->attr.mmap || event->attr.task)
3105                 return 1;
3106
3107         return 0;
3108 }
3109
3110 static void perf_event_task_ctx(struct perf_event_context *ctx,
3111                                   struct perf_task_event *task_event)
3112 {
3113         struct perf_event *event;
3114
3115         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3116                 return;
3117
3118         rcu_read_lock();
3119         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3120                 if (perf_event_task_match(event))
3121                         perf_event_task_output(event, task_event);
3122         }
3123         rcu_read_unlock();
3124 }
3125
3126 static void perf_event_task_event(struct perf_task_event *task_event)
3127 {
3128         struct perf_cpu_context *cpuctx;
3129         struct perf_event_context *ctx = task_event->task_ctx;
3130
3131         cpuctx = &get_cpu_var(perf_cpu_context);
3132         perf_event_task_ctx(&cpuctx->ctx, task_event);
3133         put_cpu_var(perf_cpu_context);
3134
3135         rcu_read_lock();
3136         if (!ctx)
3137                 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3138         if (ctx)
3139                 perf_event_task_ctx(ctx, task_event);
3140         rcu_read_unlock();
3141 }
3142
3143 static void perf_event_task(struct task_struct *task,
3144                               struct perf_event_context *task_ctx,
3145                               int new)
3146 {
3147         struct perf_task_event task_event;
3148
3149         if (!atomic_read(&nr_comm_events) &&
3150             !atomic_read(&nr_mmap_events) &&
3151             !atomic_read(&nr_task_events))
3152                 return;
3153
3154         task_event = (struct perf_task_event){
3155                 .task     = task,
3156                 .task_ctx = task_ctx,
3157                 .event_id    = {
3158                         .header = {
3159                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3160                                 .misc = 0,
3161                                 .size = sizeof(task_event.event_id),
3162                         },
3163                         /* .pid  */
3164                         /* .ppid */
3165                         /* .tid  */
3166                         /* .ptid */
3167                 },
3168         };
3169
3170         perf_event_task_event(&task_event);
3171 }
3172
/*
 * Report a fork of @task: emits a PERF_RECORD_FORK record with no
 * explicit task context, so the context is looked up from the task.
 */
void perf_event_fork(struct task_struct *task)
{
        perf_event_task(task, NULL, 1);
}
3177
3178 /*
3179  * comm tracking
3180  */
3181
/*
 * Working state for one comm notification.  event_id is written to
 * the buffer, followed by comm_size bytes taken from comm.
 */
struct perf_comm_event {
        struct task_struct      *task;          /* task whose comm is reported */
        char                    *comm;          /* zero-padded copy of the name */
        int                     comm_size;      /* bytes of comm to emit, u64 aligned */

        /* fixed part of the record; pid/tid are filled per target event */
        struct {
                struct perf_event_header        header;

                u32                             pid;
                u32                             tid;
        } event_id;
};
3194
3195 static void perf_event_comm_output(struct perf_event *event,
3196                                      struct perf_comm_event *comm_event)
3197 {
3198         struct perf_output_handle handle;
3199         int size = comm_event->event_id.header.size;
3200         int ret = perf_output_begin(&handle, event, size, 0, 0);
3201
3202         if (ret)
3203                 return;
3204
3205         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3206         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3207
3208         perf_output_put(&handle, comm_event->event_id);
3209         perf_output_copy(&handle, comm_event->comm,
3210                                    comm_event->comm_size);
3211         perf_output_end(&handle);
3212 }
3213
3214 static int perf_event_comm_match(struct perf_event *event)
3215 {
3216         if (event->attr.comm)
3217                 return 1;
3218
3219         return 0;
3220 }
3221
3222 static void perf_event_comm_ctx(struct perf_event_context *ctx,
3223                                   struct perf_comm_event *comm_event)
3224 {
3225         struct perf_event *event;
3226
3227         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3228                 return;
3229
3230         rcu_read_lock();
3231         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3232                 if (perf_event_comm_match(event))
3233                         perf_event_comm_output(event, comm_event);
3234         }
3235         rcu_read_unlock();
3236 }
3237
3238 static void perf_event_comm_event(struct perf_comm_event *comm_event)
3239 {
3240         struct perf_cpu_context *cpuctx;
3241         struct perf_event_context *ctx;
3242         unsigned int size;
3243         char comm[TASK_COMM_LEN];
3244
3245         memset(comm, 0, sizeof(comm));
3246         strncpy(comm, comm_event->task->comm, sizeof(comm));
3247         size = ALIGN(strlen(comm)+1, sizeof(u64));
3248
3249         comm_event->comm = comm;
3250         comm_event->comm_size = size;
3251
3252         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3253
3254         cpuctx = &get_cpu_var(perf_cpu_context);
3255         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3256         put_cpu_var(perf_cpu_context);
3257
3258         rcu_read_lock();
3259         /*
3260          * doesn't really matter which of the child contexts the
3261          * events ends up in.
3262          */
3263         ctx = rcu_dereference(current->perf_event_ctxp);
3264         if (ctx)
3265                 perf_event_comm_ctx(ctx, comm_event);
3266         rcu_read_unlock();
3267 }
3268
3269 void perf_event_comm(struct task_struct *task)
3270 {
3271         struct perf_comm_event comm_event;
3272
3273         if (task->perf_event_ctxp)
3274                 perf_event_enable_on_exec(task);
3275
3276         if (!atomic_read(&nr_comm_events))
3277                 return;
3278
3279         comm_event = (struct perf_comm_event){
3280                 .task   = task,
3281                 /* .comm      */
3282                 /* .comm_size */
3283                 .event_id  = {
3284                         .header = {
3285                                 .type = PERF_RECORD_COMM,
3286                                 .misc = 0,
3287                                 /* .size */
3288                         },
3289                         /* .pid */
3290                         /* .tid */
3291                 },
3292         };
3293
3294         perf_event_comm_event(&comm_event);
3295 }
3296
3297 /*
3298  * mmap tracking
3299  */
3300
/*
 * Working state for one mmap notification.  event_id is written to
 * the buffer, followed by file_size bytes taken from file_name.
 */
struct perf_mmap_event {
        struct vm_area_struct   *vma;           /* mapping being reported */

        const char              *file_name;     /* path or pseudo name, zero padded */
        int                     file_size;      /* bytes of file_name to emit, u64 aligned */

        /* fixed part of the record; pid/tid are filled per target event */
        struct {
                struct perf_event_header        header;

                u32                             pid;
                u32                             tid;
                u64                             start;  /* vma->vm_start */
                u64                             len;    /* vma->vm_end - vma->vm_start */
                u64                             pgoff;  /* vma->vm_pgoff */
        } event_id;
};
3317
3318 static void perf_event_mmap_output(struct perf_event *event,
3319                                      struct perf_mmap_event *mmap_event)
3320 {
3321         struct perf_output_handle handle;
3322         int size = mmap_event->event_id.header.size;
3323         int ret = perf_output_begin(&handle, event, size, 0, 0);
3324
3325         if (ret)
3326                 return;
3327
3328         mmap_event->event_id.pid = perf_event_pid(event, current);
3329         mmap_event->event_id.tid = perf_event_tid(event, current);
3330
3331         perf_output_put(&handle, mmap_event->event_id);
3332         perf_output_copy(&handle, mmap_event->file_name,
3333                                    mmap_event->file_size);
3334         perf_output_end(&handle);
3335 }
3336
3337 static int perf_event_mmap_match(struct perf_event *event,
3338                                    struct perf_mmap_event *mmap_event)
3339 {
3340         if (event->attr.mmap)
3341                 return 1;
3342
3343         return 0;
3344 }
3345
3346 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3347                                   struct perf_mmap_event *mmap_event)
3348 {
3349         struct perf_event *event;
3350
3351         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3352                 return;
3353
3354         rcu_read_lock();
3355         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3356                 if (perf_event_mmap_match(event, mmap_event))
3357                         perf_event_mmap_output(event, mmap_event);
3358         }
3359         rcu_read_unlock();
3360 }
3361
3362 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3363 {
3364         struct perf_cpu_context *cpuctx;
3365         struct perf_event_context *ctx;
3366         struct vm_area_struct *vma = mmap_event->vma;
3367         struct file *file = vma->vm_file;
3368         unsigned int size;
3369         char tmp[16];
3370         char *buf = NULL;
3371         const char *name;
3372
3373         memset(tmp, 0, sizeof(tmp));
3374
3375         if (file) {
3376                 /*
3377                  * d_path works from the end of the buffer backwards, so we
3378                  * need to add enough zero bytes after the string to handle
3379                  * the 64bit alignment we do later.
3380                  */
3381                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3382                 if (!buf) {
3383                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3384                         goto got_name;
3385                 }
3386                 name = d_path(&file->f_path, buf, PATH_MAX);
3387                 if (IS_ERR(name)) {
3388                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3389                         goto got_name;
3390                 }
3391         } else {
3392                 if (arch_vma_name(mmap_event->vma)) {
3393                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3394                                        sizeof(tmp));
3395                         goto got_name;
3396                 }
3397
3398                 if (!vma->vm_mm) {
3399                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3400                         goto got_name;
3401                 }
3402
3403                 name = strncpy(tmp, "//anon", sizeof(tmp));
3404                 goto got_name;
3405         }
3406
3407 got_name:
3408         size = ALIGN(strlen(name)+1, sizeof(u64));
3409
3410         mmap_event->file_name = name;
3411         mmap_event->file_size = size;
3412
3413         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3414
3415         cpuctx = &get_cpu_var(perf_cpu_context);
3416         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3417         put_cpu_var(perf_cpu_context);
3418
3419         rcu_read_lock();
3420         /*
3421          * doesn't really matter which of the child contexts the
3422          * events ends up in.
3423          */
3424         ctx = rcu_dereference(current->perf_event_ctxp);
3425         if (ctx)
3426                 perf_event_mmap_ctx(ctx, mmap_event);
3427         rcu_read_unlock();
3428
3429         kfree(buf);
3430 }
3431
3432 void __perf_event_mmap(struct vm_area_struct *vma)
3433 {
3434         struct perf_mmap_event mmap_event;
3435
3436         if (!atomic_read(&nr_mmap_events))
3437                 return;
3438
3439         mmap_event = (struct perf_mmap_event){
3440                 .vma    = vma,
3441                 /* .file_name */
3442                 /* .file_size */
3443                 .event_id  = {
3444                         .header = {
3445                                 .type = PERF_RECORD_MMAP,
3446                                 .misc = 0,
3447                                 /* .size */
3448                         },
3449                         /* .pid */
3450                         /* .tid */
3451                         .start  = vma->vm_start,
3452                         .len    = vma->vm_end - vma->vm_start,
3453                         .pgoff  = vma->vm_pgoff,
3454                 },
3455         };
3456
3457         perf_event_mmap_event(&mmap_event);
3458 }
3459
3460 /*
3461  * IRQ throttle logging
3462  */
3463
3464 static void perf_log_throttle(struct perf_event *event, int enable)
3465 {
3466         struct perf_output_handle handle;
3467         int ret;
3468
3469         struct {
3470                 struct perf_event_header        header;
3471                 u64                             time;
3472                 u64                             id;
3473                 u64                             stream_id;
3474         } throttle_event = {
3475                 .header = {
3476                         .type = PERF_RECORD_THROTTLE,
3477                         .misc = 0,
3478                         .size = sizeof(throttle_event),
3479                 },
3480                 .time           = perf_clock(),
3481                 .id             = primary_event_id(event),
3482                 .stream_id      = event->id,
3483         };
3484
3485         if (enable)
3486                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3487
3488         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3489         if (ret)
3490                 return;
3491
3492         perf_output_put(&handle, throttle_event);
3493         perf_output_end(&handle);
3494 }
3495
3496 /*
3497  * Generic event overflow handling, sampling.
3498  */
3499
3500 static int __perf_event_overflow(struct perf_event *event, int nmi,
3501                                    int throttle, struct perf_sample_data *data,
3502                                    struct pt_regs *regs)
3503 {
3504         int events = atomic_read(&event->event_limit);
3505         struct hw_perf_event *hwc = &event->hw;
3506         int ret = 0;
3507
3508         throttle = (throttle && event->pmu->unthrottle != NULL);
3509
3510         if (!throttle) {
3511                 hwc->interrupts++;
3512         } else {
3513                 if (hwc->interrupts != MAX_INTERRUPTS) {
3514                         hwc->interrupts++;
3515                         if (HZ * hwc->interrupts >
3516                                         (u64)sysctl_perf_event_sample_rate) {
3517                                 hwc->interrupts = MAX_INTERRUPTS;
3518                                 perf_log_throttle(event, 0);
3519                                 ret = 1;
3520                         }
3521                 } else {
3522                         /*
3523                          * Keep re-disabling events even though on the previous
3524                          * pass we disabled it - just in case we raced with a
3525                          * sched-in and the event got enabled again:
3526                          */
3527                         ret = 1;
3528                 }
3529         }
3530
3531         if (event->attr.freq) {
3532                 u64 now = perf_clock();
3533                 s64 delta = now - hwc->freq_stamp;
3534
3535                 hwc->freq_stamp = now;
3536
3537                 if (delta > 0 && delta < TICK_NSEC)
3538                         perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3539         }
3540
3541         /*
3542          * XXX event_limit might not quite work as expected on inherited
3543          * events
3544          */
3545
3546         event->pending_kill = POLL_IN;
3547         if (events && atomic_dec_and_test(&event->event_limit)) {
3548                 ret = 1;
3549                 event->pending_kill = POLL_HUP;
3550                 if (nmi) {
3551                         event->pending_disable = 1;
3552                         perf_pending_queue(&event->pending,
3553                                            perf_pending_event);
3554                 } else
3555                         perf_event_disable(event);
3556         }
3557
3558         perf_event_output(event, nmi, data, regs);
3559         return ret;
3560 }
3561
/*
 * Overflow entry point: handles the overflow with throttling requested
 * (throttle == 1).  Returns the result of __perf_event_overflow().
 */
int perf_event_overflow(struct perf_event *event, int nmi,
                          struct perf_sample_data *data,
                          struct pt_regs *regs)
{
        return __perf_event_overflow(event, nmi, 1, data, regs);
}
3568
3569 /*
3570  * Generic software event infrastructure
3571  */
3572
/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period_left value
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */
3579
3580 static u64 perf_swevent_set_period(struct perf_event *event)
3581 {
3582         struct hw_perf_event *hwc = &event->hw;
3583         u64 period = hwc->last_period;
3584         u64 nr, offset;
3585         s64 old, val;
3586
3587         hwc->last_period = hwc->sample_period;
3588
3589 again:
3590         old = val = atomic64_read(&hwc->period_left);
3591         if (val < 0)
3592                 return 0;
3593
3594         nr = div64_u64(period + val, period);
3595         offset = nr * period;
3596         val -= offset;
3597         if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3598                 goto again;
3599
3600         return nr;
3601 }
3602
3603 static void perf_swevent_overflow(struct perf_event *event,
3604                                     int nmi, struct perf_sample_data *data,
3605                                     struct pt_regs *regs)
3606 {
3607         struct hw_perf_event *hwc = &event->hw;
3608         int throttle = 0;
3609         u64 overflow;
3610
3611         data->period = event->hw.last_period;
3612         overflow = perf_swevent_set_period(event);
3613
3614         if (hwc->interrupts == MAX_INTERRUPTS)
3615                 return;
3616
3617         for (; overflow; overflow--) {
3618                 if (__perf_event_overflow(event, nmi, throttle,
3619                                             data, regs)) {
3620                         /*
3621                          * We inhibit the overflow from happening when
3622                          * hwc->interrupts == MAX_INTERRUPTS.
3623                          */
3624                         break;
3625                 }
3626                 throttle = 1;
3627         }
3628 }
3629
/*
 * Unthrottle hook for software events; intentionally a no-op.
 */
static void perf_swevent_unthrottle(struct perf_event *event)
{
        /*
         * Nothing to do, we already reset hwc->interrupts.
         */
}
3636
3637 static void perf_swevent_add(struct perf_event *event, u64 nr,
3638                                int nmi, struct perf_sample_data *data,
3639                                struct pt_regs *regs)
3640 {
3641         struct hw_perf_event *hwc = &event->hw;
3642
3643         atomic64_add(nr, &event->count);
3644
3645         if (!hwc->sample_period)
3646                 return;
3647
3648         if (!regs)
3649                 return;
3650
3651         if (!atomic64_add_negative(nr, &hwc->period_left))
3652                 perf_swevent_overflow(event, nmi, data, regs);
3653 }
3654
3655 static int perf_swevent_is_counting(struct perf_event *event)
3656 {
3657         /*
3658          * The event is active, we're good!
3659          */
3660         if (event->state == PERF_EVENT_STATE_ACTIVE)
3661                 return 1;
3662
3663         /*
3664          * The event is off/error, not counting.
3665          */
3666         if (event->state != PERF_EVENT_STATE_INACTIVE)
3667                 return 0;
3668
3669         /*
3670          * The event is inactive, if the context is active
3671          * we're part of a group that didn't make it on the 'pmu',
3672          * not counting.
3673          */
3674         if (event->ctx->is_active)
3675                 return 0;
3676
3677         /*
3678          * We're inactive and the context is too, this means the
3679          * task is scheduled out, we're counting events that happen
3680          * to us, like migration events.
3681          */
3682         return 1;
3683 }
3684
3685 static int perf_swevent_match(struct perf_event *event,
3686                                 enum perf_type_id type,
3687                                 u32 event_id, struct pt_regs *regs)
3688 {
3689         if (!perf_swevent_is_counting(event))
3690                 return 0;
3691
3692         if (event->attr.type != type)
3693                 return 0;
3694         if (event->attr.config != event_id)
3695                 return 0;
3696
3697         if (regs) {
3698                 if (event->attr.exclude_user && user_mode(regs))
3699                         return 0;
3700
3701                 if (event->attr.exclude_kernel && !user_mode(regs))
3702                         return 0;
3703         }
3704
3705         return 1;
3706 }
3707
3708 static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3709                                      enum perf_type_id type,
3710                                      u32 event_id, u64 nr, int nmi,
3711                                      struct perf_sample_data *data,
3712                                      struct pt_regs *regs)
3713 {
3714         struct perf_event *event;
3715
3716         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3717                 return;
3718
3719         rcu_read_lock();
3720         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3721                 if (perf_swevent_match(event, type, event_id, regs))
3722                         perf_swevent_add(event, nr, nmi, data, regs);
3723         }
3724         rcu_read_unlock();
3725 }
3726
3727 static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3728 {
3729         if (in_nmi())
3730                 return &cpuctx->recursion[3];
3731
3732         if (in_irq())
3733                 return &cpuctx->recursion[2];
3734
3735         if (in_softirq())
3736                 return &cpuctx->recursion[1];
3737
3738         return &cpuctx->recursion[0];
3739 }
3740
/*
 * Common entry point for delivering a software/tracepoint event.
 *
 * The event is offered first to this CPU's context and then to the
 * current task's context (if it has one). A per-cpu, per-execution-level
 * recursion counter (see perf_swevent_recursion_context()) makes a nested
 * invocation at the same level return without doing anything.
 *
 * get_cpu_var() is paired with put_cpu_var() on every path, including
 * the early exit taken on recursion.
 */
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
				    u64 nr, int nmi,
				    struct perf_sample_data *data,
				    struct pt_regs *regs)
{
	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
	int *recursion = perf_swevent_recursion_context(cpuctx);
	struct perf_event_context *ctx;

	/* Already handling an event at this level on this CPU: bail out. */
	if (*recursion)
		goto out;

	/*
	 * Mark the level busy; the compiler barrier keeps the increment
	 * from being reordered with the event processing below.
	 */
	(*recursion)++;
	barrier();

	/* CPU-wide events first ... */
	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
				 nr, nmi, data, regs);
	/* ... then the events attached to the current task. */
	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * events ends up in.
	 */
	ctx = rcu_dereference(current->perf_event_ctxp);
	if (ctx)
		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
	rcu_read_unlock();

	/* Release the level only after all processing is done. */
	barrier();
	(*recursion)--;

out:
	put_cpu_var(perf_cpu_context);
}
3774
/*
 * Report @nr occurrences of the PERF_TYPE_SOFTWARE event @event_id.
 *
 * @addr is recorded as the sample address; every other member of the
 * on-stack sample descriptor is zero-initialized by the designated
 * initializer. @nmi and @regs are passed through unchanged.
 */
void __perf_sw_event(u32 event_id, u64 nr, int nmi,
			    struct pt_regs *regs, u64 addr)
{
	struct perf_sample_data data = {
		.addr = addr,
	};

	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
				&data, regs);
}
3785
/*
 * ->read() callback for generic software events: empty, there is
 * nothing to do here (perf_swevent_add() updates event->count directly).
 */
static void perf_swevent_read(struct perf_event *event)
{
}
3789
3790 static int perf_swevent_enable(struct perf_event *event)
3791 {
3792         struct hw_perf_event *hwc = &event->hw;
3793
3794         if (hwc->sample_period) {
3795                 hwc->last_period = hwc->sample_period;
3796                 perf_swevent_set_period(event);
3797         }
3798         return 0;
3799 }
3800
/*
 * ->disable() callback for generic software events: empty, there is
 * nothing to tear down.
 */
static void perf_swevent_disable(struct perf_event *event)
{
}
3804
/*
 * pmu operations for generic software events; returned by
 * sw_perf_event_init() for the page-fault, context-switch and
 * cpu-migration events and by tp_perf_event_init() for tracepoints.
 */
static const struct pmu perf_ops_generic = {
	.enable		= perf_swevent_enable,
	.disable	= perf_swevent_disable,
	.read		= perf_swevent_read,
	.unthrottle	= perf_swevent_unthrottle,
};
3811
3812 /*
3813  * hrtimer based swevent callback
3814  */
3815
3816 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3817 {
3818         enum hrtimer_restart ret = HRTIMER_RESTART;
3819         struct perf_sample_data data;
3820         struct pt_regs *regs;
3821         struct perf_event *event;
3822         u64 period;
3823
3824         event   = container_of(hrtimer, struct perf_event, hw.hrtimer);
3825         event->pmu->read(event);
3826
3827         data.addr = 0;
3828         regs = get_irq_regs();
3829         /*
3830          * In case we exclude kernel IPs or are somehow not in interrupt
3831          * context, provide the next best thing, the user IP.
3832          */
3833         if ((event->attr.exclude_kernel || !regs) &&
3834                         !event->attr.exclude_user)
3835                 regs = task_pt_regs(current);
3836
3837         if (regs) {
3838                 if (perf_event_overflow(event, 0, &data, regs))
3839                         ret = HRTIMER_NORESTART;
3840         }
3841
3842         period = max_t(u64, 10000, event->hw.sample_period);
3843         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3844
3845         return ret;
3846 }
3847
3848 /*
3849  * Software event: cpu wall time clock
3850  */
3851
3852 static void cpu_clock_perf_event_update(struct perf_event *event)
3853 {
3854         int cpu = raw_smp_processor_id();
3855         s64 prev;
3856         u64 now;
3857
3858         now = cpu_clock(cpu);
3859         prev = atomic64_read(&event->hw.prev_count);
3860         atomic64_set(&event->hw.prev_count, now);
3861         atomic64_add(now - prev, &event->count);
3862 }
3863
3864 static int cpu_clock_perf_event_enable(struct perf_event *event)
3865 {
3866         struct hw_perf_event *hwc = &event->hw;
3867         int cpu = raw_smp_processor_id();
3868
3869         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3870         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3871         hwc->hrtimer.function = perf_swevent_hrtimer;
3872         if (hwc->sample_period) {
3873                 u64 period = max_t(u64, 10000, hwc->sample_period);
3874                 __hrtimer_start_range_ns(&hwc->hrtimer,
3875                                 ns_to_ktime(period), 0,
3876                                 HRTIMER_MODE_REL, 0);
3877         }
3878
3879         return 0;
3880 }
3881
/*
 * ->disable() callback for the cpu-clock software event: cancel the
 * sampling hrtimer (only started for events with a sample period) and
 * account the time elapsed since the last update.
 */
static void cpu_clock_perf_event_disable(struct perf_event *event)
{
	if (event->hw.sample_period)
		hrtimer_cancel(&event->hw.hrtimer);
	cpu_clock_perf_event_update(event);
}
3888
/*
 * ->read() callback for the cpu-clock software event: bring
 * event->count up to date with the current cpu_clock() value.
 */
static void cpu_clock_perf_event_read(struct perf_event *event)
{
	cpu_clock_perf_event_update(event);
}
3893
/*
 * pmu operations for PERF_COUNT_SW_CPU_CLOCK; sw_perf_event_init() also
 * uses them for PERF_COUNT_SW_TASK_CLOCK events whose context has no task.
 */
static const struct pmu perf_ops_cpu_clock = {
	.enable		= cpu_clock_perf_event_enable,
	.disable	= cpu_clock_perf_event_disable,
	.read		= cpu_clock_perf_event_read,
};
3899
3900 /*
3901  * Software event: task time clock
3902  */
3903
3904 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
3905 {
3906         u64 prev;
3907         s64 delta;
3908
3909         prev = atomic64_xchg(&event->hw.prev_count, now);
3910         delta = now - prev;
3911         atomic64_add(delta, &event->count);
3912 }
3913
3914 static int task_clock_perf_event_enable(struct perf_event *event)
3915 {
3916         struct hw_perf_event *hwc = &event->hw;
3917         u64 now;
3918
3919         now = event->ctx->time;
3920
3921         atomic64_set(&hwc->prev_count, now);
3922         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3923         hwc->hrtimer.function = perf_swevent_hrtimer;
3924         if (hwc->sample_period) {
3925                 u64 period = max_t(u64, 10000, hwc->sample_period);
3926                 __hrtimer_start_range_ns(&hwc->hrtimer,
3927                                 ns_to_ktime(period), 0,
3928                                 HRTIMER_MODE_REL, 0);
3929         }
3930
3931         return 0;
3932 }
3933
/*
 * ->disable() callback for the task-clock software event: cancel the
 * sampling hrtimer (only started for events with a sample period) and
 * account the time up to the context's current time value.
 */
static void task_clock_perf_event_disable(struct perf_event *event)
{
	if (event->hw.sample_period)
		hrtimer_cancel(&event->hw.hrtimer);
	task_clock_perf_event_update(event, event->ctx->time);

}
3941
3942 static void task_clock_perf_event_read(struct perf_event *event)
3943 {
3944         u64 time;
3945
3946         if (!in_nmi()) {
3947                 update_context_time(event->ctx);
3948                 time = event->ctx->time;
3949         } else {
3950                 u64 now = perf_clock();
3951                 u64 delta = now - event->ctx->timestamp;
3952                 time = event->ctx->time + delta;
3953         }
3954
3955         task_clock_perf_event_update(event, time);
3956 }
3957
/*
 * pmu operations for PERF_COUNT_SW_TASK_CLOCK events that are attached
 * to a task context (see sw_perf_event_init()).
 */
static const struct pmu perf_ops_task_clock = {
	.enable		= task_clock_perf_event_enable,
	.disable	= task_clock_perf_event_disable,
	.read		= task_clock_perf_event_read,
};
3963
3964 #ifdef CONFIG_EVENT_PROFILE
3965 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
3966                           int entry_size)
3967 {
3968         struct perf_raw_record raw = {
3969                 .size = entry_size,
3970                 .data = record,
3971         };
3972
3973         struct perf_sample_data data = {
3974                 .addr = addr,
3975                 .raw = &raw,
3976         };
3977
3978         struct pt_regs *regs = get_irq_regs();
3979
3980         if (!regs)
3981                 regs = task_pt_regs(current);
3982
3983         do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
3984                                 &data, regs);
3985 }
3986 EXPORT_SYMBOL_GPL(perf_tp_event);
3987
3988 extern int ftrace_profile_enable(int);
3989 extern void ftrace_profile_disable(int);
3990
/*
 * ->destroy() hook for tracepoint events: undo the
 * ftrace_profile_enable() done in tp_perf_event_init().
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
	ftrace_profile_disable(event->attr.config);
}
3995
3996 static const struct pmu *tp_perf_event_init(struct perf_event *event)
3997 {
3998         /*
3999          * Raw tracepoint data is a severe data leak, only allow root to
4000          * have these.
4001          */
4002         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4003                         perf_paranoid_tracepoint_raw() &&
4004                         !capable(CAP_SYS_ADMIN))
4005                 return ERR_PTR(-EPERM);
4006
4007         if (ftrace_profile_enable(event->attr.config))
4008                 return NULL;
4009
4010         event->destroy = tp_perf_event_destroy;
4011
4012         return &perf_ops_generic;
4013 }
4014 #else
/*
 * Stub used when CONFIG_EVENT_PROFILE is not set: always returns NULL,
 * which perf_event_alloc() turns into -EINVAL.
 */
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
	return NULL;
}
4019 #endif
4020
/*
 * Per software-event-id counters, indexed by attr.config. Incremented in
 * sw_perf_event_init() and decremented in sw_perf_event_destroy() for
 * non-inherited generic software events.
 */
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4022
/*
 * ->destroy() hook for generic software events: drop the
 * perf_swevent_enabled[] count taken in sw_perf_event_init().
 *
 * Only installed on events without a parent, hence the WARN_ON().
 */
static void sw_perf_event_destroy(struct perf_event *event)
{
	u64 event_id = event->attr.config;

	WARN_ON(event->parent);

	atomic_dec(&perf_swevent_enabled[event_id]);
}
4031
4032 static const struct pmu *sw_perf_event_init(struct perf_event *event)
4033 {
4034         const struct pmu *pmu = NULL;
4035         u64 event_id = event->attr.config;
4036
4037         /*
4038          * Software events (currently) can't in general distinguish
4039          * between user, kernel and hypervisor events.
4040          * However, context switches and cpu migrations are considered
4041          * to be kernel events, and page faults are never hypervisor
4042          * events.
4043          */
4044         switch (event_id) {
4045         case PERF_COUNT_SW_CPU_CLOCK:
4046                 pmu = &perf_ops_cpu_clock;
4047
4048                 break;
4049         case PERF_COUNT_SW_TASK_CLOCK:
4050                 /*
4051                  * If the user instantiates this as a per-cpu event,
4052                  * use the cpu_clock event instead.
4053                  */
4054                 if (event->ctx->task)
4055                         pmu = &perf_ops_task_clock;
4056                 else
4057                         pmu = &perf_ops_cpu_clock;
4058
4059                 break;
4060         case PERF_COUNT_SW_PAGE_FAULTS:
4061         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4062         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4063         case PERF_COUNT_SW_CONTEXT_SWITCHES:
4064         case PERF_COUNT_SW_CPU_MIGRATIONS:
4065                 if (!event->parent) {
4066                         atomic_inc(&perf_swevent_enabled[event_id]);
4067                         event->destroy = sw_perf_event_destroy;
4068                 }
4069                 pmu = &perf_ops_generic;
4070                 break;
4071         }
4072
4073         return pmu;
4074 }
4075
/*
 * Allocate and initialize an event structure.
 *
 * @attr:         attributes copied into the new event
 * @cpu:          cpu stored in event->cpu
 * @ctx:          context stored in event->ctx (no reference is taken here)
 * @group_leader: leader of the group to join, or NULL to make the new
 *                event its own leader
 * @parent_event: parent for inherited events, NULL otherwise
 * @gfpflags:     allocation flags for kzalloc()
 *
 * Returns the new event, or an ERR_PTR(): -ENOMEM when the allocation
 * fails, -EINVAL when no pmu accepts the event, or the error encoded in
 * the pointer returned by the type-specific init function.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr,
		   int cpu,
		   struct perf_event_context *ctx,
		   struct perf_event *group_leader,
		   struct perf_event *parent_event,
		   gfp_t gfpflags)
{
	const struct pmu *pmu;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	long err;

	event = kzalloc(sizeof(*event), gfpflags);
	if (!event)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = event;

	mutex_init(&event->child_mutex);
	INIT_LIST_HEAD(&event->child_list);

	INIT_LIST_HEAD(&event->group_entry);
	INIT_LIST_HEAD(&event->event_entry);
	INIT_LIST_HEAD(&event->sibling_list);
	init_waitqueue_head(&event->waitq);

	mutex_init(&event->mmap_mutex);

	event->cpu		= cpu;
	event->attr		= *attr;
	event->group_leader	= group_leader;
	event->pmu		= NULL;
	event->ctx		= ctx;
	event->oncpu		= -1;

	event->parent		= parent_event;

	/* Pid namespace reference; dropped again on the error path below. */
	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
	event->id		= atomic64_inc_return(&perf_event_id);

	event->state		= PERF_EVENT_STATE_INACTIVE;

	if (attr->disabled)
		event->state = PERF_EVENT_STATE_OFF;

	pmu = NULL;

	/*
	 * Frequency based events start out with a period of 1; fixed
	 * period events use the period supplied in the attributes.
	 */
	hwc = &event->hw;
	hwc->sample_period = attr->sample_period;
	if (attr->freq && attr->sample_freq)
		hwc->sample_period = 1;
	hwc->last_period = hwc->sample_period;

	atomic64_set(&hwc->period_left, hwc->sample_period);

	/*
	 * we currently do not support PERF_FORMAT_GROUP on inherited events
	 */
	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
		goto done;

	/* Let the type-specific init pick the pmu operations. */
	switch (attr->type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		pmu = hw_perf_event_init(event);
		break;

	case PERF_TYPE_SOFTWARE:
		pmu = sw_perf_event_init(event);
		break;

	case PERF_TYPE_TRACEPOINT:
		pmu = tp_perf_event_init(event);
		break;

	default:
		break;
	}
done:
	/* A NULL pmu means "not supported"; an ERR_PTR carries its own code. */
	err = 0;
	if (!pmu)
		err = -EINVAL;
	else if (IS_ERR(pmu))
		err = PTR_ERR(pmu);

	if (err) {
		if (event->ns)
			put_pid_ns(event->ns);
		kfree(event);
		return ERR_PTR(err);
	}

	event->pmu = pmu;

	/* Global bookkeeping only counts events that are not inherited. */
	if (!event->parent) {
		atomic_inc(&nr_events);
		if (event->attr.mmap)
			atomic_inc(&nr_mmap_events);
		if (event->attr.comm)
			atomic_inc(&nr_comm_events);
		if (event->attr.task)
			atomic_inc(&nr_task_events);
	}

	return event;
}
4192
4193 static int perf_copy_attr(struct perf_event_attr __user *uattr,
4194                           struct perf_event_attr *attr)
4195 {
4196         u32 size;
4197         int ret;
4198
4199         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4200                 return -EFAULT;
4201
4202         /*
4203          * zero the full structure, so that a short copy will be nice.
4204          */
4205         memset(attr, 0, sizeof(*attr));
4206
4207         ret = get_user(size, &uattr->size);
4208         if (ret)
4209                 return ret;
4210
4211         if (size > PAGE_SIZE)   /* silly large */
4212                 goto err_size;
4213
4214         if (!size)              /* abi compat */
4215                 size = PERF_ATTR_SIZE_VER0;
4216
4217         if (size < PERF_ATTR_SIZE_VER0)
4218                 goto err_size;
4219
4220         /*
4221          * If we're handed a bigger struct than we know of,
4222          * ensure all the unknown bits are 0 - i.e. new
4223          * user-space does not rely on any kernel feature
4224          * extensions we dont know about yet.
4225          */
4226         if (size > sizeof(*attr)) {
4227                 unsigned char __user *addr;
4228                 unsigned char __user *end;
4229                 unsigned char val;
4230
4231                 addr = (void __user *)uattr + sizeof(*attr);
4232                 end  = (void __user *)uattr + size;
4233
4234                 for (; addr < end; addr++) {
4235                         ret = get_user(val, addr);
4236                         if (ret)
4237                                 return ret;
4238                         if (val)
4239                                 goto err_size;
4240                 }
4241                 size = sizeof(*attr);
4242         }
4243
4244         ret = copy_from_user(attr, uattr, size);
4245         if (ret)
4246                 return -EFAULT;
4247
4248         /*
4249          * If the type exists, the corresponding creation will verify
4250          * the attr->config.
4251          */
4252         if (attr->type >= PERF_TYPE_MAX)
4253                 return -EINVAL;
4254
4255         if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4256                 return -EINVAL;
4257
4258         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4259                 return -EINVAL;
4260
4261         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4262                 return -EINVAL;
4263
4264 out:
4265         return ret;
4266
4267 err_size:
4268         put_user(sizeof(*attr), &uattr->size);
4269         ret = -E2BIG;
4270         goto out;
4271 }
4272
4273 int perf_event_set_output(struct perf_event *event, int output_fd)
4274 {
4275         struct perf_event *output_event = NULL;
4276         struct file *output_file = NULL;
4277         struct perf_event *old_output;
4278         int fput_needed = 0;
4279         int ret = -EINVAL;
4280
4281         if (!output_fd)
4282                 goto set;
4283
4284         output_file = fget_light(output_fd, &fput_needed);
4285         if (!output_file)
4286                 return -EBADF;
4287
4288         if (output_file->f_op != &perf_fops)
4289                 goto out;
4290
4291         output_event = output_file->private_data;
4292
4293         /* Don't chain output fds */
4294         if (output_event->output)
4295                 goto out;
4296
4297         /* Don't set an output fd when we already have an output channel */
4298         if (event->data)
4299                 goto out;
4300
4301         atomic_long_inc(&output_file->f_count);
4302
4303 set:
4304         mutex_lock(&event->mmap_mutex);
4305         old_output = event->output;
4306         rcu_assign_pointer(event->output, output_event);
4307         mutex_unlock(&event->mmap_mutex);
4308
4309         if (old_output) {
4310                 /*
4311                  * we need to make sure no existing perf_output_*()
4312                  * is still referencing this event.
4313                  */
4314                 synchronize_rcu();
4315                 fput(old_output->filp);
4316         }
4317
4318         ret = 0;
4319 out:
4320         fput_light(output_file, fput_needed);
4321         return ret;
4322 }
4323
/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 * @flags:		PERF_FLAG_FD_NO_GROUP and/or PERF_FLAG_FD_OUTPUT; any
 *			other bit is rejected with -EINVAL
 *
 * Returns the new event's file descriptor on success or a negative
 * error code.
 */
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *event, *group_leader;
	struct perf_event_attr attr;
	struct perf_event_context *ctx;
	struct file *event_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int err;

	/* for future expandability... */
	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
		return -EINVAL;

	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;

	/* Including kernel activity may require CAP_SYS_ADMIN. */
	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	/* Frequency requests are capped by the sample rate sysctl. */
	if (attr.freq) {
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
			return -EINVAL;
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	group_leader = NULL;
	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
		err = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (attr.exclusive || attr.pinned)
			goto err_put_context;
	}

	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
				     NULL, GFP_KERNEL);
	err = PTR_ERR(event);
	if (IS_ERR(event))
		goto err_put_context;

	/* From here on a non-negative err holds the new file descriptor. */
	err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
	if (err < 0)
		goto err_free_put_context;

	/*
	 * NOTE(review): err is >= 0 at this point, so if fget_light() fails
	 * the labels below skip both kfree() and put_ctx() and the fd number
	 * is returned to the caller - confirm this is the intended outcome.
	 */
	event_file = fget_light(err, &fput_needed2);
	if (!event_file)
		goto err_free_put_context;

	if (flags & PERF_FLAG_FD_OUTPUT) {
		/*
		 * NOTE(review): on failure the event is kfree()d below while
		 * the fd obtained from anon_inode_getfd() was created with
		 * this event as its private data - verify the fd cannot be
		 * used or released afterwards.
		 */
		err = perf_event_set_output(event, group_fd);
		if (err)
			goto err_fput_free_put_context;
	}

	event->filp = event_file;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	/* Track the event on the list of events owned by the opening task. */
	event->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

	/*
	 * The success path falls through the labels below as well; the
	 * err < 0 checks make the cleanups apply to failures only.
	 */
err_fput_free_put_context:
	fput_light(event_file, fput_needed2);

err_free_put_context:
	if (err < 0)
		kfree(event);

err_put_context:
	if (err < 0)
		put_ctx(ctx);

	fput_light(group_file, fput_needed);

	return err;
}
4450
4451 /*
4452  * inherit a event from parent task to child task:
4453  */
4454 static struct perf_event *
4455 inherit_event(struct perf_event *parent_event,
4456               struct task_struct *parent,
4457               struct perf_event_context *parent_ctx,
4458               struct task_struct *child,
4459               struct perf_event *group_leader,
4460               struct perf_event_context *child_ctx)
4461 {
4462         struct perf_event *child_event;
4463
4464         /*
4465          * Instead of creating recursive hierarchies of events,
4466          * we link inherited events back to the original parent,
4467          * which has a filp for sure, which we use as the reference
4468          * count:
4469          */
4470         if (parent_event->parent)
4471                 parent_event = parent_event->parent;
4472
4473         child_event = perf_event_alloc(&parent_event->attr,
4474                                            parent_event->cpu, child_ctx,
4475                                            group_leader, parent_event,
4476                                            GFP_KERNEL);
4477         if (IS_ERR(child_event))
4478                 return child_event;
4479         get_ctx(child_ctx);
4480
4481         /*
4482          * Make the child state follow the state of the parent event,
4483          * not its attr.disabled bit.  We hold the parent's mutex,
4484          * so we won't race with perf_event_{en, dis}able_family.
4485          */
4486         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4487                 child_event->state = PERF_EVENT_STATE_INACTIVE;
4488         else
4489                 child_event->state = PERF_EVENT_STATE_OFF;
4490
4491         if (parent_event->attr.freq)
4492                 child_event->hw.sample_period = parent_event->hw.sample_period;
4493
4494         /*
4495          * Link it up in the child's context:
4496          */
4497         add_event_to_ctx(child_event, child_ctx);
4498
4499         /*
4500          * Get a reference to the parent filp - we will fput it
4501          * when the child event exits. This is safe to do because
4502          * we are in the parent and we know that the filp still
4503          * exists and has a nonzero count:
4504          */
4505         atomic_long_inc(&parent_event->filp->f_count);
4506
4507         /*
4508          * Link this into the parent event's child list
4509          */
4510         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4511         mutex_lock(&parent_event->child_mutex);
4512         list_add_tail(&child_event->child_list, &parent_event->child_list);
4513         mutex_unlock(&parent_event->child_mutex);
4514
4515         return child_event;
4516 }
4517
4518 static int inherit_group(struct perf_event *parent_event,
4519               struct task_struct *parent,
4520               struct perf_event_context *parent_ctx,
4521               struct task_struct *child,
4522               struct perf_event_context *child_ctx)
4523 {
4524         struct perf_event *leader;
4525         struct perf_event *sub;
4526         struct perf_event *child_ctr;
4527
4528         leader = inherit_event(parent_event, parent, parent_ctx,
4529                                  child, NULL, child_ctx);
4530         if (IS_ERR(leader))
4531                 return PTR_ERR(leader);
4532         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4533                 child_ctr = inherit_event(sub, parent, parent_ctx,
4534                                             child, leader, child_ctx);
4535                 if (IS_ERR(child_ctr))
4536                         return PTR_ERR(child_ctr);
4537         }
4538         return 0;
4539 }
4540
4541 static void sync_child_event(struct perf_event *child_event,
4542                                struct task_struct *child)
4543 {
4544         struct perf_event *parent_event = child_event->parent;
4545         u64 child_val;
4546
4547         if (child_event->attr.inherit_stat)
4548                 perf_event_read_event(child_event, child);
4549
4550         child_val = atomic64_read(&child_event->count);
4551
4552         /*
4553          * Add back the child's count to the parent's count:
4554          */
4555         atomic64_add(child_val, &parent_event->count);
4556         atomic64_add(child_event->total_time_enabled,
4557                      &parent_event->child_total_time_enabled);
4558         atomic64_add(child_event->total_time_running,
4559                      &parent_event->child_total_time_running);
4560
4561         /*
4562          * Remove this event from the parent's list
4563          */
4564         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4565         mutex_lock(&parent_event->child_mutex);
4566         list_del_init(&child_event->child_list);
4567         mutex_unlock(&parent_event->child_mutex);
4568
4569         /*
4570          * Release the parent event, if this was the last
4571          * reference to it.
4572          */
4573         fput(parent_event->filp);
4574 }
4575
4576 static void
4577 __perf_event_exit_task(struct perf_event *child_event,
4578                          struct perf_event_context *child_ctx,
4579                          struct task_struct *child)
4580 {
4581         struct perf_event *parent_event;
4582
4583         update_event_times(child_event);
4584         perf_event_remove_from_context(child_event);
4585
4586         parent_event = child_event->parent;
4587         /*
4588          * It can happen that parent exits first, and has events
4589          * that are still around due to the child reference. These
4590          * events need to be zapped - but otherwise linger.
4591          */
4592         if (parent_event) {
4593                 sync_child_event(child_event, child);
4594                 free_event(child_event);
4595         }
4596 }
4597
4598 /*
4599  * When a child task exits, feed back event values to parent events.
4600  */
4601 void perf_event_exit_task(struct task_struct *child)
4602 {
4603         struct perf_event *child_event, *tmp;
4604         struct perf_event_context *child_ctx;
4605         unsigned long flags;
4606
4607         if (likely(!child->perf_event_ctxp)) {
4608                 perf_event_task(child, NULL, 0);
4609                 return;
4610         }
4611
4612         local_irq_save(flags);
4613         /*
4614          * We can't reschedule here because interrupts are disabled,
4615          * and either child is current or it is a task that can't be
4616          * scheduled, so we are now safe from rescheduling changing
4617          * our context.
4618          */
4619         child_ctx = child->perf_event_ctxp;
4620         __perf_event_task_sched_out(child_ctx);
4621
4622         /*
4623          * Take the context lock here so that if find_get_context is
4624          * reading child->perf_event_ctxp, we wait until it has
4625          * incremented the context's refcount before we do put_ctx below.
4626          */
4627         spin_lock(&child_ctx->lock);
4628         child->perf_event_ctxp = NULL;
4629         /*
4630          * If this context is a clone; unclone it so it can't get
4631          * swapped to another process while we're removing all
4632          * the events from it.
4633          */
4634         unclone_ctx(child_ctx);
4635         spin_unlock_irqrestore(&child_ctx->lock, flags);
4636
4637         /*
4638          * Report the task dead after unscheduling the events so that we
4639          * won't get any samples after PERF_RECORD_EXIT. We can however still
4640          * get a few PERF_RECORD_READ events.
4641          */
4642         perf_event_task(child, child_ctx, 0);
4643
4644         /*
4645          * We can recurse on the same lock type through:
4646          *
4647          *   __perf_event_exit_task()
4648          *     sync_child_event()
4649          *       fput(parent_event->filp)
4650          *         perf_release()
4651          *           mutex_lock(&ctx->mutex)
4652          *
4653          * But since its the parent context it won't be the same instance.
4654          */
4655         mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4656
4657 again:
4658         list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
4659                                  group_entry)
4660                 __perf_event_exit_task(child_event, child_ctx, child);
4661
4662         /*
4663          * If the last event was a group event, it will have appended all
4664          * its siblings to the list, but we obtained 'tmp' before that which
4665          * will still point to the list head terminating the iteration.
4666          */
4667         if (!list_empty(&child_ctx->group_list))
4668                 goto again;
4669
4670         mutex_unlock(&child_ctx->mutex);
4671
4672         put_ctx(child_ctx);
4673 }
4674
4675 /*
4676  * free an unexposed, unused context as created by inheritance by
4677  * init_task below, used by fork() in case of fail.
4678  */
4679 void perf_event_free_task(struct task_struct *task)
4680 {
4681         struct perf_event_context *ctx = task->perf_event_ctxp;
4682         struct perf_event *event, *tmp;
4683
4684         if (!ctx)
4685                 return;
4686
4687         mutex_lock(&ctx->mutex);
4688 again:
4689         list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4690                 struct perf_event *parent = event->parent;
4691
4692                 if (WARN_ON_ONCE(!parent))
4693                         continue;
4694
4695                 mutex_lock(&parent->child_mutex);
4696                 list_del_init(&event->child_list);
4697                 mutex_unlock(&parent->child_mutex);
4698
4699                 fput(parent->filp);
4700
4701                 list_del_event(event, ctx);
4702                 free_event(event);
4703         }
4704
4705         if (!list_empty(&ctx->group_list))
4706                 goto again;
4707
4708         mutex_unlock(&ctx->mutex);
4709
4710         put_ctx(ctx);
4711 }
4712
4713 /*
4714  * Initialize the perf_event context in task_struct
4715  */
4716 int perf_event_init_task(struct task_struct *child)
4717 {
4718         struct perf_event_context *child_ctx, *parent_ctx;
4719         struct perf_event_context *cloned_ctx;
4720         struct perf_event *event;
4721         struct task_struct *parent = current;
4722         int inherited_all = 1;
4723         int ret = 0;
4724
4725         child->perf_event_ctxp = NULL;
4726
4727         mutex_init(&child->perf_event_mutex);
4728         INIT_LIST_HEAD(&child->perf_event_list);
4729
4730         if (likely(!parent->perf_event_ctxp))
4731                 return 0;
4732
4733         /*
4734          * This is executed from the parent task context, so inherit
4735          * events that have been marked for cloning.
4736          * First allocate and initialize a context for the child.
4737          */
4738
4739         child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4740         if (!child_ctx)
4741                 return -ENOMEM;
4742
4743         __perf_event_init_context(child_ctx, child);
4744         child->perf_event_ctxp = child_ctx;
4745         get_task_struct(child);
4746
4747         /*
4748          * If the parent's context is a clone, pin it so it won't get
4749          * swapped under us.
4750          */
4751         parent_ctx = perf_pin_task_context(parent);
4752
4753         /*
4754          * No need to check if parent_ctx != NULL here; since we saw
4755          * it non-NULL earlier, the only reason for it to become NULL
4756          * is if we exit, and since we're currently in the middle of
4757          * a fork we can't be exiting at the same time.
4758          */
4759
4760         /*
4761          * Lock the parent list. No need to lock the child - not PID
4762          * hashed yet and not running, so nobody can access it.
4763          */
4764         mutex_lock(&parent_ctx->mutex);
4765
4766         /*
4767          * We dont have to disable NMIs - we are only looking at
4768          * the list, not manipulating it:
4769          */
4770         list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
4771
4772                 if (!event->attr.inherit) {
4773                         inherited_all = 0;
4774                         continue;
4775                 }
4776
4777                 ret = inherit_group(event, parent, parent_ctx,
4778                                              child, child_ctx);
4779                 if (ret) {
4780                         inherited_all = 0;
4781                         break;
4782                 }
4783         }
4784
4785         if (inherited_all) {
4786                 /*
4787                  * Mark the child context as a clone of the parent
4788                  * context, or of whatever the parent is a clone of.
4789                  * Note that if the parent is a clone, it could get
4790                  * uncloned at any point, but that doesn't matter
4791                  * because the list of events and the generation
4792                  * count can't have changed since we took the mutex.
4793                  */
4794                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4795                 if (cloned_ctx) {
4796                         child_ctx->parent_ctx = cloned_ctx;
4797                         child_ctx->parent_gen = parent_ctx->parent_gen;
4798                 } else {
4799                         child_ctx->parent_ctx = parent_ctx;
4800                         child_ctx->parent_gen = parent_ctx->generation;
4801                 }
4802                 get_ctx(child_ctx->parent_ctx);
4803         }
4804
4805         mutex_unlock(&parent_ctx->mutex);
4806
4807         perf_unpin_context(parent_ctx);
4808
4809         return ret;
4810 }
4811
4812 static void __cpuinit perf_event_init_cpu(int cpu)
4813 {
4814         struct perf_cpu_context *cpuctx;
4815
4816         cpuctx = &per_cpu(perf_cpu_context, cpu);
4817         __perf_event_init_context(&cpuctx->ctx, NULL);
4818
4819         spin_lock(&perf_resource_lock);
4820         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
4821         spin_unlock(&perf_resource_lock);
4822
4823         hw_perf_event_setup(cpu);
4824 }
4825
4826 #ifdef CONFIG_HOTPLUG_CPU
4827 static void __perf_event_exit_cpu(void *info)
4828 {
4829         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4830         struct perf_event_context *ctx = &cpuctx->ctx;
4831         struct perf_event *event, *tmp;
4832
4833         list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
4834                 __perf_event_remove_from_context(event);
4835 }
4836 static void perf_event_exit_cpu(int cpu)
4837 {
4838         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4839         struct perf_event_context *ctx = &cpuctx->ctx;
4840
4841         mutex_lock(&ctx->mutex);
4842         smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
4843         mutex_unlock(&ctx->mutex);
4844 }
4845 #else
/* Without CONFIG_HOTPLUG_CPU there is nothing to tear down. */
static inline void perf_event_exit_cpu(int cpu)
{
}
4847 #endif
4848
4849 static int __cpuinit
4850 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4851 {
4852         unsigned int cpu = (long)hcpu;
4853
4854         switch (action) {
4855
4856         case CPU_UP_PREPARE:
4857         case CPU_UP_PREPARE_FROZEN:
4858                 perf_event_init_cpu(cpu);
4859                 break;
4860
4861         case CPU_ONLINE:
4862         case CPU_ONLINE_FROZEN:
4863                 hw_perf_event_setup_online(cpu);
4864                 break;
4865
4866         case CPU_DOWN_PREPARE:
4867         case CPU_DOWN_PREPARE_FROZEN:
4868                 perf_event_exit_cpu(cpu);
4869                 break;
4870
4871         default:
4872                 break;
4873         }
4874
4875         return NOTIFY_OK;
4876 }
4877
4878 /*
4879  * This has to have a higher priority than migration_notifier in sched.c.
4880  */
4881 static struct notifier_block __cpuinitdata perf_cpu_nb = {
4882         .notifier_call          = perf_cpu_notify,
4883         .priority               = 20,
4884 };
4885
4886 void __init perf_event_init(void)
4887 {
4888         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4889                         (void *)(long)smp_processor_id());
4890         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4891                         (void *)(long)smp_processor_id());
4892         register_cpu_notifier(&perf_cpu_nb);
4893 }
4894
4895 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4896 {
4897         return sprintf(buf, "%d\n", perf_reserved_percpu);
4898 }
4899
4900 static ssize_t
4901 perf_set_reserve_percpu(struct sysdev_class *class,
4902                         const char *buf,
4903                         size_t count)
4904 {
4905         struct perf_cpu_context *cpuctx;
4906         unsigned long val;
4907         int err, cpu, mpt;
4908
4909         err = strict_strtoul(buf, 10, &val);
4910         if (err)
4911                 return err;
4912         if (val > perf_max_events)
4913                 return -EINVAL;
4914
4915         spin_lock(&perf_resource_lock);
4916         perf_reserved_percpu = val;
4917         for_each_online_cpu(cpu) {
4918                 cpuctx = &per_cpu(perf_cpu_context, cpu);
4919                 spin_lock_irq(&cpuctx->ctx.lock);
4920                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
4921                           perf_max_events - perf_reserved_percpu);
4922                 cpuctx->max_pertask = mpt;
4923                 spin_unlock_irq(&cpuctx->ctx.lock);
4924         }
4925         spin_unlock(&perf_resource_lock);
4926
4927         return count;
4928 }
4929
4930 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4931 {
4932         return sprintf(buf, "%d\n", perf_overcommit);
4933 }
4934
4935 static ssize_t
4936 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4937 {
4938         unsigned long val;
4939         int err;
4940
4941         err = strict_strtoul(buf, 10, &val);
4942         if (err)
4943                 return err;
4944         if (val > 1)
4945                 return -EINVAL;
4946
4947         spin_lock(&perf_resource_lock);
4948         perf_overcommit = val;
4949         spin_unlock(&perf_resource_lock);
4950
4951         return count;
4952 }
4953
4954 static SYSDEV_CLASS_ATTR(
4955                                 reserve_percpu,
4956                                 0644,
4957                                 perf_show_reserve_percpu,
4958                                 perf_set_reserve_percpu
4959                         );
4960
4961 static SYSDEV_CLASS_ATTR(
4962                                 overcommit,
4963                                 0644,
4964                                 perf_show_overcommit,
4965                                 perf_set_overcommit
4966                         );
4967
4968 static struct attribute *perfclass_attrs[] = {
4969         &attr_reserve_percpu.attr,
4970         &attr_overcommit.attr,
4971         NULL
4972 };
4973
4974 static struct attribute_group perfclass_attr_group = {
4975         .attrs                  = perfclass_attrs,
4976         .name                   = "perf_events",
4977 };
4978
4979 static int __init perf_event_sysfs_init(void)
4980 {
4981         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4982                                   &perfclass_attr_group);
4983 }
4984 device_initcall(perf_event_sysfs_init);