kernel/perf_event.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/slab.h>
  19 #include <linux/hash.h>
  20 #include <linux/sysfs.h>
  21 #include <linux/dcache.h>
  22 #include <linux/percpu.h>
  23 #include <linux/ptrace.h>
  24 #include <linux/vmstat.h>
  25 #include <linux/vmalloc.h>
  26 #include <linux/hardirq.h>
  27 #include <linux/rculist.h>
  28 #include <linux/uaccess.h>
  29 #include <linux/syscalls.h>
  30 #include <linux/anon_inodes.h>
  31 #include <linux/kernel_stat.h>
  32 #include <linux/perf_event.h>
  33 #include <linux/ftrace_event.h>
  34 #include <linux/hw_breakpoint.h>
  35
  36 #include <asm/irq_regs.h>
  37
/*
 * Each CPU has a list of per CPU events:
 * one struct perf_cpu_context instance per CPU, reached through
 * __get_cpu_var(perf_cpu_context) by the cross-CPU callbacks below.
 */
static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

/*
 * perf_max_events and perf_reserved_percpu together bound the per-CPU
 * "max_pertask" budget recomputed in __perf_event_remove_from_context().
 * NOTE(review): perf_overcommit has no user in this part of the file.
 */
int perf_max_events __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

/*
 * Global atomic counters.  Their users are outside this part of the file;
 * presumably they count live events overall and per record type
 * (mmap/comm/task) -- confirm against the code that updates them.
 */
static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

/*
 * Minimum for 128 pages + 1 for the user control page
 * (129 pages; 516 kB assuming 4 kB pages).
 */
int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;

/* Global 64-bit counter; presumably the source of event->id values. */
static atomic64_t perf_event_id;

/*
 * Lock for (sysadmin-configurable) event reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);
  75
  76 /*
  77  * Architecture provided APIs - weak aliases:
  78  */
  79 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
  80 {
  81         return NULL;
  82 }
  83
  84 void __weak hw_perf_disable(void)               { barrier(); }
  85 void __weak hw_perf_enable(void)                { barrier(); }
  86
  87 void __weak perf_event_print_debug(void)        { }
  88
  89 static DEFINE_PER_CPU(int, perf_disable_count);
  90
  91 void perf_disable(void)
  92 {
  93         if (!__get_cpu_var(perf_disable_count)++)
  94                 hw_perf_disable();
  95 }
  96
  97 void perf_enable(void)
  98 {
  99         if (!--__get_cpu_var(perf_disable_count))
 100                 hw_perf_enable();
 101 }
 102
 103 static void get_ctx(struct perf_event_context *ctx)
 104 {
 105         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 106 }
 107
 108 static void free_ctx(struct rcu_head *head)
 109 {
 110         struct perf_event_context *ctx;
 111
 112         ctx = container_of(head, struct perf_event_context, rcu_head);
 113         kfree(ctx);
 114 }
 115
 116 static void put_ctx(struct perf_event_context *ctx)
 117 {
 118         if (atomic_dec_and_test(&ctx->refcount)) {
 119                 if (ctx->parent_ctx)
 120                         put_ctx(ctx->parent_ctx);
 121                 if (ctx->task)
 122                         put_task_struct(ctx->task);
 123                 call_rcu(&ctx->rcu_head, free_ctx);
 124         }
 125 }
 126
 127 static void unclone_ctx(struct perf_event_context *ctx)
 128 {
 129         if (ctx->parent_ctx) {
 130                 put_ctx(ctx->parent_ctx);
 131                 ctx->parent_ctx = NULL;
 132         }
 133 }
 134
 135 /*
 136  * If we inherit events we want to return the parent event id
 137  * to userspace.
 138  */
 139 static u64 primary_event_id(struct perf_event *event)
 140 {
 141         u64 id = event->id;
 142
 143         if (event->parent)
 144                 id = event->parent->id;
 145
 146         return id;
 147 }
 148
 149 /*
 150  * Get the perf_event_context for a task and lock it.
 151  * This has to cope with with the fact that until it is locked,
 152  * the context could get moved to another task.
 153  */
 154 static struct perf_event_context *
 155 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 156 {
 157         struct perf_event_context *ctx;
 158
 159         rcu_read_lock();
 160  retry:
 161         ctx = rcu_dereference(task->perf_event_ctxp);
 162         if (ctx) {
 163                 /*
 164                  * If this context is a clone of another, it might
 165                  * get swapped for another underneath us by
 166                  * perf_event_task_sched_out, though the
 167                  * rcu_read_lock() protects us from any context
 168                  * getting freed.  Lock the context and check if it
 169                  * got swapped before we could get the lock, and retry
 170                  * if so.  If we locked the right context, then it
 171                  * can't get swapped on us any more.
 172                  */
 173                 raw_spin_lock_irqsave(&ctx->lock, *flags);
 174                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
 175                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 176                         goto retry;
 177                 }
 178
 179                 if (!atomic_inc_not_zero(&ctx->refcount)) {
 180                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 181                         ctx = NULL;
 182                 }
 183         }
 184         rcu_read_unlock();
 185         return ctx;
 186 }
 187
 188 /*
 189  * Get the context for a task and increment its pin_count so it
 190  * can't get swapped to another task.  This also increments its
 191  * reference count so that the context can't get freed.
 192  */
 193 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
 194 {
 195         struct perf_event_context *ctx;
 196         unsigned long flags;
 197
 198         ctx = perf_lock_task_context(task, &flags);
 199         if (ctx) {
 200                 ++ctx->pin_count;
 201                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
 202         }
 203         return ctx;
 204 }
 205
 206 static void perf_unpin_context(struct perf_event_context *ctx)
 207 {
 208         unsigned long flags;
 209
 210         raw_spin_lock_irqsave(&ctx->lock, flags);
 211         --ctx->pin_count;
 212         raw_spin_unlock_irqrestore(&ctx->lock, flags);
 213         put_ctx(ctx);
 214 }
 215
 216 static inline u64 perf_clock(void)
 217 {
 218         return cpu_clock(raw_smp_processor_id());
 219 }
 220
 221 /*
 222  * Update the record of the current time in a context.
 223  */
 224 static void update_context_time(struct perf_event_context *ctx)
 225 {
 226         u64 now = perf_clock();
 227
 228         ctx->time += now - ctx->timestamp;
 229         ctx->timestamp = now;
 230 }
 231
 232 /*
 233  * Update the total_time_enabled and total_time_running fields for a event.
 234  */
 235 static void update_event_times(struct perf_event *event)
 236 {
 237         struct perf_event_context *ctx = event->ctx;
 238         u64 run_end;
 239
 240         if (event->state < PERF_EVENT_STATE_INACTIVE ||
 241             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 242                 return;
 243
 244         if (ctx->is_active)
 245                 run_end = ctx->time;
 246         else
 247                 run_end = event->tstamp_stopped;
 248
 249         event->total_time_enabled = run_end - event->tstamp_enabled;
 250
 251         if (event->state == PERF_EVENT_STATE_INACTIVE)
 252                 run_end = event->tstamp_stopped;
 253         else
 254                 run_end = ctx->time;
 255
 256         event->total_time_running = run_end - event->tstamp_running;
 257 }
 258
 259 /*
 260  * Update total_time_enabled and total_time_running for all events in a group.
 261  */
 262 static void update_group_times(struct perf_event *leader)
 263 {
 264         struct perf_event *event;
 265
 266         update_event_times(leader);
 267         list_for_each_entry(event, &leader->sibling_list, group_entry)
 268                 update_event_times(event);
 269 }
 270
 271 static struct list_head *
 272 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 273 {
 274         if (event->attr.pinned)
 275                 return &ctx->pinned_groups;
 276         else
 277                 return &ctx->flexible_groups;
 278 }
 279
 280 /*
 281  * Add a event from the lists for its context.
 282  * Must be called with ctx->mutex and ctx->lock held.
 283  */
 284 static void
 285 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 286 {
 287         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
 288         event->attach_state |= PERF_ATTACH_CONTEXT;
 289
 290         /*
 291          * If we're a stand alone event or group leader, we go to the context
 292          * list, group events are kept attached to the group so that
 293          * perf_group_detach can, at all times, locate all siblings.
 294          */
 295         if (event->group_leader == event) {
 296                 struct list_head *list;
 297
 298                 if (is_software_event(event))
 299                         event->group_flags |= PERF_GROUP_SOFTWARE;
 300
 301                 list = ctx_group_list(event, ctx);
 302                 list_add_tail(&event->group_entry, list);
 303         }
 304
 305         list_add_rcu(&event->event_entry, &ctx->event_list);
 306         ctx->nr_events++;
 307         if (event->attr.inherit_stat)
 308                 ctx->nr_stat++;
 309 }
 310
 311 static void perf_group_attach(struct perf_event *event)
 312 {
 313         struct perf_event *group_leader = event->group_leader;
 314
 315         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
 316         event->attach_state |= PERF_ATTACH_GROUP;
 317
 318         if (group_leader == event)
 319                 return;
 320
 321         if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
 322                         !is_software_event(event))
 323                 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
 324
 325         list_add_tail(&event->group_entry, &group_leader->sibling_list);
 326         group_leader->nr_siblings++;
 327 }
 328
 329 /*
 330  * Remove a event from the lists for its context.
 331  * Must be called with ctx->mutex and ctx->lock held.
 332  */
 333 static void
 334 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 335 {
 336         /*
 337          * We can have double detach due to exit/hot-unplug + close.
 338          */
 339         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
 340                 return;
 341
 342         event->attach_state &= ~PERF_ATTACH_CONTEXT;
 343
 344         ctx->nr_events--;
 345         if (event->attr.inherit_stat)
 346                 ctx->nr_stat--;
 347
 348         list_del_rcu(&event->event_entry);
 349
 350         if (event->group_leader == event)
 351                 list_del_init(&event->group_entry);
 352
 353         update_group_times(event);
 354
 355         /*
 356          * If event was in error state, then keep it
 357          * that way, otherwise bogus counts will be
 358          * returned on read(). The only way to get out
 359          * of error state is by explicit re-enabling
 360          * of the event
 361          */
 362         if (event->state > PERF_EVENT_STATE_OFF)
 363                 event->state = PERF_EVENT_STATE_OFF;
 364 }
 365
 366 static void perf_group_detach(struct perf_event *event)
 367 {
 368         struct perf_event *sibling, *tmp;
 369         struct list_head *list = NULL;
 370
 371         /*
 372          * We can have double detach due to exit/hot-unplug + close.
 373          */
 374         if (!(event->attach_state & PERF_ATTACH_GROUP))
 375                 return;
 376
 377         event->attach_state &= ~PERF_ATTACH_GROUP;
 378
 379         /*
 380          * If this is a sibling, remove it from its group.
 381          */
 382         if (event->group_leader != event) {
 383                 list_del_init(&event->group_entry);
 384                 event->group_leader->nr_siblings--;
 385                 return;
 386         }
 387
 388         if (!list_empty(&event->group_entry))
 389                 list = &event->group_entry;
 390
 391         /*
 392          * If this was a group event with sibling events then
 393          * upgrade the siblings to singleton events by adding them
 394          * to whatever list we are on.
 395          */
 396         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 397                 if (list)
 398                         list_move_tail(&sibling->group_entry, list);
 399                 sibling->group_leader = sibling;
 400
 401                 /* Inherit group flags from the previous leader */
 402                 sibling->group_flags = event->group_flags;
 403         }
 404 }
 405
 406 static void
 407 event_sched_out(struct perf_event *event,
 408                   struct perf_cpu_context *cpuctx,
 409                   struct perf_event_context *ctx)
 410 {
 411         if (event->state != PERF_EVENT_STATE_ACTIVE)
 412                 return;
 413
 414         event->state = PERF_EVENT_STATE_INACTIVE;
 415         if (event->pending_disable) {
 416                 event->pending_disable = 0;
 417                 event->state = PERF_EVENT_STATE_OFF;
 418         }
 419         event->tstamp_stopped = ctx->time;
 420         event->pmu->disable(event);
 421         event->oncpu = -1;
 422
 423         if (!is_software_event(event))
 424                 cpuctx->active_oncpu--;
 425         ctx->nr_active--;
 426         if (event->attr.exclusive || !cpuctx->active_oncpu)
 427                 cpuctx->exclusive = 0;
 428 }
 429
 430 static void
 431 group_sched_out(struct perf_event *group_event,
 432                 struct perf_cpu_context *cpuctx,
 433                 struct perf_event_context *ctx)
 434 {
 435         struct perf_event *event;
 436
 437         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
 438                 return;
 439
 440         event_sched_out(group_event, cpuctx, ctx);
 441
 442         /*
 443          * Schedule out siblings (if any):
 444          */
 445         list_for_each_entry(event, &group_event->sibling_list, group_entry)
 446                 event_sched_out(event, cpuctx, ctx);
 447
 448         if (group_event->attr.exclusive)
 449                 cpuctx->exclusive = 0;
 450 }
 451
 452 /*
 453  * Cross CPU call to remove a performance event
 454  *
 455  * We disable the event on the hardware level first. After that we
 456  * remove it from the context list.
 457  */
 458 static void __perf_event_remove_from_context(void *info)
 459 {
 460         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 461         struct perf_event *event = info;
 462         struct perf_event_context *ctx = event->ctx;
 463
 464         /*
 465          * If this is a task context, we need to check whether it is
 466          * the current task context of this cpu. If not it has been
 467          * scheduled out before the smp call arrived.
 468          */
 469         if (ctx->task && cpuctx->task_ctx != ctx)
 470                 return;
 471
 472         raw_spin_lock(&ctx->lock);
 473         /*
 474          * Protect the list operation against NMI by disabling the
 475          * events on a global level.
 476          */
 477         perf_disable();
 478
 479         event_sched_out(event, cpuctx, ctx);
 480
 481         list_del_event(event, ctx);
 482
 483         if (!ctx->task) {
 484                 /*
 485                  * Allow more per task events with respect to the
 486                  * reservation:
 487                  */
 488                 cpuctx->max_pertask =
 489                         min(perf_max_events - ctx->nr_events,
 490                             perf_max_events - perf_reserved_percpu);
 491         }
 492
 493         perf_enable();
 494         raw_spin_unlock(&ctx->lock);
 495 }
 496
 497
 498 /*
 499  * Remove the event from a task's (or a CPU's) list of events.
 500  *
 501  * Must be called with ctx->mutex held.
 502  *
 503  * CPU events are removed with a smp call. For task events we only
 504  * call when the task is on a CPU.
 505  *
 506  * If event->ctx is a cloned context, callers must make sure that
 507  * every task struct that event->ctx->task could possibly point to
 508  * remains valid.  This is OK when called from perf_release since
 509  * that only calls us on the top-level context, which can't be a clone.
 510  * When called from perf_event_exit_task, it's OK because the
 511  * context has been detached from its task.
 512  */
 513 static void perf_event_remove_from_context(struct perf_event *event)
 514 {
 515         struct perf_event_context *ctx = event->ctx;
 516         struct task_struct *task = ctx->task;
 517
 518         if (!task) {
 519                 /*
 520                  * Per cpu events are removed via an smp call and
 521                  * the removal is always successful.
 522                  */
 523                 smp_call_function_single(event->cpu,
 524                                          __perf_event_remove_from_context,
 525                                          event, 1);
 526                 return;
 527         }
 528
 529 retry:
 530         task_oncpu_function_call(task, __perf_event_remove_from_context,
 531                                  event);
 532
 533         raw_spin_lock_irq(&ctx->lock);
 534         /*
 535          * If the context is active we need to retry the smp call.
 536          */
 537         if (ctx->nr_active && !list_empty(&event->group_entry)) {
 538                 raw_spin_unlock_irq(&ctx->lock);
 539                 goto retry;
 540         }
 541
 542         /*
 543          * The lock prevents that this context is scheduled in so we
 544          * can remove the event safely, if the call above did not
 545          * succeed.
 546          */
 547         if (!list_empty(&event->group_entry))
 548                 list_del_event(event, ctx);
 549         raw_spin_unlock_irq(&ctx->lock);
 550 }
 551
 552 /*
 553  * Cross CPU call to disable a performance event
 554  */
 555 static void __perf_event_disable(void *info)
 556 {
 557         struct perf_event *event = info;
 558         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 559         struct perf_event_context *ctx = event->ctx;
 560
 561         /*
 562          * If this is a per-task event, need to check whether this
 563          * event's task is the current task on this cpu.
 564          */
 565         if (ctx->task && cpuctx->task_ctx != ctx)
 566                 return;
 567
 568         raw_spin_lock(&ctx->lock);
 569
 570         /*
 571          * If the event is on, turn it off.
 572          * If it is in error state, leave it in error state.
 573          */
 574         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 575                 update_context_time(ctx);
 576                 update_group_times(event);
 577                 if (event == event->group_leader)
 578                         group_sched_out(event, cpuctx, ctx);
 579                 else
 580                         event_sched_out(event, cpuctx, ctx);
 581                 event->state = PERF_EVENT_STATE_OFF;
 582         }
 583
 584         raw_spin_unlock(&ctx->lock);
 585 }
 586
 587 /*
 588  * Disable a event.
 589  *
 590  * If event->ctx is a cloned context, callers must make sure that
 591  * every task struct that event->ctx->task could possibly point to
 592  * remains valid.  This condition is satisifed when called through
 593  * perf_event_for_each_child or perf_event_for_each because they
 594  * hold the top-level event's child_mutex, so any descendant that
 595  * goes to exit will block in sync_child_event.
 596  * When called from perf_pending_event it's OK because event->ctx
 597  * is the current context on this CPU and preemption is disabled,
 598  * hence we can't get into perf_event_task_sched_out for this context.
 599  */
 600 void perf_event_disable(struct perf_event *event)
 601 {
 602         struct perf_event_context *ctx = event->ctx;
 603         struct task_struct *task = ctx->task;
 604
 605         if (!task) {
 606                 /*
 607                  * Disable the event on the cpu that it's on
 608                  */
 609                 smp_call_function_single(event->cpu, __perf_event_disable,
 610                                          event, 1);
 611                 return;
 612         }
 613
 614  retry:
 615         task_oncpu_function_call(task, __perf_event_disable, event);
 616
 617         raw_spin_lock_irq(&ctx->lock);
 618         /*
 619          * If the event is still active, we need to retry the cross-call.
 620          */
 621         if (event->state == PERF_EVENT_STATE_ACTIVE) {
 622                 raw_spin_unlock_irq(&ctx->lock);
 623                 goto retry;
 624         }
 625
 626         /*
 627          * Since we have the lock this context can't be scheduled
 628          * in, so we can change the state safely.
 629          */
 630         if (event->state == PERF_EVENT_STATE_INACTIVE) {
 631                 update_group_times(event);
 632                 event->state = PERF_EVENT_STATE_OFF;
 633         }
 634
 635         raw_spin_unlock_irq(&ctx->lock);
 636 }
 637
 638 static int
 639 event_sched_in(struct perf_event *event,
 640                  struct perf_cpu_context *cpuctx,
 641                  struct perf_event_context *ctx)
 642 {
 643         if (event->state <= PERF_EVENT_STATE_OFF)
 644                 return 0;
 645
 646         event->state = PERF_EVENT_STATE_ACTIVE;
 647         event->oncpu = smp_processor_id();
 648         /*
 649          * The new state must be visible before we turn it on in the hardware:
 650          */
 651         smp_wmb();
 652
 653         if (event->pmu->enable(event)) {
 654                 event->state = PERF_EVENT_STATE_INACTIVE;
 655                 event->oncpu = -1;
 656                 return -EAGAIN;
 657         }
 658
 659         event->tstamp_running += ctx->time - event->tstamp_stopped;
 660
 661         if (!is_software_event(event))
 662                 cpuctx->active_oncpu++;
 663         ctx->nr_active++;
 664
 665         if (event->attr.exclusive)
 666                 cpuctx->exclusive = 1;
 667
 668         return 0;
 669 }
 670
 671 static int
 672 group_sched_in(struct perf_event *group_event,
 673                struct perf_cpu_context *cpuctx,
 674                struct perf_event_context *ctx)
 675 {
 676         struct perf_event *event, *partial_group = NULL;
 677         const struct pmu *pmu = group_event->pmu;
 678         bool txn = false;
 679         int ret;
 680
 681         if (group_event->state == PERF_EVENT_STATE_OFF)
 682                 return 0;
 683
 684         /* Check if group transaction availabe */
 685         if (pmu->start_txn)
 686                 txn = true;
 687
 688         if (txn)
 689                 pmu->start_txn(pmu);
 690
 691         if (event_sched_in(group_event, cpuctx, ctx)) {
 692                 if (txn)
 693                         pmu->cancel_txn(pmu);
 694                 return -EAGAIN;
 695         }
 696
 697         /*
 698          * Schedule in siblings as one group (if any):
 699          */
 700         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 701                 if (event_sched_in(event, cpuctx, ctx)) {
 702                         partial_group = event;
 703                         goto group_error;
 704                 }
 705         }
 706
 707         if (!txn)
 708                 return 0;
 709
 710         ret = pmu->commit_txn(pmu);
 711         if (!ret) {
 712                 pmu->cancel_txn(pmu);
 713                 return 0;
 714         }
 715
 716 group_error:
 717         /*
 718          * Groups can be scheduled in as one unit only, so undo any
 719          * partial group before returning:
 720          */
 721         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 722                 if (event == partial_group)
 723                         break;
 724                 event_sched_out(event, cpuctx, ctx);
 725         }
 726         event_sched_out(group_event, cpuctx, ctx);
 727
 728         if (txn)
 729                 pmu->cancel_txn(pmu);
 730
 731         return -EAGAIN;
 732 }
 733
 734 /*
 735  * Work out whether we can put this event group on the CPU now.
 736  */
 737 static int group_can_go_on(struct perf_event *event,
 738                            struct perf_cpu_context *cpuctx,
 739                            int can_add_hw)
 740 {
 741         /*
 742          * Groups consisting entirely of software events can always go on.
 743          */
 744         if (event->group_flags & PERF_GROUP_SOFTWARE)
 745                 return 1;
 746         /*
 747          * If an exclusive group is already on, no other hardware
 748          * events can go on.
 749          */
 750         if (cpuctx->exclusive)
 751                 return 0;
 752         /*
 753          * If this group is exclusive and there are already
 754          * events on the CPU, it can't go on.
 755          */
 756         if (event->attr.exclusive && cpuctx->active_oncpu)
 757                 return 0;
 758         /*
 759          * Otherwise, try to add it if all previous groups were able
 760          * to go on.
 761          */
 762         return can_add_hw;
 763 }
 764
 765 static void add_event_to_ctx(struct perf_event *event,
 766                                struct perf_event_context *ctx)
 767 {
 768         list_add_event(event, ctx);
 769         perf_group_attach(event);
 770         event->tstamp_enabled = ctx->time;
 771         event->tstamp_running = ctx->time;
 772         event->tstamp_stopped = ctx->time;
 773 }
 774
 775 /*
 776  * Cross CPU call to install and enable a performance event
 777  *
 778  * Must be called with ctx->mutex held
 779  */
 780 static void __perf_install_in_context(void *info)
 781 {
 782         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 783         struct perf_event *event = info;
 784         struct perf_event_context *ctx = event->ctx;
 785         struct perf_event *leader = event->group_leader;
 786         int err;
 787
 788         /*
 789          * If this is a task context, we need to check whether it is
 790          * the current task context of this cpu. If not it has been
 791          * scheduled out before the smp call arrived.
 792          * Or possibly this is the right context but it isn't
 793          * on this cpu because it had no events.
 794          */
 795         if (ctx->task && cpuctx->task_ctx != ctx) {
 796                 if (cpuctx->task_ctx || ctx->task != current)
 797                         return;
 798                 cpuctx->task_ctx = ctx;
 799         }
 800
 801         raw_spin_lock(&ctx->lock);
 802         ctx->is_active = 1;
 803         update_context_time(ctx);
 804
 805         /*
 806          * Protect the list operation against NMI by disabling the
 807          * events on a global level. NOP for non NMI based events.
 808          */
 809         perf_disable();
 810
 811         add_event_to_ctx(event, ctx);
 812
 813         if (event->cpu != -1 && event->cpu != smp_processor_id())
 814                 goto unlock;
 815
 816         /*
 817          * Don't put the event on if it is disabled or if
 818          * it is in a group and the group isn't on.
 819          */
 820         if (event->state != PERF_EVENT_STATE_INACTIVE ||
 821             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
 822                 goto unlock;
 823
 824         /*
 825          * An exclusive event can't go on if there are already active
 826          * hardware events, and no hardware event can go on if there
 827          * is already an exclusive event on.
 828          */
 829         if (!group_can_go_on(event, cpuctx, 1))
 830                 err = -EEXIST;
 831         else
 832                 err = event_sched_in(event, cpuctx, ctx);
 833
 834         if (err) {
 835                 /*
 836                  * This event couldn't go on.  If it is in a group
 837                  * then we have to pull the whole group off.
 838                  * If the event group is pinned then put it in error state.
 839                  */
 840                 if (leader != event)
 841                         group_sched_out(leader, cpuctx, ctx);
 842                 if (leader->attr.pinned) {
 843                         update_group_times(leader);
 844                         leader->state = PERF_EVENT_STATE_ERROR;
 845                 }
 846         }
 847
 848         if (!err && !ctx->task && cpuctx->max_pertask)
 849                 cpuctx->max_pertask--;
 850
 851  unlock:
 852         perf_enable();
 853
 854         raw_spin_unlock(&ctx->lock);
 855 }
 856
 857 /*
 858  * Attach a performance event to a context
 859  *
 860  * First we add the event to the list with the hardware enable bit
 861  * in event->hw_config cleared.
 862  *
 863  * If the event is attached to a task which is on a CPU we use a smp
 864  * call to enable it in the task context. The task might have been
 865  * scheduled away, but we check this in the smp call again.
 866  *
 867  * Must be called with ctx->mutex held.
 868  */
 869 static void
 870 perf_install_in_context(struct perf_event_context *ctx,
 871                         struct perf_event *event,
 872                         int cpu)
 873 {
 874         struct task_struct *task = ctx->task;
 875
 876         if (!task) {
 877                 /*
 878                  * Per cpu events are installed via an smp call and
 879                  * the install is always successful.
 880                  */
 881                 smp_call_function_single(cpu, __perf_install_in_context,
 882                                          event, 1);
 883                 return;
 884         }
 885
 886 retry:
 887         task_oncpu_function_call(task, __perf_install_in_context,
 888                                  event);
 889
 890         raw_spin_lock_irq(&ctx->lock);
 891         /*
 892          * we need to retry the smp call.
 893          */
 894         if (ctx->is_active && list_empty(&event->group_entry)) {
 895                 raw_spin_unlock_irq(&ctx->lock);
 896                 goto retry;
 897         }
 898
 899         /*
 900          * The lock prevents that this context is scheduled in so we
 901          * can add the event safely, if it the call above did not
 902          * succeed.
 903          */
 904         if (list_empty(&event->group_entry))
 905                 add_event_to_ctx(event, ctx);
 906         raw_spin_unlock_irq(&ctx->lock);
 907 }
 908
 909 /*
 910  * Put a event into inactive state and update time fields.
 911  * Enabling the leader of a group effectively enables all
 912  * the group members that aren't explicitly disabled, so we
 913  * have to update their ->tstamp_enabled also.
 914  * Note: this works for group members as well as group leaders
 915  * since the non-leader members' sibling_lists will be empty.
 916  */
 917 static void __perf_event_mark_enabled(struct perf_event *event,
 918                                         struct perf_event_context *ctx)
 919 {
 920         struct perf_event *sub;
 921
 922         event->state = PERF_EVENT_STATE_INACTIVE;
 923         event->tstamp_enabled = ctx->time - event->total_time_enabled;
 924         list_for_each_entry(sub, &event->sibling_list, group_entry)
 925                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 926                         sub->tstamp_enabled =
 927                                 ctx->time - sub->total_time_enabled;
 928 }
 929
 930 /*
 931  * Cross CPU call to enable a performance event
 932  */
 933 static void __perf_event_enable(void *info)
 934 {
 935         struct perf_event *event = info;
 936         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 937         struct perf_event_context *ctx = event->ctx;
 938         struct perf_event *leader = event->group_leader;
 939         int err;
 940
 941         /*
 942          * If this is a per-task event, need to check whether this
 943          * event's task is the current task on this cpu.
 944          */
 945         if (ctx->task && cpuctx->task_ctx != ctx) {
 946                 if (cpuctx->task_ctx || ctx->task != current)
 947                         return;
 948                 cpuctx->task_ctx = ctx;
 949         }
 950
 951         raw_spin_lock(&ctx->lock);
 952         ctx->is_active = 1;
 953         update_context_time(ctx);
 954
 955         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 956                 goto unlock;
 957         __perf_event_mark_enabled(event, ctx);
 958
 959         if (event->cpu != -1 && event->cpu != smp_processor_id())
 960                 goto unlock;
 961
 962         /*
 963          * If the event is in a group and isn't the group leader,
 964          * then don't put it on unless the group is on.
 965          */
 966         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 967                 goto unlock;
 968
 969         if (!group_can_go_on(event, cpuctx, 1)) {
 970                 err = -EEXIST;
 971         } else {
 972                 perf_disable();
 973                 if (event == leader)
 974                         err = group_sched_in(event, cpuctx, ctx);
 975                 else
 976                         err = event_sched_in(event, cpuctx, ctx);
 977                 perf_enable();
 978         }
 979
 980         if (err) {
 981                 /*
 982                  * If this event can't go on and it's part of a
 983                  * group, then the whole group has to come off.
 984                  */
 985                 if (leader != event)
 986                         group_sched_out(leader, cpuctx, ctx);
 987                 if (leader->attr.pinned) {
 988                         update_group_times(leader);
 989                         leader->state = PERF_EVENT_STATE_ERROR;
 990                 }
 991         }
 992
 993  unlock:
 994         raw_spin_unlock(&ctx->lock);
 995 }
 996
 997 /*
 998  * Enable an event.
 999  *
1000  * If event->ctx is a cloned context, callers must make sure that
1001  * every task struct that event->ctx->task could possibly point to
1002  * remains valid.  This condition is satisfied when called through
1003  * perf_event_for_each_child or perf_event_for_each as described
1004  * for perf_event_disable.
1005  */
1006 void perf_event_enable(struct perf_event *event)
1007 {
1008         struct perf_event_context *ctx = event->ctx;
1009         struct task_struct *task = ctx->task;
1010
1011         if (!task) {
1012                 /*
1013                  * Enable the event on the cpu that it's on
1014                  */
1015                 smp_call_function_single(event->cpu, __perf_event_enable,
1016                                          event, 1);
1017                 return;
1018         }
1019
1020         raw_spin_lock_irq(&ctx->lock);
1021         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1022                 goto out;
1023
1024         /*
1025          * If the event is in error state, clear that first.
1026          * That way, if we see the event in error state below, we
1027          * know that it has gone back into error state, as distinct
1028          * from the task having been scheduled away before the
1029          * cross-call arrived.
1030          */
1031         if (event->state == PERF_EVENT_STATE_ERROR)
1032                 event->state = PERF_EVENT_STATE_OFF;
1033
1034  retry:
1035         raw_spin_unlock_irq(&ctx->lock);
1036         task_oncpu_function_call(task, __perf_event_enable, event);
1037
1038         raw_spin_lock_irq(&ctx->lock);
1039
1040         /*
1041          * If the context is active and the event is still off,
1042          * we need to retry the cross-call.
1043          */
1044         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
1045                 goto retry;
1046
1047         /*
1048          * Since we have the lock this context can't be scheduled
1049          * in, so we can change the state safely.
1050          */
1051         if (event->state == PERF_EVENT_STATE_OFF)
1052                 __perf_event_mark_enabled(event, ctx);
1053
1054  out:
1055         raw_spin_unlock_irq(&ctx->lock);
1056 }
1057
1058 static int perf_event_refresh(struct perf_event *event, int refresh)
1059 {
1060         /*
1061          * not supported on inherited events
1062          */
1063         if (event->attr.inherit)
1064                 return -EINVAL;
1065
1066         atomic_add(refresh, &event->event_limit);
1067         perf_event_enable(event);
1068
1069         return 0;
1070 }
1071
/*
 * Bit mask selecting which group lists of a context the sched in/out
 * helpers operate on.
 */
enum event_type_t {
	EVENT_FLEXIBLE	= 1 << 0,	/* the flexible_groups list */
	EVENT_PINNED	= 1 << 1,	/* the pinned_groups list */
	EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
};
1077
1078 static void ctx_sched_out(struct perf_event_context *ctx,
1079                           struct perf_cpu_context *cpuctx,
1080                           enum event_type_t event_type)
1081 {
1082         struct perf_event *event;
1083
1084         raw_spin_lock(&ctx->lock);
1085         ctx->is_active = 0;
1086         if (likely(!ctx->nr_events))
1087                 goto out;
1088         update_context_time(ctx);
1089
1090         perf_disable();
1091         if (!ctx->nr_active)
1092                 goto out_enable;
1093
1094         if (event_type & EVENT_PINNED)
1095                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1096                         group_sched_out(event, cpuctx, ctx);
1097
1098         if (event_type & EVENT_FLEXIBLE)
1099                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1100                         group_sched_out(event, cpuctx, ctx);
1101
1102  out_enable:
1103         perf_enable();
1104  out:
1105         raw_spin_unlock(&ctx->lock);
1106 }
1107
1108 /*
1109  * Test whether two contexts are equivalent, i.e. whether they
1110  * have both been cloned from the same version of the same context
1111  * and they both have the same number of enabled events.
1112  * If the number of enabled events is the same, then the set
1113  * of enabled events should be the same, because these are both
1114  * inherited contexts, therefore we can't access individual events
1115  * in them directly with an fd; we can only enable/disable all
1116  * events via prctl, or enable/disable all events in a family
1117  * via ioctl, which will have the same effect on both contexts.
1118  */
1119 static int context_equiv(struct perf_event_context *ctx1,
1120                          struct perf_event_context *ctx2)
1121 {
1122         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1123                 && ctx1->parent_gen == ctx2->parent_gen
1124                 && !ctx1->pin_count && !ctx2->pin_count;
1125 }
1126
1127 static void __perf_event_sync_stat(struct perf_event *event,
1128                                      struct perf_event *next_event)
1129 {
1130         u64 value;
1131
1132         if (!event->attr.inherit_stat)
1133                 return;
1134
1135         /*
1136          * Update the event value, we cannot use perf_event_read()
1137          * because we're in the middle of a context switch and have IRQs
1138          * disabled, which upsets smp_call_function_single(), however
1139          * we know the event must be on the current CPU, therefore we
1140          * don't need to use it.
1141          */
1142         switch (event->state) {
1143         case PERF_EVENT_STATE_ACTIVE:
1144                 event->pmu->read(event);
1145                 /* fall-through */
1146
1147         case PERF_EVENT_STATE_INACTIVE:
1148                 update_event_times(event);
1149                 break;
1150
1151         default:
1152                 break;
1153         }
1154
1155         /*
1156          * In order to keep per-task stats reliable we need to flip the event
1157          * values when we flip the contexts.
1158          */
1159         value = atomic64_read(&next_event->count);
1160         value = atomic64_xchg(&event->count, value);
1161         atomic64_set(&next_event->count, value);
1162
1163         swap(event->total_time_enabled, next_event->total_time_enabled);
1164         swap(event->total_time_running, next_event->total_time_running);
1165
1166         /*
1167          * Since we swizzled the values, update the user visible data too.
1168          */
1169         perf_event_update_userpage(event);
1170         perf_event_update_userpage(next_event);
1171 }
1172
/*
 * list_next_entry - get the element following @pos in its list
 * @pos:	pointer to the current element
 * @member:	name of the list_head member within the element type
 *
 * @pos is parenthesized so that any pointer expression (for example
 * "*cursor" or "&item") expands correctly.  Note that @pos is
 * evaluated more than once.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1175
1176 static void perf_event_sync_stat(struct perf_event_context *ctx,
1177                                    struct perf_event_context *next_ctx)
1178 {
1179         struct perf_event *event, *next_event;
1180
1181         if (!ctx->nr_stat)
1182                 return;
1183
1184         update_context_time(ctx);
1185
1186         event = list_first_entry(&ctx->event_list,
1187                                    struct perf_event, event_entry);
1188
1189         next_event = list_first_entry(&next_ctx->event_list,
1190                                         struct perf_event, event_entry);
1191
1192         while (&event->event_entry != &ctx->event_list &&
1193                &next_event->event_entry != &next_ctx->event_list) {
1194
1195                 __perf_event_sync_stat(event, next_event);
1196
1197                 event = list_next_entry(event, event_entry);
1198                 next_event = list_next_entry(next_event, event_entry);
1199         }
1200 }
1201
1202 /*
1203  * Called from scheduler to remove the events of the current task,
1204  * with interrupts disabled.
1205  *
1206  * We stop each event and update the event value in event->count.
1207  *
1208  * This does not protect us against NMI, but disable()
1209  * sets the disabled bit in the control field of event _before_
1210  * accessing the event control register. If a NMI hits, then it will
1211  * not restart the event.
1212  */
1213 void perf_event_task_sched_out(struct task_struct *task,
1214                                  struct task_struct *next)
1215 {
1216         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1217         struct perf_event_context *ctx = task->perf_event_ctxp;
1218         struct perf_event_context *next_ctx;
1219         struct perf_event_context *parent;
1220         int do_switch = 1;
1221
1222         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1223
1224         if (likely(!ctx || !cpuctx->task_ctx))
1225                 return;
1226
1227         rcu_read_lock();
1228         parent = rcu_dereference(ctx->parent_ctx);
1229         next_ctx = next->perf_event_ctxp;
1230         if (parent && next_ctx &&
1231             rcu_dereference(next_ctx->parent_ctx) == parent) {
1232                 /*
1233                  * Looks like the two contexts are clones, so we might be
1234                  * able to optimize the context switch.  We lock both
1235                  * contexts and check that they are clones under the
1236                  * lock (including re-checking that neither has been
1237                  * uncloned in the meantime).  It doesn't matter which
1238                  * order we take the locks because no other cpu could
1239                  * be trying to lock both of these tasks.
1240                  */
1241                 raw_spin_lock(&ctx->lock);
1242                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1243                 if (context_equiv(ctx, next_ctx)) {
1244                         /*
1245                          * XXX do we need a memory barrier of sorts
1246                          * wrt to rcu_dereference() of perf_event_ctxp
1247                          */
1248                         task->perf_event_ctxp = next_ctx;
1249                         next->perf_event_ctxp = ctx;
1250                         ctx->task = next;
1251                         next_ctx->task = task;
1252                         do_switch = 0;
1253
1254                         perf_event_sync_stat(ctx, next_ctx);
1255                 }
1256                 raw_spin_unlock(&next_ctx->lock);
1257                 raw_spin_unlock(&ctx->lock);
1258         }
1259         rcu_read_unlock();
1260
1261         if (do_switch) {
1262                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1263                 cpuctx->task_ctx = NULL;
1264         }
1265 }
1266
1267 static void task_ctx_sched_out(struct perf_event_context *ctx,
1268                                enum event_type_t event_type)
1269 {
1270         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1271
1272         if (!cpuctx->task_ctx)
1273                 return;
1274
1275         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1276                 return;
1277
1278         ctx_sched_out(ctx, cpuctx, event_type);
1279         cpuctx->task_ctx = NULL;
1280 }
1281
1282 /*
1283  * Called with IRQs disabled
1284  */
1285 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1286 {
1287         task_ctx_sched_out(ctx, EVENT_ALL);
1288 }
1289
1290 /*
1291  * Called with IRQs disabled
1292  */
1293 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1294                               enum event_type_t event_type)
1295 {
1296         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1297 }
1298
1299 static void
1300 ctx_pinned_sched_in(struct perf_event_context *ctx,
1301                     struct perf_cpu_context *cpuctx)
1302 {
1303         struct perf_event *event;
1304
1305         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1306                 if (event->state <= PERF_EVENT_STATE_OFF)
1307                         continue;
1308                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1309                         continue;
1310
1311                 if (group_can_go_on(event, cpuctx, 1))
1312                         group_sched_in(event, cpuctx, ctx);
1313
1314                 /*
1315                  * If this pinned group hasn't been scheduled,
1316                  * put it in error state.
1317                  */
1318                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1319                         update_group_times(event);
1320                         event->state = PERF_EVENT_STATE_ERROR;
1321                 }
1322         }
1323 }
1324
1325 static void
1326 ctx_flexible_sched_in(struct perf_event_context *ctx,
1327                       struct perf_cpu_context *cpuctx)
1328 {
1329         struct perf_event *event;
1330         int can_add_hw = 1;
1331
1332         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1333                 /* Ignore events in OFF or ERROR state */
1334                 if (event->state <= PERF_EVENT_STATE_OFF)
1335                         continue;
1336                 /*
1337                  * Listen to the 'cpu' scheduling filter constraint
1338                  * of events:
1339                  */
1340                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1341                         continue;
1342
1343                 if (group_can_go_on(event, cpuctx, can_add_hw))
1344                         if (group_sched_in(event, cpuctx, ctx))
1345                                 can_add_hw = 0;
1346         }
1347 }
1348
1349 static void
1350 ctx_sched_in(struct perf_event_context *ctx,
1351              struct perf_cpu_context *cpuctx,
1352              enum event_type_t event_type)
1353 {
1354         raw_spin_lock(&ctx->lock);
1355         ctx->is_active = 1;
1356         if (likely(!ctx->nr_events))
1357                 goto out;
1358
1359         ctx->timestamp = perf_clock();
1360
1361         perf_disable();
1362
1363         /*
1364          * First go through the list and put on any pinned groups
1365          * in order to give them the best chance of going on.
1366          */
1367         if (event_type & EVENT_PINNED)
1368                 ctx_pinned_sched_in(ctx, cpuctx);
1369
1370         /* Then walk through the lower prio flexible groups */
1371         if (event_type & EVENT_FLEXIBLE)
1372                 ctx_flexible_sched_in(ctx, cpuctx);
1373
1374         perf_enable();
1375  out:
1376         raw_spin_unlock(&ctx->lock);
1377 }
1378
1379 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1380                              enum event_type_t event_type)
1381 {
1382         struct perf_event_context *ctx = &cpuctx->ctx;
1383
1384         ctx_sched_in(ctx, cpuctx, event_type);
1385 }
1386
1387 static void task_ctx_sched_in(struct task_struct *task,
1388                               enum event_type_t event_type)
1389 {
1390         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1391         struct perf_event_context *ctx = task->perf_event_ctxp;
1392
1393         if (likely(!ctx))
1394                 return;
1395         if (cpuctx->task_ctx == ctx)
1396                 return;
1397         ctx_sched_in(ctx, cpuctx, event_type);
1398         cpuctx->task_ctx = ctx;
1399 }
1400 /*
1401  * Called from scheduler to add the events of the current task
1402  * with interrupts disabled.
1403  *
1404  * We restore the event value and then enable it.
1405  *
1406  * This does not protect us against NMI, but enable()
1407  * sets the enabled bit in the control field of event _before_
1408  * accessing the event control register. If a NMI hits, then it will
1409  * keep the event running.
1410  */
1411 void perf_event_task_sched_in(struct task_struct *task)
1412 {
1413         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1414         struct perf_event_context *ctx = task->perf_event_ctxp;
1415
1416         if (likely(!ctx))
1417                 return;
1418
1419         if (cpuctx->task_ctx == ctx)
1420                 return;
1421
1422         perf_disable();
1423
1424         /*
1425          * We want to keep the following priority order:
1426          * cpu pinned (that don't need to move), task pinned,
1427          * cpu flexible, task flexible.
1428          */
1429         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1430
1431         ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1432         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1433         ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1434
1435         cpuctx->task_ctx = ctx;
1436
1437         perf_enable();
1438 }
1439
/*
 * All-ones marker value.  perf_ctx_adjust_freq() unthrottles an event
 * whose hw.interrupts equals this value.
 */
#define MAX_INTERRUPTS (~0ULL)

/* Forward declaration; called from perf_ctx_adjust_freq(). */
static void perf_log_throttle(struct perf_event *event, int enable);
1443
1444 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1445 {
1446         u64 frequency = event->attr.sample_freq;
1447         u64 sec = NSEC_PER_SEC;
1448         u64 divisor, dividend;
1449
1450         int count_fls, nsec_fls, frequency_fls, sec_fls;
1451
1452         count_fls = fls64(count);
1453         nsec_fls = fls64(nsec);
1454         frequency_fls = fls64(frequency);
1455         sec_fls = 30;
1456
1457         /*
1458          * We got @count in @nsec, with a target of sample_freq HZ
1459          * the target period becomes:
1460          *
1461          *             @count * 10^9
1462          * period = -------------------
1463          *          @nsec * sample_freq
1464          *
1465          */
1466
1467         /*
1468          * Reduce accuracy by one bit such that @a and @b converge
1469          * to a similar magnitude.
1470          */
1471 #define REDUCE_FLS(a, b)                \
1472 do {                                    \
1473         if (a##_fls > b##_fls) {        \
1474                 a >>= 1;                \
1475                 a##_fls--;              \
1476         } else {                        \
1477                 b >>= 1;                \
1478                 b##_fls--;              \
1479         }                               \
1480 } while (0)
1481
1482         /*
1483          * Reduce accuracy until either term fits in a u64, then proceed with
1484          * the other, so that finally we can do a u64/u64 division.
1485          */
1486         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1487                 REDUCE_FLS(nsec, frequency);
1488                 REDUCE_FLS(sec, count);
1489         }
1490
1491         if (count_fls + sec_fls > 64) {
1492                 divisor = nsec * frequency;
1493
1494                 while (count_fls + sec_fls > 64) {
1495                         REDUCE_FLS(count, sec);
1496                         divisor >>= 1;
1497                 }
1498
1499                 dividend = count * sec;
1500         } else {
1501                 dividend = count * sec;
1502
1503                 while (nsec_fls + frequency_fls > 64) {
1504                         REDUCE_FLS(nsec, frequency);
1505                         dividend >>= 1;
1506                 }
1507
1508                 divisor = nsec * frequency;
1509         }
1510
1511         if (!divisor)
1512                 return dividend;
1513
1514         return div64_u64(dividend, divisor);
1515 }
1516
1517 static void perf_event_stop(struct perf_event *event)
1518 {
1519         if (!event->pmu->stop)
1520                 return event->pmu->disable(event);
1521
1522         return event->pmu->stop(event);
1523 }
1524
1525 static int perf_event_start(struct perf_event *event)
1526 {
1527         if (!event->pmu->start)
1528                 return event->pmu->enable(event);
1529
1530         return event->pmu->start(event);
1531 }
1532
1533 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1534 {
1535         struct hw_perf_event *hwc = &event->hw;
1536         s64 period, sample_period;
1537         s64 delta;
1538
1539         period = perf_calculate_period(event, nsec, count);
1540
1541         delta = (s64)(period - hwc->sample_period);
1542         delta = (delta + 7) / 8; /* low pass filter */
1543
1544         sample_period = hwc->sample_period + delta;
1545
1546         if (!sample_period)
1547                 sample_period = 1;
1548
1549         hwc->sample_period = sample_period;
1550
1551         if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1552                 perf_disable();
1553                 perf_event_stop(event);
1554                 atomic64_set(&hwc->period_left, 0);
1555                 perf_event_start(event);
1556                 perf_enable();
1557         }
1558 }
1559
1560 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1561 {
1562         struct perf_event *event;
1563         struct hw_perf_event *hwc;
1564         u64 interrupts, now;
1565         s64 delta;
1566
1567         raw_spin_lock(&ctx->lock);
1568         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1569                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1570                         continue;
1571
1572                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1573                         continue;
1574
1575                 hwc = &event->hw;
1576
1577                 interrupts = hwc->interrupts;
1578                 hwc->interrupts = 0;
1579
1580                 /*
1581                  * unthrottle events on the tick
1582                  */
1583                 if (interrupts == MAX_INTERRUPTS) {
1584                         perf_log_throttle(event, 1);
1585                         perf_disable();
1586                         event->pmu->unthrottle(event);
1587                         perf_enable();
1588                 }
1589
1590                 if (!event->attr.freq || !event->attr.sample_freq)
1591                         continue;
1592
1593                 perf_disable();
1594                 event->pmu->read(event);
1595                 now = atomic64_read(&event->count);
1596                 delta = now - hwc->freq_count_stamp;
1597                 hwc->freq_count_stamp = now;
1598
1599                 if (delta > 0)
1600                         perf_adjust_period(event, TICK_NSEC, delta);
1601                 perf_enable();
1602         }
1603         raw_spin_unlock(&ctx->lock);
1604 }
1605
1606 /*
1607  * Round-robin a context's events:
1608  */
1609 static void rotate_ctx(struct perf_event_context *ctx)
1610 {
1611         raw_spin_lock(&ctx->lock);
1612
1613         /*
1614          * Rotate the first entry last of non-pinned groups. Rotation might be
1615          * disabled by the inheritance code.
1616          */
1617         if (!ctx->rotate_disable)
1618                 list_rotate_left(&ctx->flexible_groups);
1619
1620         raw_spin_unlock(&ctx->lock);
1621 }
1622
1623 void perf_event_task_tick(struct task_struct *curr)
1624 {
1625         struct perf_cpu_context *cpuctx;
1626         struct perf_event_context *ctx;
1627         int rotate = 0;
1628
1629         if (!atomic_read(&nr_events))
1630                 return;
1631
1632         cpuctx = &__get_cpu_var(perf_cpu_context);
1633         if (cpuctx->ctx.nr_events &&
1634             cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1635                 rotate = 1;
1636
1637         ctx = curr->perf_event_ctxp;
1638         if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1639                 rotate = 1;
1640
1641         perf_ctx_adjust_freq(&cpuctx->ctx);
1642         if (ctx)
1643                 perf_ctx_adjust_freq(ctx);
1644
1645         if (!rotate)
1646                 return;
1647
1648         perf_disable();
1649         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1650         if (ctx)
1651                 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1652
1653         rotate_ctx(&cpuctx->ctx);
1654         if (ctx)
1655                 rotate_ctx(ctx);
1656
1657         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1658         if (ctx)
1659                 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1660         perf_enable();
1661 }
1662
1663 static int event_enable_on_exec(struct perf_event *event,
1664                                 struct perf_event_context *ctx)
1665 {
1666         if (!event->attr.enable_on_exec)
1667                 return 0;
1668
1669         event->attr.enable_on_exec = 0;
1670         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1671                 return 0;
1672
1673         __perf_event_mark_enabled(event, ctx);
1674
1675         return 1;
1676 }
1677
1678 /*
1679  * Enable all of a task's events that have been marked enable-on-exec.
1680  * This expects task == current.
1681  */
1682 static void perf_event_enable_on_exec(struct task_struct *task)
1683 {
1684         struct perf_event_context *ctx;
1685         struct perf_event *event;
1686         unsigned long flags;
1687         int enabled = 0;
1688         int ret;
1689
1690         local_irq_save(flags);
1691         ctx = task->perf_event_ctxp;
1692         if (!ctx || !ctx->nr_events)
1693                 goto out;
1694
1695         __perf_event_task_sched_out(ctx);
1696
1697         raw_spin_lock(&ctx->lock);
1698
1699         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1700                 ret = event_enable_on_exec(event, ctx);
1701                 if (ret)
1702                         enabled = 1;
1703         }
1704
1705         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1706                 ret = event_enable_on_exec(event, ctx);
1707                 if (ret)
1708                         enabled = 1;
1709         }
1710
1711         /*
1712          * Unclone this context if we enabled any event.
1713          */
1714         if (enabled)
1715                 unclone_ctx(ctx);
1716
1717         raw_spin_unlock(&ctx->lock);
1718
1719         perf_event_task_sched_in(task);
1720  out:
1721         local_irq_restore(flags);
1722 }
1723
1724 /*
1725  * Cross CPU call to read the hardware event
1726  */
1727 static void __perf_event_read(void *info)
1728 {
1729         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1730         struct perf_event *event = info;
1731         struct perf_event_context *ctx = event->ctx;
1732
1733         /*
1734          * If this is a task context, we need to check whether it is
1735          * the current task context of this cpu.  If not it has been
1736          * scheduled out before the smp call arrived.  In that case
1737          * event->count would have been updated to a recent sample
1738          * when the event was scheduled out.
1739          */
1740         if (ctx->task && cpuctx->task_ctx != ctx)
1741                 return;
1742
1743         raw_spin_lock(&ctx->lock);
1744         update_context_time(ctx);
1745         update_event_times(event);
1746         raw_spin_unlock(&ctx->lock);
1747
1748         event->pmu->read(event);
1749 }
1750
1751 static u64 perf_event_read(struct perf_event *event)
1752 {
1753         /*
1754          * If event is enabled and currently active on a CPU, update the
1755          * value in the event structure:
1756          */
1757         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1758                 smp_call_function_single(event->oncpu,
1759                                          __perf_event_read, event, 1);
1760         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1761                 struct perf_event_context *ctx = event->ctx;
1762                 unsigned long flags;
1763
1764                 raw_spin_lock_irqsave(&ctx->lock, flags);
1765                 /*
1766                  * may read while context is not active
1767                  * (e.g., thread is blocked), in that case
1768                  * we cannot update context time
1769                  */
1770                 if (ctx->is_active)
1771                         update_context_time(ctx);
1772                 update_event_times(event);
1773                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1774         }
1775
1776         return atomic64_read(&event->count);
1777 }
1778
1779 /*
1780  * Initialize the perf_event context in a task_struct:
1781  */
1782 static void
1783 __perf_event_init_context(struct perf_event_context *ctx,
1784                             struct task_struct *task)
1785 {
1786         raw_spin_lock_init(&ctx->lock);
1787         mutex_init(&ctx->mutex);
1788         INIT_LIST_HEAD(&ctx->pinned_groups);
1789         INIT_LIST_HEAD(&ctx->flexible_groups);
1790         INIT_LIST_HEAD(&ctx->event_list);
1791         atomic_set(&ctx->refcount, 1);
1792         ctx->task = task;
1793 }
1794
1795 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1796 {
1797         struct perf_event_context *ctx;
1798         struct perf_cpu_context *cpuctx;
1799         struct task_struct *task;
1800         unsigned long flags;
1801         int err;
1802
1803         if (pid == -1 && cpu != -1) {
1804                 /* Must be root to operate on a CPU event: */
1805                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1806                         return ERR_PTR(-EACCES);
1807
1808                 if (cpu < 0 || cpu >= nr_cpumask_bits)
1809                         return ERR_PTR(-EINVAL);
1810
1811                 /*
1812                  * We could be clever and allow to attach a event to an
1813                  * offline CPU and activate it when the CPU comes up, but
1814                  * that's for later.
1815                  */
1816                 if (!cpu_online(cpu))
1817                         return ERR_PTR(-ENODEV);
1818
1819                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1820                 ctx = &cpuctx->ctx;
1821                 get_ctx(ctx);
1822
1823                 return ctx;
1824         }
1825
1826         rcu_read_lock();
1827         if (!pid)
1828                 task = current;
1829         else
1830                 task = find_task_by_vpid(pid);
1831         if (task)
1832                 get_task_struct(task);
1833         rcu_read_unlock();
1834
1835         if (!task)
1836                 return ERR_PTR(-ESRCH);
1837
1838         /*
1839          * Can't attach events to a dying task.
1840          */
1841         err = -ESRCH;
1842         if (task->flags & PF_EXITING)
1843                 goto errout;
1844
1845         /* Reuse ptrace permission checks for now. */
1846         err = -EACCES;
1847         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1848                 goto errout;
1849
1850  retry:
1851         ctx = perf_lock_task_context(task, &flags);
1852         if (ctx) {
1853                 unclone_ctx(ctx);
1854                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1855         }
1856
1857         if (!ctx) {
1858                 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1859                 err = -ENOMEM;
1860                 if (!ctx)
1861                         goto errout;
1862                 __perf_event_init_context(ctx, task);
1863                 get_ctx(ctx);
1864                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1865                         /*
1866                          * We raced with some other task; use
1867                          * the context they set.
1868                          */
1869                         kfree(ctx);
1870                         goto retry;
1871                 }
1872                 get_task_struct(task);
1873         }
1874
1875         put_task_struct(task);
1876         return ctx;
1877
1878  errout:
1879         put_task_struct(task);
1880         return ERR_PTR(err);
1881 }
1882
1883 static void perf_event_free_filter(struct perf_event *event);
1884
1885 static void free_event_rcu(struct rcu_head *head)
1886 {
1887         struct perf_event *event;
1888
1889         event = container_of(head, struct perf_event, rcu_head);
1890         if (event->ns)
1891                 put_pid_ns(event->ns);
1892         perf_event_free_filter(event);
1893         kfree(event);
1894 }
1895
1896 static void perf_pending_sync(struct perf_event *event);
1897 static void perf_mmap_data_put(struct perf_mmap_data *data);
1898
1899 static void free_event(struct perf_event *event)
1900 {
1901         perf_pending_sync(event);
1902
1903         if (!event->parent) {
1904                 atomic_dec(&nr_events);
1905                 if (event->attr.mmap)
1906                         atomic_dec(&nr_mmap_events);
1907                 if (event->attr.comm)
1908                         atomic_dec(&nr_comm_events);
1909                 if (event->attr.task)
1910                         atomic_dec(&nr_task_events);
1911         }
1912
1913         if (event->data) {
1914                 perf_mmap_data_put(event->data);
1915                 event->data = NULL;
1916         }
1917
1918         if (event->destroy)
1919                 event->destroy(event);
1920
1921         put_ctx(event->ctx);
1922         call_rcu(&event->rcu_head, free_event_rcu);
1923 }
1924
1925 int perf_event_release_kernel(struct perf_event *event)
1926 {
1927         struct perf_event_context *ctx = event->ctx;
1928
1929         /*
1930          * Remove from the PMU, can't get re-enabled since we got
1931          * here because the last ref went.
1932          */
1933         perf_event_disable(event);
1934
1935         WARN_ON_ONCE(ctx->parent_ctx);
1936         /*
1937          * There are two ways this annotation is useful:
1938          *
1939          *  1) there is a lock recursion from perf_event_exit_task
1940          *     see the comment there.
1941          *
1942          *  2) there is a lock-inversion with mmap_sem through
1943          *     perf_event_read_group(), which takes faults while
1944          *     holding ctx->mutex, however this is called after
1945          *     the last filedesc died, so there is no possibility
1946          *     to trigger the AB-BA case.
1947          */
1948         mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1949         raw_spin_lock_irq(&ctx->lock);
1950         perf_group_detach(event);
1951         list_del_event(event, ctx);
1952         raw_spin_unlock_irq(&ctx->lock);
1953         mutex_unlock(&ctx->mutex);
1954
1955         mutex_lock(&event->owner->perf_event_mutex);
1956         list_del_init(&event->owner_entry);
1957         mutex_unlock(&event->owner->perf_event_mutex);
1958         put_task_struct(event->owner);
1959
1960         free_event(event);
1961
1962         return 0;
1963 }
1964 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1965
1966 /*
1967  * Called when the last reference to the file is gone.
1968  */
1969 static int perf_release(struct inode *inode, struct file *file)
1970 {
1971         struct perf_event *event = file->private_data;
1972
1973         file->private_data = NULL;
1974
1975         return perf_event_release_kernel(event);
1976 }
1977
1978 static int perf_event_read_size(struct perf_event *event)
1979 {
1980         int entry = sizeof(u64); /* value */
1981         int size = 0;
1982         int nr = 1;
1983
1984         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1985                 size += sizeof(u64);
1986
1987         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1988                 size += sizeof(u64);
1989
1990         if (event->attr.read_format & PERF_FORMAT_ID)
1991                 entry += sizeof(u64);
1992
1993         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1994                 nr += event->group_leader->nr_siblings;
1995                 size += sizeof(u64);
1996         }
1997
1998         size += entry * nr;
1999
2000         return size;
2001 }
2002
2003 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2004 {
2005         struct perf_event *child;
2006         u64 total = 0;
2007
2008         *enabled = 0;
2009         *running = 0;
2010
2011         mutex_lock(&event->child_mutex);
2012         total += perf_event_read(event);
2013         *enabled += event->total_time_enabled +
2014                         atomic64_read(&event->child_total_time_enabled);
2015         *running += event->total_time_running +
2016                         atomic64_read(&event->child_total_time_running);
2017
2018         list_for_each_entry(child, &event->child_list, child_list) {
2019                 total += perf_event_read(child);
2020                 *enabled += child->total_time_enabled;
2021                 *running += child->total_time_running;
2022         }
2023         mutex_unlock(&event->child_mutex);
2024
2025         return total;
2026 }
2027 EXPORT_SYMBOL_GPL(perf_event_read_value);
2028
2029 static int perf_event_read_group(struct perf_event *event,
2030                                    u64 read_format, char __user *buf)
2031 {
2032         struct perf_event *leader = event->group_leader, *sub;
2033         int n = 0, size = 0, ret = -EFAULT;
2034         struct perf_event_context *ctx = leader->ctx;
2035         u64 values[5];
2036         u64 count, enabled, running;
2037
2038         mutex_lock(&ctx->mutex);
2039         count = perf_event_read_value(leader, &enabled, &running);
2040
2041         values[n++] = 1 + leader->nr_siblings;
2042         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2043                 values[n++] = enabled;
2044         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2045                 values[n++] = running;
2046         values[n++] = count;
2047         if (read_format & PERF_FORMAT_ID)
2048                 values[n++] = primary_event_id(leader);
2049
2050         size = n * sizeof(u64);
2051
2052         if (copy_to_user(buf, values, size))
2053                 goto unlock;
2054
2055         ret = size;
2056
2057         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2058                 n = 0;
2059
2060                 values[n++] = perf_event_read_value(sub, &enabled, &running);
2061                 if (read_format & PERF_FORMAT_ID)
2062                         values[n++] = primary_event_id(sub);
2063
2064                 size = n * sizeof(u64);
2065
2066                 if (copy_to_user(buf + ret, values, size)) {
2067                         ret = -EFAULT;
2068                         goto unlock;
2069                 }
2070
2071                 ret += size;
2072         }
2073 unlock:
2074         mutex_unlock(&ctx->mutex);
2075
2076         return ret;
2077 }
2078
2079 static int perf_event_read_one(struct perf_event *event,
2080                                  u64 read_format, char __user *buf)
2081 {
2082         u64 enabled, running;
2083         u64 values[4];
2084         int n = 0;
2085
2086         values[n++] = perf_event_read_value(event, &enabled, &running);
2087         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2088                 values[n++] = enabled;
2089         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2090                 values[n++] = running;
2091         if (read_format & PERF_FORMAT_ID)
2092                 values[n++] = primary_event_id(event);
2093
2094         if (copy_to_user(buf, values, n * sizeof(u64)))
2095                 return -EFAULT;
2096
2097         return n * sizeof(u64);
2098 }
2099
2100 /*
2101  * Read the performance event - simple non blocking version for now
2102  */
2103 static ssize_t
2104 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2105 {
2106         u64 read_format = event->attr.read_format;
2107         int ret;
2108
2109         /*
2110          * Return end-of-file for a read on a event that is in
2111          * error state (i.e. because it was pinned but it couldn't be
2112          * scheduled on to the CPU at some point).
2113          */
2114         if (event->state == PERF_EVENT_STATE_ERROR)
2115                 return 0;
2116
2117         if (count < perf_event_read_size(event))
2118                 return -ENOSPC;
2119
2120         WARN_ON_ONCE(event->ctx->parent_ctx);
2121         if (read_format & PERF_FORMAT_GROUP)
2122                 ret = perf_event_read_group(event, read_format, buf);
2123         else
2124                 ret = perf_event_read_one(event, read_format, buf);
2125
2126         return ret;
2127 }
2128
2129 static ssize_t
2130 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2131 {
2132         struct perf_event *event = file->private_data;
2133
2134         return perf_read_hw(event, buf, count);
2135 }
2136
2137 static unsigned int perf_poll(struct file *file, poll_table *wait)
2138 {
2139         struct perf_event *event = file->private_data;
2140         struct perf_mmap_data *data;
2141         unsigned int events = POLL_HUP;
2142
2143         rcu_read_lock();
2144         data = rcu_dereference(event->data);
2145         if (data)
2146                 events = atomic_xchg(&data->poll, 0);
2147         rcu_read_unlock();
2148
2149         poll_wait(file, &event->waitq, wait);
2150
2151         return events;
2152 }
2153
2154 static void perf_event_reset(struct perf_event *event)
2155 {
2156         (void)perf_event_read(event);
2157         atomic64_set(&event->count, 0);
2158         perf_event_update_userpage(event);
2159 }
2160
2161 /*
2162  * Holding the top-level event's child_mutex means that any
2163  * descendant process that has inherited this event will block
2164  * in sync_child_event if it goes to exit, thus satisfying the
2165  * task existence requirements of perf_event_enable/disable.
2166  */
2167 static void perf_event_for_each_child(struct perf_event *event,
2168                                         void (*func)(struct perf_event *))
2169 {
2170         struct perf_event *child;
2171
2172         WARN_ON_ONCE(event->ctx->parent_ctx);
2173         mutex_lock(&event->child_mutex);
2174         func(event);
2175         list_for_each_entry(child, &event->child_list, child_list)
2176                 func(child);
2177         mutex_unlock(&event->child_mutex);
2178 }
2179
2180 static void perf_event_for_each(struct perf_event *event,
2181                                   void (*func)(struct perf_event *))
2182 {
2183         struct perf_event_context *ctx = event->ctx;
2184         struct perf_event *sibling;
2185
2186         WARN_ON_ONCE(ctx->parent_ctx);
2187         mutex_lock(&ctx->mutex);
2188         event = event->group_leader;
2189
2190         perf_event_for_each_child(event, func);
2191         func(event);
2192         list_for_each_entry(sibling, &event->sibling_list, group_entry)
2193                 perf_event_for_each_child(event, func);
2194         mutex_unlock(&ctx->mutex);
2195 }
2196
2197 static int perf_event_period(struct perf_event *event, u64 __user *arg)
2198 {
2199         struct perf_event_context *ctx = event->ctx;
2200         unsigned long size;
2201         int ret = 0;
2202         u64 value;
2203
2204         if (!event->attr.sample_period)
2205                 return -EINVAL;
2206
2207         size = copy_from_user(&value, arg, sizeof(value));
2208         if (size != sizeof(value))
2209                 return -EFAULT;
2210
2211         if (!value)
2212                 return -EINVAL;
2213
2214         raw_spin_lock_irq(&ctx->lock);
2215         if (event->attr.freq) {
2216                 if (value > sysctl_perf_event_sample_rate) {
2217                         ret = -EINVAL;
2218                         goto unlock;
2219                 }
2220
2221                 event->attr.sample_freq = value;
2222         } else {
2223                 event->attr.sample_period = value;
2224                 event->hw.sample_period = value;
2225         }
2226 unlock:
2227         raw_spin_unlock_irq(&ctx->lock);
2228
2229         return ret;
2230 }
2231
2232 static const struct file_operations perf_fops;
2233
2234 static struct perf_event *perf_fget_light(int fd, int *fput_needed)
2235 {
2236         struct file *file;
2237
2238         file = fget_light(fd, fput_needed);
2239         if (!file)
2240                 return ERR_PTR(-EBADF);
2241
2242         if (file->f_op != &perf_fops) {
2243                 fput_light(file, *fput_needed);
2244                 *fput_needed = 0;
2245                 return ERR_PTR(-EBADF);
2246         }
2247
2248         return file->private_data;
2249 }
2250
2251 static int perf_event_set_output(struct perf_event *event,
2252                                  struct perf_event *output_event);
2253 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2254
2255 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2256 {
2257         struct perf_event *event = file->private_data;
2258         void (*func)(struct perf_event *);
2259         u32 flags = arg;
2260
2261         switch (cmd) {
2262         case PERF_EVENT_IOC_ENABLE:
2263                 func = perf_event_enable;
2264                 break;
2265         case PERF_EVENT_IOC_DISABLE:
2266                 func = perf_event_disable;
2267                 break;
2268         case PERF_EVENT_IOC_RESET:
2269                 func = perf_event_reset;
2270                 break;
2271
2272         case PERF_EVENT_IOC_REFRESH:
2273                 return perf_event_refresh(event, arg);
2274
2275         case PERF_EVENT_IOC_PERIOD:
2276                 return perf_event_period(event, (u64 __user *)arg);
2277
2278         case PERF_EVENT_IOC_SET_OUTPUT:
2279         {
2280                 struct perf_event *output_event = NULL;
2281                 int fput_needed = 0;
2282                 int ret;
2283
2284                 if (arg != -1) {
2285                         output_event = perf_fget_light(arg, &fput_needed);
2286                         if (IS_ERR(output_event))
2287                                 return PTR_ERR(output_event);
2288                 }
2289
2290                 ret = perf_event_set_output(event, output_event);
2291                 if (output_event)
2292                         fput_light(output_event->filp, fput_needed);
2293
2294                 return ret;
2295         }
2296
2297         case PERF_EVENT_IOC_SET_FILTER:
2298                 return perf_event_set_filter(event, (void __user *)arg);
2299
2300         default:
2301                 return -ENOTTY;
2302         }
2303
2304         if (flags & PERF_IOC_FLAG_GROUP)
2305                 perf_event_for_each(event, func);
2306         else
2307                 perf_event_for_each_child(event, func);
2308
2309         return 0;
2310 }
2311
2312 int perf_event_task_enable(void)
2313 {
2314         struct perf_event *event;
2315
2316         mutex_lock(&current->perf_event_mutex);
2317         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2318                 perf_event_for_each_child(event, perf_event_enable);
2319         mutex_unlock(&current->perf_event_mutex);
2320
2321         return 0;
2322 }
2323
2324 int perf_event_task_disable(void)
2325 {
2326         struct perf_event *event;
2327
2328         mutex_lock(&current->perf_event_mutex);
2329         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2330                 perf_event_for_each_child(event, perf_event_disable);
2331         mutex_unlock(&current->perf_event_mutex);
2332
2333         return 0;
2334 }
2335
2336 #ifndef PERF_EVENT_INDEX_OFFSET
2337 # define PERF_EVENT_INDEX_OFFSET 0
2338 #endif
2339
2340 static int perf_event_index(struct perf_event *event)
2341 {
2342         if (event->state != PERF_EVENT_STATE_ACTIVE)
2343                 return 0;
2344
2345         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2346 }
2347
2348 /*
2349  * Callers need to ensure there can be no nesting of this function, otherwise
2350  * the seqlock logic goes bad. We can not serialize this because the arch
2351  * code calls this from NMI context.
2352  */
2353 void perf_event_update_userpage(struct perf_event *event)
2354 {
2355         struct perf_event_mmap_page *userpg;
2356         struct perf_mmap_data *data;
2357
2358         rcu_read_lock();
2359         data = rcu_dereference(event->data);
2360         if (!data)
2361                 goto unlock;
2362
2363         userpg = data->user_page;
2364
2365         /*
2366          * Disable preemption so as to not let the corresponding user-space
2367          * spin too long if we get preempted.
2368          */
2369         preempt_disable();
2370         ++userpg->lock;
2371         barrier();
2372         userpg->index = perf_event_index(event);
2373         userpg->offset = atomic64_read(&event->count);
2374         if (event->state == PERF_EVENT_STATE_ACTIVE)
2375                 userpg->offset -= atomic64_read(&event->hw.prev_count);
2376
2377         userpg->time_enabled = event->total_time_enabled +
2378                         atomic64_read(&event->child_total_time_enabled);
2379
2380         userpg->time_running = event->total_time_running +
2381                         atomic64_read(&event->child_total_time_running);
2382
2383         barrier();
2384         ++userpg->lock;
2385         preempt_enable();
2386 unlock:
2387         rcu_read_unlock();
2388 }
2389
2390 #ifndef CONFIG_PERF_USE_VMALLOC
2391
2392 /*
2393  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2394  */
2395
2396 static struct page *
2397 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2398 {
2399         if (pgoff > data->nr_pages)
2400                 return NULL;
2401
2402         if (pgoff == 0)
2403                 return virt_to_page(data->user_page);
2404
2405         return virt_to_page(data->data_pages[pgoff - 1]);
2406 }
2407
2408 static void *perf_mmap_alloc_page(int cpu)
2409 {
2410         struct page *page;
2411         int node;
2412
2413         node = (cpu == -1) ? cpu : cpu_to_node(cpu);
2414         page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2415         if (!page)
2416                 return NULL;
2417
2418         return page_address(page);
2419 }
2420
2421 static struct perf_mmap_data *
2422 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2423 {
2424         struct perf_mmap_data *data;
2425         unsigned long size;
2426         int i;
2427
2428         size = sizeof(struct perf_mmap_data);
2429         size += nr_pages * sizeof(void *);
2430
2431         data = kzalloc(size, GFP_KERNEL);
2432         if (!data)
2433                 goto fail;
2434
2435         data->user_page = perf_mmap_alloc_page(event->cpu);
2436         if (!data->user_page)
2437                 goto fail_user_page;
2438
2439         for (i = 0; i < nr_pages; i++) {
2440                 data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
2441                 if (!data->data_pages[i])
2442                         goto fail_data_pages;
2443         }
2444
2445         data->nr_pages = nr_pages;
2446
2447         return data;
2448
2449 fail_data_pages:
2450         for (i--; i >= 0; i--)
2451                 free_page((unsigned long)data->data_pages[i]);
2452
2453         free_page((unsigned long)data->user_page);
2454
2455 fail_user_page:
2456         kfree(data);
2457
2458 fail:
2459         return NULL;
2460 }
2461
2462 static void perf_mmap_free_page(unsigned long addr)
2463 {
2464         struct page *page = virt_to_page((void *)addr);
2465
2466         page->mapping = NULL;
2467         __free_page(page);
2468 }
2469
2470 static void perf_mmap_data_free(struct perf_mmap_data *data)
2471 {
2472         int i;
2473
2474         perf_mmap_free_page((unsigned long)data->user_page);
2475         for (i = 0; i < data->nr_pages; i++)
2476                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2477         kfree(data);
2478 }
2479
/* Page-backed buffers: page_order() is always 0, @data is not examined. */
static inline int page_order(struct perf_mmap_data *data)
{
        return 0;
}
2484
2485 #else
2486
2487 /*
2488  * Back perf_mmap() with vmalloc memory.
2489  *
2490  * Required for architectures that have d-cache aliasing issues.
2491  */
2492
2493 static inline int page_order(struct perf_mmap_data *data)
2494 {
2495         return data->page_order;
2496 }
2497
2498 static struct page *
2499 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2500 {
2501         if (pgoff > (1UL << page_order(data)))
2502                 return NULL;
2503
2504         return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2505 }
2506
2507 static void perf_mmap_unmark_page(void *addr)
2508 {
2509         struct page *page = vmalloc_to_page(addr);
2510
2511         page->mapping = NULL;
2512 }
2513
2514 static void perf_mmap_data_free_work(struct work_struct *work)
2515 {
2516         struct perf_mmap_data *data;
2517         void *base;
2518         int i, nr;
2519
2520         data = container_of(work, struct perf_mmap_data, work);
2521         nr = 1 << page_order(data);
2522
2523         base = data->user_page;
2524         for (i = 0; i < nr + 1; i++)
2525                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2526
2527         vfree(base);
2528         kfree(data);
2529 }
2530
2531 static void perf_mmap_data_free(struct perf_mmap_data *data)
2532 {
2533         schedule_work(&data->work);
2534 }
2535
2536 static struct perf_mmap_data *
2537 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2538 {
2539         struct perf_mmap_data *data;
2540         unsigned long size;
2541         void *all_buf;
2542
2543         size = sizeof(struct perf_mmap_data);
2544         size += sizeof(void *);
2545
2546         data = kzalloc(size, GFP_KERNEL);
2547         if (!data)
2548                 goto fail;
2549
2550         INIT_WORK(&data->work, perf_mmap_data_free_work);
2551
2552         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2553         if (!all_buf)
2554                 goto fail_all_buf;
2555
2556         data->user_page = all_buf;
2557         data->data_pages[0] = all_buf + PAGE_SIZE;
2558         data->page_order = ilog2(nr_pages);
2559         data->nr_pages = 1;
2560
2561         return data;
2562
2563 fail_all_buf:
2564         kfree(data);
2565
2566 fail:
2567         return NULL;
2568 }
2569
2570 #endif
2571
2572 static unsigned long perf_data_size(struct perf_mmap_data *data)
2573 {
2574         return data->nr_pages << (PAGE_SHIFT + page_order(data));
2575 }
2576
2577 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2578 {
2579         struct perf_event *event = vma->vm_file->private_data;
2580         struct perf_mmap_data *data;
2581         int ret = VM_FAULT_SIGBUS;
2582
2583         if (vmf->flags & FAULT_FLAG_MKWRITE) {
2584                 if (vmf->pgoff == 0)
2585                         ret = 0;
2586                 return ret;
2587         }
2588
2589         rcu_read_lock();
2590         data = rcu_dereference(event->data);
2591         if (!data)
2592                 goto unlock;
2593
2594         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2595                 goto unlock;
2596
2597         vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2598         if (!vmf->page)
2599                 goto unlock;
2600
2601         get_page(vmf->page);
2602         vmf->page->mapping = vma->vm_file->f_mapping;
2603         vmf->page->index   = vmf->pgoff;
2604
2605         ret = 0;
2606 unlock:
2607         rcu_read_unlock();
2608
2609         return ret;
2610 }
2611
2612 static void
2613 perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2614 {
2615         long max_size = perf_data_size(data);
2616
2617         if (event->attr.watermark) {
2618                 data->watermark = min_t(long, max_size,
2619                                         event->attr.wakeup_watermark);
2620         }
2621
2622         if (!data->watermark)
2623                 data->watermark = max_size / 2;
2624
2625         atomic_set(&data->refcount, 1);
2626         rcu_assign_pointer(event->data, data);
2627 }
2628
2629 static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2630 {
2631         struct perf_mmap_data *data;
2632
2633         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2634         perf_mmap_data_free(data);
2635 }
2636
2637 static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
2638 {
2639         struct perf_mmap_data *data;
2640
2641         rcu_read_lock();
2642         data = rcu_dereference(event->data);
2643         if (data) {
2644                 if (!atomic_inc_not_zero(&data->refcount))
2645                         data = NULL;
2646         }
2647         rcu_read_unlock();
2648
2649         return data;
2650 }
2651
2652 static void perf_mmap_data_put(struct perf_mmap_data *data)
2653 {
2654         if (!atomic_dec_and_test(&data->refcount))
2655                 return;
2656
2657         call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2658 }
2659
2660 static void perf_mmap_open(struct vm_area_struct *vma)
2661 {
2662         struct perf_event *event = vma->vm_file->private_data;
2663
2664         atomic_inc(&event->mmap_count);
2665 }
2666
2667 static void perf_mmap_close(struct vm_area_struct *vma)
2668 {
2669         struct perf_event *event = vma->vm_file->private_data;
2670
2671         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2672                 unsigned long size = perf_data_size(event->data);
2673                 struct user_struct *user = event->mmap_user;
2674                 struct perf_mmap_data *data = event->data;
2675
2676                 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2677                 vma->vm_mm->locked_vm -= event->mmap_locked;
2678                 rcu_assign_pointer(event->data, NULL);
2679                 mutex_unlock(&event->mmap_mutex);
2680
2681                 perf_mmap_data_put(data);
2682                 free_uid(user);
2683         }
2684 }
2685
2686 static const struct vm_operations_struct perf_mmap_vmops = {
2687         .open           = perf_mmap_open,
2688         .close          = perf_mmap_close,
2689         .fault          = perf_mmap_fault,
2690         .page_mkwrite   = perf_mmap_fault,
2691 };
2692
2693 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2694 {
2695         struct perf_event *event = file->private_data;
2696         unsigned long user_locked, user_lock_limit;
2697         struct user_struct *user = current_user();
2698         unsigned long locked, lock_limit;
2699         struct perf_mmap_data *data;
2700         unsigned long vma_size;
2701         unsigned long nr_pages;
2702         long user_extra, extra;
2703         int ret = 0;
2704
2705         /*
2706          * Don't allow mmap() of inherited per-task counters. This would
2707          * create a performance issue due to all children writing to the
2708          * same buffer.
2709          */
2710         if (event->cpu == -1 && event->attr.inherit)
2711                 return -EINVAL;
2712
2713         if (!(vma->vm_flags & VM_SHARED))
2714                 return -EINVAL;
2715
2716         vma_size = vma->vm_end - vma->vm_start;
2717         nr_pages = (vma_size / PAGE_SIZE) - 1;
2718
2719         /*
2720          * If we have data pages ensure they're a power-of-two number, so we
2721          * can do bitmasks instead of modulo.
2722          */
2723         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2724                 return -EINVAL;
2725
2726         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2727                 return -EINVAL;
2728
2729         if (vma->vm_pgoff != 0)
2730                 return -EINVAL;
2731
2732         WARN_ON_ONCE(event->ctx->parent_ctx);
2733         mutex_lock(&event->mmap_mutex);
2734         if (event->data) {
2735                 if (event->data->nr_pages == nr_pages)
2736                         atomic_inc(&event->data->refcount);
2737                 else
2738                         ret = -EINVAL;
2739                 goto unlock;
2740         }
2741
2742         user_extra = nr_pages + 1;
2743         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2744
2745         /*
2746          * Increase the limit linearly with more CPUs:
2747          */
2748         user_lock_limit *= num_online_cpus();
2749
2750         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2751
2752         extra = 0;
2753         if (user_locked > user_lock_limit)
2754                 extra = user_locked - user_lock_limit;
2755
2756         lock_limit = rlimit(RLIMIT_MEMLOCK);
2757         lock_limit >>= PAGE_SHIFT;
2758         locked = vma->vm_mm->locked_vm + extra;
2759
2760         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2761                 !capable(CAP_IPC_LOCK)) {
2762                 ret = -EPERM;
2763                 goto unlock;
2764         }
2765
2766         WARN_ON(event->data);
2767
2768         data = perf_mmap_data_alloc(event, nr_pages);
2769         if (!data) {
2770                 ret = -ENOMEM;
2771                 goto unlock;
2772         }
2773
2774         perf_mmap_data_init(event, data);
2775         if (vma->vm_flags & VM_WRITE)
2776                 event->data->writable = 1;
2777
2778         atomic_long_add(user_extra, &user->locked_vm);
2779         event->mmap_locked = extra;
2780         event->mmap_user = get_current_user();
2781         vma->vm_mm->locked_vm += event->mmap_locked;
2782
2783 unlock:
2784         if (!ret)
2785                 atomic_inc(&event->mmap_count);
2786         mutex_unlock(&event->mmap_mutex);
2787
2788         vma->vm_flags |= VM_RESERVED;
2789         vma->vm_ops = &perf_mmap_vmops;
2790
2791         return ret;
2792 }
2793
2794 static int perf_fasync(int fd, struct file *filp, int on)
2795 {
2796         struct inode *inode = filp->f_path.dentry->d_inode;
2797         struct perf_event *event = filp->private_data;
2798         int retval;
2799
2800         mutex_lock(&inode->i_mutex);
2801         retval = fasync_helper(fd, filp, on, &event->fasync);
2802         mutex_unlock(&inode->i_mutex);
2803
2804         if (retval < 0)
2805                 return retval;
2806
2807         return 0;
2808 }
2809
2810 static const struct file_operations perf_fops = {
2811         .llseek                 = no_llseek,
2812         .release                = perf_release,
2813         .read                   = perf_read,
2814         .poll                   = perf_poll,
2815         .unlocked_ioctl         = perf_ioctl,
2816         .compat_ioctl           = perf_ioctl,
2817         .mmap                   = perf_mmap,
2818         .fasync                 = perf_fasync,
2819 };
2820
2821 /*
2822  * Perf event wakeup
2823  *
2824  * If there's data, ensure we set the poll() state and publish everything
2825  * to user-space before waking everybody up.
2826  */
2827
2828 void perf_event_wakeup(struct perf_event *event)
2829 {
2830         wake_up_all(&event->waitq);
2831
2832         if (event->pending_kill) {
2833                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2834                 event->pending_kill = 0;
2835         }
2836 }
2837
2838 /*
2839  * Pending wakeups
2840  *
2841  * Handle the case where we need to wake up from NMI (or rq->lock) context.
2842  *
2843  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2844  * single linked list and use cmpxchg() to add entries lockless.
2845  */
2846
2847 static void perf_pending_event(struct perf_pending_entry *entry)
2848 {
2849         struct perf_event *event = container_of(entry,
2850                         struct perf_event, pending);
2851
2852         if (event->pending_disable) {
2853                 event->pending_disable = 0;
2854                 __perf_event_disable(event);
2855         }
2856
2857         if (event->pending_wakeup) {
2858                 event->pending_wakeup = 0;
2859                 perf_event_wakeup(event);
2860         }
2861 }
2862
2863 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2864
2865 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2866         PENDING_TAIL,
2867 };
2868
2869 static void perf_pending_queue(struct perf_pending_entry *entry,
2870                                void (*func)(struct perf_pending_entry *))
2871 {
2872         struct perf_pending_entry **head;
2873
2874         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2875                 return;
2876
2877         entry->func = func;
2878
2879         head = &get_cpu_var(perf_pending_head);
2880
2881         do {
2882                 entry->next = *head;
2883         } while (cmpxchg(head, entry->next, entry) != entry->next);
2884
2885         set_perf_event_pending();
2886
2887         put_cpu_var(perf_pending_head);
2888 }
2889
2890 static int __perf_pending_run(void)
2891 {
2892         struct perf_pending_entry *list;
2893         int nr = 0;
2894
2895         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2896         while (list != PENDING_TAIL) {
2897                 void (*func)(struct perf_pending_entry *);
2898                 struct perf_pending_entry *entry = list;
2899
2900                 list = list->next;
2901
2902                 func = entry->func;
2903                 entry->next = NULL;
2904                 /*
2905                  * Ensure we observe the unqueue before we issue the wakeup,
2906                  * so that we won't be waiting forever.
2907                  * -- see perf_not_pending().
2908                  */
2909                 smp_wmb();
2910
2911                 func(entry);
2912                 nr++;
2913         }
2914
2915         return nr;
2916 }
2917
2918 static inline int perf_not_pending(struct perf_event *event)
2919 {
2920         /*
2921          * If we flush on whatever cpu we run, there is a chance we don't
2922          * need to wait.
2923          */
2924         get_cpu();
2925         __perf_pending_run();
2926         put_cpu();
2927
2928         /*
2929          * Ensure we see the proper queue state before going to sleep
2930          * so that we do not miss the wakeup. -- see perf_pending_handle()
2931          */
2932         smp_rmb();
2933         return event->pending.next == NULL;
2934 }
2935
2936 static void perf_pending_sync(struct perf_event *event)
2937 {
2938         wait_event(event->waitq, perf_not_pending(event));
2939 }
2940
/*
 * Externally visible entry point that runs the current CPU's pending
 * list; the number of processed entries is discarded.
 */
void perf_event_do_pending(void)
{
	(void)__perf_pending_run();
}
2945
2946 /*
2947  * Callchain support -- arch specific
2948  */
2949
2950 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2951 {
2952         return NULL;
2953 }
2954
2955 __weak
2956 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2957 {
2958 }
2959
2960
2961 /*
2962  * We assume there is only KVM supporting the callbacks.
2963  * Later on, we might change it to a list if there is
2964  * another virtualization implementation supporting the callbacks.
2965  */
/* The single registered callback set, or NULL when none is registered. */
struct perf_guest_info_callbacks *perf_guest_cbs;

/*
 * Install @cbs as the guest-info callback set, overwriting whatever was
 * stored before.  Always returns 0.
 */
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
	perf_guest_cbs = cbs;

	return 0;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
2974
2975 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2976 {
2977         perf_guest_cbs = NULL;
2978         return 0;
2979 }
2980 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2981
2982 /*
2983  * Output
2984  */
2985 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2986                               unsigned long offset, unsigned long head)
2987 {
2988         unsigned long mask;
2989
2990         if (!data->writable)
2991                 return true;
2992
2993         mask = perf_data_size(data) - 1;
2994
2995         offset = (offset - tail) & mask;
2996         head   = (head   - tail) & mask;
2997
2998         if ((int)(head - offset) < 0)
2999                 return false;
3000
3001         return true;
3002 }
3003
3004 static void perf_output_wakeup(struct perf_output_handle *handle)
3005 {
3006         atomic_set(&handle->data->poll, POLL_IN);
3007
3008         if (handle->nmi) {
3009                 handle->event->pending_wakeup = 1;
3010                 perf_pending_queue(&handle->event->pending,
3011                                    perf_pending_event);
3012         } else
3013                 perf_event_wakeup(handle->event);
3014 }
3015
3016 /*
3017  * We need to ensure a later event_id doesn't publish a head when a former
3018  * event isn't done writing. However since we need to deal with NMIs we
3019  * cannot fully serialize things.
3020  *
3021  * We only publish the head (and generate a wakeup) when the outer-most
3022  * event completes.
3023  */
3024 static void perf_output_get_handle(struct perf_output_handle *handle)
3025 {
3026         struct perf_mmap_data *data = handle->data;
3027
3028         preempt_disable();
3029         local_inc(&data->nest);
3030         handle->wakeup = local_read(&data->wakeup);
3031 }
3032
3033 static void perf_output_put_handle(struct perf_output_handle *handle)
3034 {
3035         struct perf_mmap_data *data = handle->data;
3036         unsigned long head;
3037
3038 again:
3039         head = local_read(&data->head);
3040
3041         /*
3042          * IRQ/NMI can happen here, which means we can miss a head update.
3043          */
3044
3045         if (!local_dec_and_test(&data->nest))
3046                 goto out;
3047
3048         /*
3049          * Publish the known good head. Rely on the full barrier implied
3050          * by atomic_dec_and_test() order the data->head read and this
3051          * write.
3052          */
3053         data->user_page->data_head = head;
3054
3055         /*
3056          * Now check if we missed an update, rely on the (compiler)
3057          * barrier in atomic_dec_and_test() to re-read data->head.
3058          */
3059         if (unlikely(head != local_read(&data->head))) {
3060                 local_inc(&data->nest);
3061                 goto again;
3062         }
3063
3064         if (handle->wakeup != local_read(&data->wakeup))
3065                 perf_output_wakeup(handle);
3066
3067  out:
3068         preempt_enable();
3069 }
3070
3071 __always_inline void perf_output_copy(struct perf_output_handle *handle,
3072                       const void *buf, unsigned int len)
3073 {
3074         do {
3075                 unsigned long size = min_t(unsigned long, handle->size, len);
3076
3077                 memcpy(handle->addr, buf, size);
3078
3079                 len -= size;
3080                 handle->addr += size;
3081                 buf += size;
3082                 handle->size -= size;
3083                 if (!handle->size) {
3084                         struct perf_mmap_data *data = handle->data;
3085
3086                         handle->page++;
3087                         handle->page &= data->nr_pages - 1;
3088                         handle->addr = data->data_pages[handle->page];
3089                         handle->size = PAGE_SIZE << page_order(data);
3090                 }
3091         } while (len);
3092 }
3093
3094 int perf_output_begin(struct perf_output_handle *handle,
3095                       struct perf_event *event, unsigned int size,
3096                       int nmi, int sample)
3097 {
3098         struct perf_mmap_data *data;
3099         unsigned long tail, offset, head;
3100         int have_lost;
3101         struct {
3102                 struct perf_event_header header;
3103                 u64                      id;
3104                 u64                      lost;
3105         } lost_event;
3106
3107         rcu_read_lock();
3108         /*
3109          * For inherited events we send all the output towards the parent.
3110          */
3111         if (event->parent)
3112                 event = event->parent;
3113
3114         data = rcu_dereference(event->data);
3115         if (!data)
3116                 goto out;
3117
3118         handle->data    = data;
3119         handle->event   = event;
3120         handle->nmi     = nmi;
3121         handle->sample  = sample;
3122
3123         if (!data->nr_pages)
3124                 goto out;
3125
3126         have_lost = local_read(&data->lost);
3127         if (have_lost)
3128                 size += sizeof(lost_event);
3129
3130         perf_output_get_handle(handle);
3131
3132         do {
3133                 /*
3134                  * Userspace could choose to issue a mb() before updating the
3135                  * tail pointer. So that all reads will be completed before the
3136                  * write is issued.
3137                  */
3138                 tail = ACCESS_ONCE(data->user_page->data_tail);
3139                 smp_rmb();
3140                 offset = head = local_read(&data->head);
3141                 head += size;
3142                 if (unlikely(!perf_output_space(data, tail, offset, head)))
3143                         goto fail;
3144         } while (local_cmpxchg(&data->head, offset, head) != offset);
3145
3146         if (head - local_read(&data->wakeup) > data->watermark)
3147                 local_add(data->watermark, &data->wakeup);
3148
3149         handle->page = offset >> (PAGE_SHIFT + page_order(data));
3150         handle->page &= data->nr_pages - 1;
3151         handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
3152         handle->addr = data->data_pages[handle->page];
3153         handle->addr += handle->size;
3154         handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
3155
3156         if (have_lost) {
3157                 lost_event.header.type = PERF_RECORD_LOST;
3158                 lost_event.header.misc = 0;
3159                 lost_event.header.size = sizeof(lost_event);
3160                 lost_event.id          = event->id;
3161                 lost_event.lost        = local_xchg(&data->lost, 0);
3162
3163                 perf_output_put(handle, lost_event);
3164         }
3165
3166         return 0;
3167
3168 fail:
3169         local_inc(&data->lost);
3170         perf_output_put_handle(handle);
3171 out:
3172         rcu_read_unlock();
3173
3174         return -ENOSPC;
3175 }
3176
3177 void perf_output_end(struct perf_output_handle *handle)
3178 {
3179         struct perf_event *event = handle->event;
3180         struct perf_mmap_data *data = handle->data;
3181
3182         int wakeup_events = event->attr.wakeup_events;
3183
3184         if (handle->sample && wakeup_events) {
3185                 int events = local_inc_return(&data->events);
3186                 if (events >= wakeup_events) {
3187                         local_sub(wakeup_events, &data->events);
3188                         local_inc(&data->wakeup);
3189                 }
3190         }
3191
3192         perf_output_put_handle(handle);
3193         rcu_read_unlock();
3194 }
3195
3196 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3197 {
3198         /*
3199          * only top level events have the pid namespace they were created in
3200          */
3201         if (event->parent)
3202                 event = event->parent;
3203
3204         return task_tgid_nr_ns(p, event->ns);
3205 }
3206
3207 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3208 {
3209         /*
3210          * only top level events have the pid namespace they were created in
3211          */
3212         if (event->parent)
3213                 event = event->parent;
3214
3215         return task_pid_nr_ns(p, event->ns);
3216 }
3217
3218 static void perf_output_read_one(struct perf_output_handle *handle,
3219                                  struct perf_event *event)
3220 {
3221         u64 read_format = event->attr.read_format;
3222         u64 values[4];
3223         int n = 0;
3224
3225         values[n++] = atomic64_read(&event->count);
3226         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3227                 values[n++] = event->total_time_enabled +
3228                         atomic64_read(&event->child_total_time_enabled);
3229         }
3230         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3231                 values[n++] = event->total_time_running +
3232                         atomic64_read(&event->child_total_time_running);
3233         }
3234         if (read_format & PERF_FORMAT_ID)
3235                 values[n++] = primary_event_id(event);
3236
3237         perf_output_copy(handle, values, n * sizeof(u64));
3238 }
3239
3240 /*
3241  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3242  */
3243 static void perf_output_read_group(struct perf_output_handle *handle,
3244                             struct perf_event *event)
3245 {
3246         struct perf_event *leader = event->group_leader, *sub;
3247         u64 read_format = event->attr.read_format;
3248         u64 values[5];
3249         int n = 0;
3250
3251         values[n++] = 1 + leader->nr_siblings;
3252
3253         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3254                 values[n++] = leader->total_time_enabled;
3255
3256         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3257                 values[n++] = leader->total_time_running;
3258
3259         if (leader != event)
3260                 leader->pmu->read(leader);
3261
3262         values[n++] = atomic64_read(&leader->count);
3263         if (read_format & PERF_FORMAT_ID)
3264                 values[n++] = primary_event_id(leader);
3265
3266         perf_output_copy(handle, values, n * sizeof(u64));
3267
3268         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3269                 n = 0;
3270
3271                 if (sub != event)
3272                         sub->pmu->read(sub);
3273
3274                 values[n++] = atomic64_read(&sub->count);
3275                 if (read_format & PERF_FORMAT_ID)
3276                         values[n++] = primary_event_id(sub);
3277
3278                 perf_output_copy(handle, values, n * sizeof(u64));
3279         }
3280 }
3281
3282 static void perf_output_read(struct perf_output_handle *handle,
3283                              struct perf_event *event)
3284 {
3285         if (event->attr.read_format & PERF_FORMAT_GROUP)
3286                 perf_output_read_group(handle, event);
3287         else
3288                 perf_output_read_one(handle, event);
3289 }
3290
3291 void perf_output_sample(struct perf_output_handle *handle,
3292                         struct perf_event_header *header,
3293                         struct perf_sample_data *data,
3294                         struct perf_event *event)
3295 {
3296         u64 sample_type = data->type;
3297
3298         perf_output_put(handle, *header);
3299
3300         if (sample_type & PERF_SAMPLE_IP)
3301                 perf_output_put(handle, data->ip);
3302
3303         if (sample_type & PERF_SAMPLE_TID)
3304                 perf_output_put(handle, data->tid_entry);
3305
3306         if (sample_type & PERF_SAMPLE_TIME)
3307                 perf_output_put(handle, data->time);
3308
3309         if (sample_type & PERF_SAMPLE_ADDR)
3310                 perf_output_put(handle, data->addr);
3311
3312         if (sample_type & PERF_SAMPLE_ID)
3313                 perf_output_put(handle, data->id);
3314
3315         if (sample_type & PERF_SAMPLE_STREAM_ID)
3316                 perf_output_put(handle, data->stream_id);
3317
3318         if (sample_type & PERF_SAMPLE_CPU)
3319                 perf_output_put(handle, data->cpu_entry);
3320
3321         if (sample_type & PERF_SAMPLE_PERIOD)
3322                 perf_output_put(handle, data->period);
3323
3324         if (sample_type & PERF_SAMPLE_READ)
3325                 perf_output_read(handle, event);
3326
3327         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3328                 if (data->callchain) {
3329                         int size = 1;
3330
3331                         if (data->callchain)
3332                                 size += data->callchain->nr;
3333
3334                         size *= sizeof(u64);
3335
3336                         perf_output_copy(handle, data->callchain, size);
3337                 } else {
3338                         u64 nr = 0;
3339                         perf_output_put(handle, nr);
3340                 }
3341         }
3342
3343         if (sample_type & PERF_SAMPLE_RAW) {
3344                 if (data->raw) {
3345                         perf_output_put(handle, data->raw->size);
3346                         perf_output_copy(handle, data->raw->data,
3347                                          data->raw->size);
3348                 } else {
3349                         struct {
3350                                 u32     size;
3351                                 u32     data;
3352                         } raw = {
3353                                 .size = sizeof(u32),
3354                                 .data = 0,
3355                         };
3356                         perf_output_put(handle, raw);
3357                 }
3358         }
3359 }
3360
3361 void perf_prepare_sample(struct perf_event_header *header,
3362                          struct perf_sample_data *data,
3363                          struct perf_event *event,
3364                          struct pt_regs *regs)
3365 {
3366         u64 sample_type = event->attr.sample_type;
3367
3368         data->type = sample_type;
3369
3370         header->type = PERF_RECORD_SAMPLE;
3371         header->size = sizeof(*header);
3372
3373         header->misc = 0;
3374         header->misc |= perf_misc_flags(regs);
3375
3376         if (sample_type & PERF_SAMPLE_IP) {
3377                 data->ip = perf_instruction_pointer(regs);
3378
3379                 header->size += sizeof(data->ip);
3380         }
3381
3382         if (sample_type & PERF_SAMPLE_TID) {
3383                 /* namespace issues */
3384                 data->tid_entry.pid = perf_event_pid(event, current);
3385                 data->tid_entry.tid = perf_event_tid(event, current);
3386
3387                 header->size += sizeof(data->tid_entry);
3388         }
3389
3390         if (sample_type & PERF_SAMPLE_TIME) {
3391                 data->time = perf_clock();
3392
3393                 header->size += sizeof(data->time);
3394         }
3395
3396         if (sample_type & PERF_SAMPLE_ADDR)
3397                 header->size += sizeof(data->addr);
3398
3399         if (sample_type & PERF_SAMPLE_ID) {
3400                 data->id = primary_event_id(event);
3401
3402                 header->size += sizeof(data->id);
3403         }
3404
3405         if (sample_type & PERF_SAMPLE_STREAM_ID) {
3406                 data->stream_id = event->id;
3407
3408                 header->size += sizeof(data->stream_id);
3409         }
3410
3411         if (sample_type & PERF_SAMPLE_CPU) {
3412                 data->cpu_entry.cpu             = raw_smp_processor_id();
3413                 data->cpu_entry.reserved        = 0;
3414
3415                 header->size += sizeof(data->cpu_entry);
3416         }
3417
3418         if (sample_type & PERF_SAMPLE_PERIOD)
3419                 header->size += sizeof(data->period);
3420
3421         if (sample_type & PERF_SAMPLE_READ)
3422                 header->size += perf_event_read_size(event);
3423
3424         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3425                 int size = 1;
3426
3427                 data->callchain = perf_callchain(regs);
3428
3429                 if (data->callchain)
3430                         size += data->callchain->nr;
3431
3432                 header->size += size * sizeof(u64);
3433         }
3434
3435         if (sample_type & PERF_SAMPLE_RAW) {
3436                 int size = sizeof(u32);
3437
3438                 if (data->raw)
3439                         size += data->raw->size;
3440                 else
3441                         size += sizeof(u32);
3442
3443                 WARN_ON_ONCE(size & (sizeof(u64)-1));
3444                 header->size += size;
3445         }
3446 }
3447
3448 static void perf_event_output(struct perf_event *event, int nmi,
3449                                 struct perf_sample_data *data,
3450                                 struct pt_regs *regs)
3451 {
3452         struct perf_output_handle handle;
3453         struct perf_event_header header;
3454
3455         perf_prepare_sample(&header, data, event, regs);
3456
3457         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3458                 return;
3459
3460         perf_output_sample(&handle, &header, data, event);
3461
3462         perf_output_end(&handle);
3463 }
3464
3465 /*
3466  * read event_id
3467  */
3468
3469 struct perf_read_event {
3470         struct perf_event_header        header;
3471
3472         u32                             pid;
3473         u32                             tid;
3474 };
3475
3476 static void
3477 perf_event_read_event(struct perf_event *event,
3478                         struct task_struct *task)
3479 {
3480         struct perf_output_handle handle;
3481         struct perf_read_event read_event = {
3482                 .header = {
3483                         .type = PERF_RECORD_READ,
3484                         .misc = 0,
3485                         .size = sizeof(read_event) + perf_event_read_size(event),
3486                 },
3487                 .pid = perf_event_pid(event, task),
3488                 .tid = perf_event_tid(event, task),
3489         };
3490         int ret;
3491
3492         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3493         if (ret)
3494                 return;
3495
3496         perf_output_put(&handle, read_event);
3497         perf_output_read(&handle, event);
3498
3499         perf_output_end(&handle);
3500 }
3501
3502 /*
3503  * task tracking -- fork/exit
3504  *
3505  * enabled by: attr.comm | attr.mmap | attr.task
3506  */
3507
3508 struct perf_task_event {
3509         struct task_struct              *task;
3510         struct perf_event_context       *task_ctx;
3511
3512         struct {
3513                 struct perf_event_header        header;
3514
3515                 u32                             pid;
3516                 u32                             ppid;
3517                 u32                             tid;
3518                 u32                             ptid;
3519                 u64                             time;
3520         } event_id;
3521 };
3522
3523 static void perf_event_task_output(struct perf_event *event,
3524                                      struct perf_task_event *task_event)
3525 {
3526         struct perf_output_handle handle;
3527         struct task_struct *task = task_event->task;
3528         int size, ret;
3529
3530         size  = task_event->event_id.header.size;
3531         ret = perf_output_begin(&handle, event, size, 0, 0);
3532
3533         if (ret)
3534                 return;
3535
3536         task_event->event_id.pid = perf_event_pid(event, task);
3537         task_event->event_id.ppid = perf_event_pid(event, current);
3538
3539         task_event->event_id.tid = perf_event_tid(event, task);
3540         task_event->event_id.ptid = perf_event_tid(event, current);
3541
3542         perf_output_put(&handle, task_event->event_id);
3543
3544         perf_output_end(&handle);
3545 }
3546
3547 static int perf_event_task_match(struct perf_event *event)
3548 {
3549         if (event->state < PERF_EVENT_STATE_INACTIVE)
3550                 return 0;
3551
3552         if (event->cpu != -1 && event->cpu != smp_processor_id())
3553                 return 0;
3554
3555         if (event->attr.comm || event->attr.mmap || event->attr.task)
3556                 return 1;
3557
3558         return 0;
3559 }
3560
3561 static void perf_event_task_ctx(struct perf_event_context *ctx,
3562                                   struct perf_task_event *task_event)
3563 {
3564         struct perf_event *event;
3565
3566         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3567                 if (perf_event_task_match(event))
3568                         perf_event_task_output(event, task_event);
3569         }
3570 }
3571
3572 static void perf_event_task_event(struct perf_task_event *task_event)
3573 {
3574         struct perf_cpu_context *cpuctx;
3575         struct perf_event_context *ctx = task_event->task_ctx;
3576
3577         rcu_read_lock();
3578         cpuctx = &get_cpu_var(perf_cpu_context);
3579         perf_event_task_ctx(&cpuctx->ctx, task_event);
3580         if (!ctx)
3581                 ctx = rcu_dereference(current->perf_event_ctxp);
3582         if (ctx)
3583                 perf_event_task_ctx(ctx, task_event);
3584         put_cpu_var(perf_cpu_context);
3585         rcu_read_unlock();
3586 }
3587
3588 static void perf_event_task(struct task_struct *task,
3589                               struct perf_event_context *task_ctx,
3590                               int new)
3591 {
3592         struct perf_task_event task_event;
3593
3594         if (!atomic_read(&nr_comm_events) &&
3595             !atomic_read(&nr_mmap_events) &&
3596             !atomic_read(&nr_task_events))
3597                 return;
3598
3599         task_event = (struct perf_task_event){
3600                 .task     = task,
3601                 .task_ctx = task_ctx,
3602                 .event_id    = {
3603                         .header = {
3604                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3605                                 .misc = 0,
3606                                 .size = sizeof(task_event.event_id),
3607                         },
3608                         /* .pid  */
3609                         /* .ppid */
3610                         /* .tid  */
3611                         /* .ptid */
3612                         .time = perf_clock(),
3613                 },
3614         };
3615
3616         perf_event_task_event(&task_event);
3617 }
3618
3619 void perf_event_fork(struct task_struct *task)
3620 {
3621         perf_event_task(task, NULL, 1);
3622 }
3623
3624 /*
3625  * comm tracking
3626  */
3627
3628 struct perf_comm_event {
3629         struct task_struct      *task;
3630         char                    *comm;
3631         int                     comm_size;
3632
3633         struct {
3634                 struct perf_event_header        header;
3635
3636                 u32                             pid;
3637                 u32                             tid;
3638         } event_id;
3639 };
3640
3641 static void perf_event_comm_output(struct perf_event *event,
3642                                      struct perf_comm_event *comm_event)
3643 {
3644         struct perf_output_handle handle;
3645         int size = comm_event->event_id.header.size;
3646         int ret = perf_output_begin(&handle, event, size, 0, 0);
3647
3648         if (ret)
3649                 return;
3650
3651         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3652         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3653
3654         perf_output_put(&handle, comm_event->event_id);
3655         perf_output_copy(&handle, comm_event->comm,
3656                                    comm_event->comm_size);
3657         perf_output_end(&handle);
3658 }
3659
3660 static int perf_event_comm_match(struct perf_event *event)
3661 {
3662         if (event->state < PERF_EVENT_STATE_INACTIVE)
3663                 return 0;
3664
3665         if (event->cpu != -1 && event->cpu != smp_processor_id())
3666                 return 0;
3667
3668         if (event->attr.comm)
3669                 return 1;
3670
3671         return 0;
3672 }
3673
3674 static void perf_event_comm_ctx(struct perf_event_context *ctx,
3675                                   struct perf_comm_event *comm_event)
3676 {
3677         struct perf_event *event;
3678
3679         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3680                 if (perf_event_comm_match(event))
3681                         perf_event_comm_output(event, comm_event);
3682         }
3683 }
3684
3685 static void perf_event_comm_event(struct perf_comm_event *comm_event)
3686 {
3687         struct perf_cpu_context *cpuctx;
3688         struct perf_event_context *ctx;
3689         unsigned int size;
3690         char comm[TASK_COMM_LEN];
3691
3692         memset(comm, 0, sizeof(comm));
3693         strlcpy(comm, comm_event->task->comm, sizeof(comm));
3694         size = ALIGN(strlen(comm)+1, sizeof(u64));
3695
3696         comm_event->comm = comm;
3697         comm_event->comm_size = size;
3698
3699         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3700
3701         rcu_read_lock();
3702         cpuctx = &get_cpu_var(perf_cpu_context);
3703         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3704         ctx = rcu_dereference(current->perf_event_ctxp);
3705         if (ctx)
3706                 perf_event_comm_ctx(ctx, comm_event);
3707         put_cpu_var(perf_cpu_context);
3708         rcu_read_unlock();
3709 }
3710
3711 void perf_event_comm(struct task_struct *task)
3712 {
3713         struct perf_comm_event comm_event;
3714
3715         if (task->perf_event_ctxp)
3716                 perf_event_enable_on_exec(task);
3717
3718         if (!atomic_read(&nr_comm_events))
3719                 return;
3720
3721         comm_event = (struct perf_comm_event){
3722                 .task   = task,
3723                 /* .comm      */
3724                 /* .comm_size */
3725                 .event_id  = {
3726                         .header = {
3727                                 .type = PERF_RECORD_COMM,
3728                                 .misc = 0,
3729                                 /* .size */
3730                         },
3731                         /* .pid */
3732                         /* .tid */
3733                 },
3734         };
3735
3736         perf_event_comm_event(&comm_event);
3737 }
3738
3739 /*
3740  * mmap tracking
3741  */
3742
3743 struct perf_mmap_event {
3744         struct vm_area_struct   *vma;
3745
3746         const char              *file_name;
3747         int                     file_size;
3748
3749         struct {
3750                 struct perf_event_header        header;
3751
3752                 u32                             pid;
3753                 u32                             tid;
3754                 u64                             start;
3755                 u64                             len;
3756                 u64                             pgoff;
3757         } event_id;
3758 };
3759
3760 static void perf_event_mmap_output(struct perf_event *event,
3761                                      struct perf_mmap_event *mmap_event)
3762 {
3763         struct perf_output_handle handle;
3764         int size = mmap_event->event_id.header.size;
3765         int ret = perf_output_begin(&handle, event, size, 0, 0);
3766
3767         if (ret)
3768                 return;
3769
3770         mmap_event->event_id.pid = perf_event_pid(event, current);
3771         mmap_event->event_id.tid = perf_event_tid(event, current);
3772
3773         perf_output_put(&handle, mmap_event->event_id);
3774         perf_output_copy(&handle, mmap_event->file_name,
3775                                    mmap_event->file_size);
3776         perf_output_end(&handle);
3777 }
3778
3779 static int perf_event_mmap_match(struct perf_event *event,
3780                                    struct perf_mmap_event *mmap_event)
3781 {
3782         if (event->state < PERF_EVENT_STATE_INACTIVE)
3783                 return 0;
3784
3785         if (event->cpu != -1 && event->cpu != smp_processor_id())
3786                 return 0;
3787
3788         if (event->attr.mmap)
3789                 return 1;
3790
3791         return 0;
3792 }
3793
3794 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3795                                   struct perf_mmap_event *mmap_event)
3796 {
3797         struct perf_event *event;
3798
3799         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3800                 if (perf_event_mmap_match(event, mmap_event))
3801                         perf_event_mmap_output(event, mmap_event);
3802         }
3803 }
3804
3805 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3806 {
3807         struct perf_cpu_context *cpuctx;
3808         struct perf_event_context *ctx;
3809         struct vm_area_struct *vma = mmap_event->vma;
3810         struct file *file = vma->vm_file;
3811         unsigned int size;
3812         char tmp[16];
3813         char *buf = NULL;
3814         const char *name;
3815
3816         memset(tmp, 0, sizeof(tmp));
3817
3818         if (file) {
3819                 /*
3820                  * d_path works from the end of the buffer backwards, so we
3821                  * need to add enough zero bytes after the string to handle
3822                  * the 64bit alignment we do later.
3823                  */
3824                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3825                 if (!buf) {
3826                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3827                         goto got_name;
3828                 }
3829                 name = d_path(&file->f_path, buf, PATH_MAX);
3830                 if (IS_ERR(name)) {
3831                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3832                         goto got_name;
3833                 }
3834         } else {
3835                 if (arch_vma_name(mmap_event->vma)) {
3836                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3837                                        sizeof(tmp));
3838                         goto got_name;
3839                 }
3840
3841                 if (!vma->vm_mm) {
3842                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3843                         goto got_name;
3844                 }
3845
3846                 name = strncpy(tmp, "//anon", sizeof(tmp));
3847                 goto got_name;
3848         }
3849
3850 got_name:
3851         size = ALIGN(strlen(name)+1, sizeof(u64));
3852
3853         mmap_event->file_name = name;
3854         mmap_event->file_size = size;
3855
3856         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3857
3858         rcu_read_lock();
3859         cpuctx = &get_cpu_var(perf_cpu_context);
3860         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3861         ctx = rcu_dereference(current->perf_event_ctxp);
3862         if (ctx)
3863                 perf_event_mmap_ctx(ctx, mmap_event);
3864         put_cpu_var(perf_cpu_context);
3865         rcu_read_unlock();
3866
3867         kfree(buf);
3868 }
3869
3870 void __perf_event_mmap(struct vm_area_struct *vma)
3871 {
3872         struct perf_mmap_event mmap_event;
3873
3874         if (!atomic_read(&nr_mmap_events))
3875                 return;
3876
3877         mmap_event = (struct perf_mmap_event){
3878                 .vma    = vma,
3879                 /* .file_name */
3880                 /* .file_size */
3881                 .event_id  = {
3882                         .header = {
3883                                 .type = PERF_RECORD_MMAP,
3884                                 .misc = PERF_RECORD_MISC_USER,
3885                                 /* .size */
3886                         },
3887                         /* .pid */
3888                         /* .tid */
3889                         .start  = vma->vm_start,
3890                         .len    = vma->vm_end - vma->vm_start,
3891                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
3892                 },
3893         };
3894
3895         perf_event_mmap_event(&mmap_event);
3896 }
3897
3898 /*
3899  * IRQ throttle logging
3900  */
3901
3902 static void perf_log_throttle(struct perf_event *event, int enable)
3903 {
3904         struct perf_output_handle handle;
3905         int ret;
3906
3907         struct {
3908                 struct perf_event_header        header;
3909                 u64                             time;
3910                 u64                             id;
3911                 u64                             stream_id;
3912         } throttle_event = {
3913                 .header = {
3914                         .type = PERF_RECORD_THROTTLE,
3915                         .misc = 0,
3916                         .size = sizeof(throttle_event),
3917                 },
3918                 .time           = perf_clock(),
3919                 .id             = primary_event_id(event),
3920                 .stream_id      = event->id,
3921         };
3922
3923         if (enable)
3924                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3925
3926         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3927         if (ret)
3928                 return;
3929
3930         perf_output_put(&handle, throttle_event);
3931         perf_output_end(&handle);
3932 }
3933
3934 /*
3935  * Generic event overflow handling, sampling.
3936  */
3937
3938 static int __perf_event_overflow(struct perf_event *event, int nmi,
3939                                    int throttle, struct perf_sample_data *data,
3940                                    struct pt_regs *regs)
3941 {
3942         int events = atomic_read(&event->event_limit);
3943         struct hw_perf_event *hwc = &event->hw;
3944         int ret = 0;
3945
3946         throttle = (throttle && event->pmu->unthrottle != NULL);
3947
3948         if (!throttle) {
3949                 hwc->interrupts++;
3950         } else {
3951                 if (hwc->interrupts != MAX_INTERRUPTS) {
3952                         hwc->interrupts++;
3953                         if (HZ * hwc->interrupts >
3954                                         (u64)sysctl_perf_event_sample_rate) {
3955                                 hwc->interrupts = MAX_INTERRUPTS;
3956                                 perf_log_throttle(event, 0);
3957                                 ret = 1;
3958                         }
3959                 } else {
3960                         /*
3961                          * Keep re-disabling events even though on the previous
3962                          * pass we disabled it - just in case we raced with a
3963                          * sched-in and the event got enabled again:
3964                          */
3965                         ret = 1;
3966                 }
3967         }
3968
3969         if (event->attr.freq) {
3970                 u64 now = perf_clock();
3971                 s64 delta = now - hwc->freq_time_stamp;
3972
3973                 hwc->freq_time_stamp = now;
3974
3975                 if (delta > 0 && delta < 2*TICK_NSEC)
3976                         perf_adjust_period(event, delta, hwc->last_period);
3977         }
3978
3979         /*
3980          * XXX event_limit might not quite work as expected on inherited
3981          * events
3982          */
3983
3984         event->pending_kill = POLL_IN;
3985         if (events && atomic_dec_and_test(&event->event_limit)) {
3986                 ret = 1;
3987                 event->pending_kill = POLL_HUP;
3988                 if (nmi) {
3989                         event->pending_disable = 1;
3990                         perf_pending_queue(&event->pending,
3991                                            perf_pending_event);
3992                 } else
3993                         perf_event_disable(event);
3994         }
3995
3996         if (event->overflow_handler)
3997                 event->overflow_handler(event, nmi, data, regs);
3998         else
3999                 perf_event_output(event, nmi, data, regs);
4000
4001         return ret;
4002 }
4003
/*
 * Overflow entry point: identical to __perf_event_overflow() with
 * throttling requested.  Returns that function's result.
 */
int perf_event_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	const int throttle = 1;

	return __perf_event_overflow(event, nmi, throttle, data, regs);
}
4010
4011 /*
4012  * Generic software event infrastructure
4013  */
4014
4015 /*
4016  * We directly increment event->count and keep a second value in
4017  * event->hw.period_left to count intervals. This period event
4018  * is kept in the range [-sample_period, 0] so that we can use the
4019  * sign as trigger.
4020  */
4021
4022 static u64 perf_swevent_set_period(struct perf_event *event)
4023 {
4024         struct hw_perf_event *hwc = &event->hw;
4025         u64 period = hwc->last_period;
4026         u64 nr, offset;
4027         s64 old, val;
4028
4029         hwc->last_period = hwc->sample_period;
4030
4031 again:
4032         old = val = atomic64_read(&hwc->period_left);
4033         if (val < 0)
4034                 return 0;
4035
4036         nr = div64_u64(period + val, period);
4037         offset = nr * period;
4038         val -= offset;
4039         if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
4040                 goto again;
4041
4042         return nr;
4043 }
4044
4045 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4046                                     int nmi, struct perf_sample_data *data,
4047                                     struct pt_regs *regs)
4048 {
4049         struct hw_perf_event *hwc = &event->hw;
4050         int throttle = 0;
4051
4052         data->period = event->hw.last_period;
4053         if (!overflow)
4054                 overflow = perf_swevent_set_period(event);
4055
4056         if (hwc->interrupts == MAX_INTERRUPTS)
4057                 return;
4058
4059         for (; overflow; overflow--) {
4060                 if (__perf_event_overflow(event, nmi, throttle,
4061                                             data, regs)) {
4062                         /*
4063                          * We inhibit the overflow from happening when
4064                          * hwc->interrupts == MAX_INTERRUPTS.
4065                          */
4066                         break;
4067                 }
4068                 throttle = 1;
4069         }
4070 }
4071
4072 static void perf_swevent_add(struct perf_event *event, u64 nr,
4073                                int nmi, struct perf_sample_data *data,
4074                                struct pt_regs *regs)
4075 {
4076         struct hw_perf_event *hwc = &event->hw;
4077
4078         atomic64_add(nr, &event->count);
4079
4080         if (!regs)
4081                 return;
4082
4083         if (!hwc->sample_period)
4084                 return;
4085
4086         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4087                 return perf_swevent_overflow(event, 1, nmi, data, regs);
4088
4089         if (atomic64_add_negative(nr, &hwc->period_left))
4090                 return;
4091
4092         perf_swevent_overflow(event, 0, nmi, data, regs);
4093 }
4094
4095 static int perf_exclude_event(struct perf_event *event,
4096                               struct pt_regs *regs)
4097 {
4098         if (regs) {
4099                 if (event->attr.exclude_user && user_mode(regs))
4100                         return 1;
4101
4102                 if (event->attr.exclude_kernel && !user_mode(regs))
4103                         return 1;
4104         }
4105
4106         return 0;
4107 }
4108
4109 static int perf_swevent_match(struct perf_event *event,
4110                                 enum perf_type_id type,
4111                                 u32 event_id,
4112                                 struct perf_sample_data *data,
4113                                 struct pt_regs *regs)
4114 {
4115         if (event->attr.type != type)
4116                 return 0;
4117
4118         if (event->attr.config != event_id)
4119                 return 0;
4120
4121         if (perf_exclude_event(event, regs))
4122                 return 0;
4123
4124         return 1;
4125 }
4126
4127 static inline u64 swevent_hash(u64 type, u32 event_id)
4128 {
4129         u64 val = event_id | (type << 32);
4130
4131         return hash_64(val, SWEVENT_HLIST_BITS);
4132 }
4133
4134 static inline struct hlist_head *
4135 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4136 {
4137         u64 hash = swevent_hash(type, event_id);
4138
4139         return &hlist->heads[hash];
4140 }
4141
4142 /* For the read side: events when they trigger */
4143 static inline struct hlist_head *
4144 find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4145 {
4146         struct swevent_hlist *hlist;
4147
4148         hlist = rcu_dereference(ctx->swevent_hlist);
4149         if (!hlist)
4150                 return NULL;
4151
4152         return __find_swevent_head(hlist, type, event_id);
4153 }
4154
4155 /* For the event head insertion and removal in the hlist */
4156 static inline struct hlist_head *
4157 find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4158 {
4159         struct swevent_hlist *hlist;
4160         u32 event_id = event->attr.config;
4161         u64 type = event->attr.type;
4162
4163         /*
4164          * Event scheduling is always serialized against hlist allocation
4165          * and release. Which makes the protected version suitable here.
4166          * The context lock guarantees that.
4167          */
4168         hlist = rcu_dereference_protected(ctx->swevent_hlist,
4169                                           lockdep_is_held(&event->ctx->lock));
4170         if (!hlist)
4171                 return NULL;
4172
4173         return __find_swevent_head(hlist, type, event_id);
4174 }
4175
4176 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4177                                     u64 nr, int nmi,
4178                                     struct perf_sample_data *data,
4179                                     struct pt_regs *regs)
4180 {
4181         struct perf_cpu_context *cpuctx;
4182         struct perf_event *event;
4183         struct hlist_node *node;
4184         struct hlist_head *head;
4185
4186         cpuctx = &__get_cpu_var(perf_cpu_context);
4187
4188         rcu_read_lock();
4189
4190         head = find_swevent_head_rcu(cpuctx, type, event_id);
4191
4192         if (!head)
4193                 goto end;
4194
4195         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4196                 if (perf_swevent_match(event, type, event_id, data, regs))
4197                         perf_swevent_add(event, nr, nmi, data, regs);
4198         }
4199 end:
4200         rcu_read_unlock();
4201 }
4202
4203 int perf_swevent_get_recursion_context(void)
4204 {
4205         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4206         int rctx;
4207
4208         if (in_nmi())
4209                 rctx = 3;
4210         else if (in_irq())
4211                 rctx = 2;
4212         else if (in_softirq())
4213                 rctx = 1;
4214         else
4215                 rctx = 0;
4216
4217         if (cpuctx->recursion[rctx])
4218                 return -1;
4219
4220         cpuctx->recursion[rctx]++;
4221         barrier();
4222
4223         return rctx;
4224 }
4225 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4226
4227 void perf_swevent_put_recursion_context(int rctx)
4228 {
4229         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4230         barrier();
4231         cpuctx->recursion[rctx]--;
4232 }
4233 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4234
4235
4236 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4237                             struct pt_regs *regs, u64 addr)
4238 {
4239         struct perf_sample_data data;
4240         int rctx;
4241
4242         preempt_disable_notrace();
4243         rctx = perf_swevent_get_recursion_context();
4244         if (rctx < 0)
4245                 return;
4246
4247         perf_sample_data_init(&data, addr);
4248
4249         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4250
4251         perf_swevent_put_recursion_context(rctx);
4252         preempt_enable_notrace();
4253 }
4254
/* pmu ->read callback for software events: intentionally does nothing. */
static void perf_swevent_read(struct perf_event *event)
{
	/* nothing to do */
}
4258
4259 static int perf_swevent_enable(struct perf_event *event)
4260 {
4261         struct hw_perf_event *hwc = &event->hw;
4262         struct perf_cpu_context *cpuctx;
4263         struct hlist_head *head;
4264
4265         cpuctx = &__get_cpu_var(perf_cpu_context);
4266
4267         if (hwc->sample_period) {
4268                 hwc->last_period = hwc->sample_period;
4269                 perf_swevent_set_period(event);
4270         }
4271
4272         head = find_swevent_head(cpuctx, event);
4273         if (WARN_ON_ONCE(!head))
4274                 return -EINVAL;
4275
4276         hlist_add_head_rcu(&event->hlist_entry, head);
4277
4278         return 0;
4279 }
4280
4281 static void perf_swevent_disable(struct perf_event *event)
4282 {
4283         hlist_del_rcu(&event->hlist_entry);
4284 }
4285
/* No-op callback for pmu hooks that return void. */
static void perf_swevent_void(struct perf_event *event)
{
	/* nothing to do */
}
4289
/* No-op callback for pmu hooks that return int: always reports 0. */
static int perf_swevent_int(struct perf_event *event)
{
	const int ok = 0;

	return ok;
}
4294
4295 static const struct pmu perf_ops_generic = {
4296         .enable         = perf_swevent_enable,
4297         .disable        = perf_swevent_disable,
4298         .start          = perf_swevent_int,
4299         .stop           = perf_swevent_void,
4300         .read           = perf_swevent_read,
4301         .unthrottle     = perf_swevent_void, /* hwc->interrupts already reset */
4302 };
4303
4304 /*
4305  * hrtimer based swevent callback
4306  */
4307
4308 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4309 {
4310         enum hrtimer_restart ret = HRTIMER_RESTART;
4311         struct perf_sample_data data;
4312         struct pt_regs *regs;
4313         struct perf_event *event;
4314         u64 period;
4315
4316         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4317         event->pmu->read(event);
4318
4319         perf_sample_data_init(&data, 0);
4320         data.period = event->hw.last_period;
4321         regs = get_irq_regs();
4322
4323         if (regs && !perf_exclude_event(event, regs)) {
4324                 if (!(event->attr.exclude_idle && current->pid == 0))
4325                         if (perf_event_overflow(event, 0, &data, regs))
4326                                 ret = HRTIMER_NORESTART;
4327         }
4328
4329         period = max_t(u64, 10000, event->hw.sample_period);
4330         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4331
4332         return ret;
4333 }
4334
4335 static void perf_swevent_start_hrtimer(struct perf_event *event)
4336 {
4337         struct hw_perf_event *hwc = &event->hw;
4338
4339         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4340         hwc->hrtimer.function = perf_swevent_hrtimer;
4341         if (hwc->sample_period) {
4342                 u64 period;
4343
4344                 if (hwc->remaining) {
4345                         if (hwc->remaining < 0)
4346                                 period = 10000;
4347                         else
4348                                 period = hwc->remaining;
4349                         hwc->remaining = 0;
4350                 } else {
4351                         period = max_t(u64, 10000, hwc->sample_period);
4352                 }
4353                 __hrtimer_start_range_ns(&hwc->hrtimer,
4354                                 ns_to_ktime(period), 0,
4355                                 HRTIMER_MODE_REL, 0);
4356         }
4357 }
4358
4359 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4360 {
4361         struct hw_perf_event *hwc = &event->hw;
4362
4363         if (hwc->sample_period) {
4364                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4365                 hwc->remaining = ktime_to_ns(remaining);
4366
4367                 hrtimer_cancel(&hwc->hrtimer);
4368         }
4369 }
4370
4371 /*
4372  * Software event: cpu wall time clock
4373  */
4374
4375 static void cpu_clock_perf_event_update(struct perf_event *event)
4376 {
4377         int cpu = raw_smp_processor_id();
4378         s64 prev;
4379         u64 now;
4380
4381         now = cpu_clock(cpu);
4382         prev = atomic64_xchg(&event->hw.prev_count, now);
4383         atomic64_add(now - prev, &event->count);
4384 }
4385
4386 static int cpu_clock_perf_event_enable(struct perf_event *event)
4387 {
4388         struct hw_perf_event *hwc = &event->hw;
4389         int cpu = raw_smp_processor_id();
4390
4391         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4392         perf_swevent_start_hrtimer(event);
4393
4394         return 0;
4395 }
4396
/*
 * pmu ->disable callback for the cpu clock event: stop the hrtimer
 * first, then account the time elapsed since the last update.
 */
static void cpu_clock_perf_event_disable(struct perf_event *event)
{
	perf_swevent_cancel_hrtimer(event);

	cpu_clock_perf_event_update(event);
}
4402
/*
 * pmu ->read callback for the cpu clock event: bring @event->count up to
 * date with the current cpu_clock() value.
 */
static void cpu_clock_perf_event_read(struct perf_event *event)
{
	cpu_clock_perf_event_update(event);
}
4407
4408 static const struct pmu perf_ops_cpu_clock = {
4409         .enable         = cpu_clock_perf_event_enable,
4410         .disable        = cpu_clock_perf_event_disable,
4411         .read           = cpu_clock_perf_event_read,
4412 };
4413
4414 /*
4415  * Software event: task time clock
4416  */
4417
4418 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4419 {
4420         u64 prev;
4421         s64 delta;
4422
4423         prev = atomic64_xchg(&event->hw.prev_count, now);
4424         delta = now - prev;
4425         atomic64_add(delta, &event->count);
4426 }
4427
4428 static int task_clock_perf_event_enable(struct perf_event *event)
4429 {
4430         struct hw_perf_event *hwc = &event->hw;
4431         u64 now;
4432
4433         now = event->ctx->time;
4434
4435         atomic64_set(&hwc->prev_count, now);
4436
4437         perf_swevent_start_hrtimer(event);
4438
4439         return 0;
4440 }
4441
4442 static void task_clock_perf_event_disable(struct perf_event *event)
4443 {
4444         perf_swevent_cancel_hrtimer(event);
4445         task_clock_perf_event_update(event, event->ctx->time);
4446
4447 }
4448
4449 static void task_clock_perf_event_read(struct perf_event *event)
4450 {
4451         u64 time;
4452
4453         if (!in_nmi()) {
4454                 update_context_time(event->ctx);
4455                 time = event->ctx->time;
4456         } else {
4457                 u64 now = perf_clock();
4458                 u64 delta = now - event->ctx->timestamp;
4459                 time = event->ctx->time + delta;
4460         }
4461
4462         task_clock_perf_event_update(event, time);
4463 }
4464
4465 static const struct pmu perf_ops_task_clock = {
4466         .enable         = task_clock_perf_event_enable,
4467         .disable        = task_clock_perf_event_disable,
4468         .read           = task_clock_perf_event_read,
4469 };
4470
4471 /* Deref the hlist from the update side */
4472 static inline struct swevent_hlist *
4473 swevent_hlist_deref(struct perf_cpu_context *cpuctx)
4474 {
4475         return rcu_dereference_protected(cpuctx->swevent_hlist,
4476                                          lockdep_is_held(&cpuctx->hlist_mutex));
4477 }
4478
4479 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4480 {
4481         struct swevent_hlist *hlist;
4482
4483         hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4484         kfree(hlist);
4485 }
4486
4487 static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4488 {
4489         struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
4490
4491         if (!hlist)
4492                 return;
4493
4494         rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4495         call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4496 }
4497
4498 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4499 {
4500         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4501
4502         mutex_lock(&cpuctx->hlist_mutex);
4503
4504         if (!--cpuctx->hlist_refcount)
4505                 swevent_hlist_release(cpuctx);
4506
4507         mutex_unlock(&cpuctx->hlist_mutex);
4508 }
4509
4510 static void swevent_hlist_put(struct perf_event *event)
4511 {
4512         int cpu;
4513
4514         if (event->cpu != -1) {
4515                 swevent_hlist_put_cpu(event, event->cpu);
4516                 return;
4517         }
4518
4519         for_each_possible_cpu(cpu)
4520                 swevent_hlist_put_cpu(event, cpu);
4521 }
4522
4523 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4524 {
4525         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4526         int err = 0;
4527
4528         mutex_lock(&cpuctx->hlist_mutex);
4529
4530         if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
4531                 struct swevent_hlist *hlist;
4532
4533                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4534                 if (!hlist) {
4535                         err = -ENOMEM;
4536                         goto exit;
4537                 }
4538                 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4539         }
4540         cpuctx->hlist_refcount++;
4541  exit:
4542         mutex_unlock(&cpuctx->hlist_mutex);
4543
4544         return err;
4545 }
4546
4547 static int swevent_hlist_get(struct perf_event *event)
4548 {
4549         int err;
4550         int cpu, failed_cpu;
4551
4552         if (event->cpu != -1)
4553                 return swevent_hlist_get_cpu(event, event->cpu);
4554
4555         get_online_cpus();
4556         for_each_possible_cpu(cpu) {
4557                 err = swevent_hlist_get_cpu(event, cpu);
4558                 if (err) {
4559                         failed_cpu = cpu;
4560                         goto fail;
4561                 }
4562         }
4563         put_online_cpus();
4564
4565         return 0;
4566  fail:
4567         for_each_possible_cpu(cpu) {
4568                 if (cpu == failed_cpu)
4569                         break;
4570                 swevent_hlist_put_cpu(event, cpu);
4571         }
4572
4573         put_online_cpus();
4574         return err;
4575 }
4576
4577 #ifdef CONFIG_EVENT_TRACING
4578
4579 static const struct pmu perf_ops_tracepoint = {
4580         .enable         = perf_trace_enable,
4581         .disable        = perf_trace_disable,
4582         .start          = perf_swevent_int,
4583         .stop           = perf_swevent_void,
4584         .read           = perf_swevent_read,
4585         .unthrottle     = perf_swevent_void,
4586 };
4587
4588 static int perf_tp_filter_match(struct perf_event *event,
4589                                 struct perf_sample_data *data)
4590 {
4591         void *record = data->raw->data;
4592
4593         if (likely(!event->filter) || filter_match_preds(event->filter, record))
4594                 return 1;
4595         return 0;
4596 }
4597
4598 static int perf_tp_event_match(struct perf_event *event,
4599                                 struct perf_sample_data *data,
4600                                 struct pt_regs *regs)
4601 {
4602         /*
4603          * All tracepoints are from kernel-space.
4604          */
4605         if (event->attr.exclude_kernel)
4606                 return 0;
4607
4608         if (!perf_tp_filter_match(event, data))
4609                 return 0;
4610
4611         return 1;
4612 }
4613
4614 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4615                    struct pt_regs *regs, struct hlist_head *head)
4616 {
4617         struct perf_sample_data data;
4618         struct perf_event *event;
4619         struct hlist_node *node;
4620
4621         struct perf_raw_record raw = {
4622                 .size = entry_size,
4623                 .data = record,
4624         };
4625
4626         perf_sample_data_init(&data, addr);
4627         data.raw = &raw;
4628
4629         rcu_read_lock();
4630         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4631                 if (perf_tp_event_match(event, &data, regs))
4632                         perf_swevent_add(event, count, 1, &data, regs);
4633         }
4634         rcu_read_unlock();
4635 }
4636 EXPORT_SYMBOL_GPL(perf_tp_event);
4637
/* ->destroy hook for tracepoint events: hands off to perf_trace_destroy(). */
static void tp_perf_event_destroy(struct perf_event *event)
{
	perf_trace_destroy(event);
}
4642
4643 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4644 {
4645         int err;
4646
4647         /*
4648          * Raw tracepoint data is a severe data leak, only allow root to
4649          * have these.
4650          */
4651         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4652                         perf_paranoid_tracepoint_raw() &&
4653                         !capable(CAP_SYS_ADMIN))
4654                 return ERR_PTR(-EPERM);
4655
4656         err = perf_trace_init(event);
4657         if (err)
4658                 return NULL;
4659
4660         event->destroy = tp_perf_event_destroy;
4661
4662         return &perf_ops_tracepoint;
4663 }
4664
4665 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4666 {
4667         char *filter_str;
4668         int ret;
4669
4670         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4671                 return -EINVAL;
4672
4673         filter_str = strndup_user(arg, PAGE_SIZE);
4674         if (IS_ERR(filter_str))
4675                 return PTR_ERR(filter_str);
4676
4677         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4678
4679         kfree(filter_str);
4680         return ret;
4681 }
4682
/*
 * Drop the filter attached to @event by delegating to
 * ftrace_profile_free_filter().
 */
static void perf_event_free_filter(struct perf_event *event)
{
        ftrace_profile_free_filter(event);
}
4687
4688 #else
4689
/*
 * Stub for builds without CONFIG_EVENT_TRACING: no tracepoint pmu is
 * available, so always return NULL.
 */
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
        return NULL;
}
4694
/*
 * Stub for builds without CONFIG_EVENT_TRACING: filters cannot be set,
 * always fails with -ENOENT.
 */
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
        return -ENOENT;
}
4699
/*
 * Stub for builds without CONFIG_EVENT_TRACING: there is never a filter
 * to free, so this does nothing.
 */
static void perf_event_free_filter(struct perf_event *event)
{
}
4703
4704 #endif /* CONFIG_EVENT_TRACING */
4705
4706 #ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
 * Destroy callback installed by bp_perf_event_init(); passes the event
 * to release_bp_slot().
 */
static void bp_perf_event_destroy(struct perf_event *event)
{
        release_bp_slot(event);
}
4711
4712 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4713 {
4714         int err;
4715
4716         err = register_perf_hw_breakpoint(bp);
4717         if (err)
4718                 return ERR_PTR(err);
4719
4720         bp->destroy = bp_perf_event_destroy;
4721
4722         return &perf_ops_bp;
4723 }
4724
4725 void perf_bp_event(struct perf_event *bp, void *data)
4726 {
4727         struct perf_sample_data sample;
4728         struct pt_regs *regs = data;
4729
4730         perf_sample_data_init(&sample, bp->attr.bp_addr);
4731
4732         if (!perf_exclude_event(bp, regs))
4733                 perf_swevent_add(bp, 1, 1, &sample, regs);
4734 }
4735 #else
/*
 * Stub for builds without CONFIG_HAVE_HW_BREAKPOINT: no breakpoint pmu
 * is available, so always return NULL.
 */
static const struct pmu *bp_perf_event_init(struct perf_event *bp)
{
        return NULL;
}
4740
/*
 * Stub for builds without CONFIG_HAVE_HW_BREAKPOINT: breakpoint hits
 * are ignored.
 */
void perf_bp_event(struct perf_event *bp, void *regs)
{
}
4744 #endif
4745
/*
 * One counter per software event id (indexed by attr.config).
 * sw_perf_event_init() increments the slot when a generic,
 * non-inherited software event is created; sw_perf_event_destroy()
 * decrements it again.
 */
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4747
4748 static void sw_perf_event_destroy(struct perf_event *event)
4749 {
4750         u64 event_id = event->attr.config;
4751
4752         WARN_ON(event->parent);
4753
4754         atomic_dec(&perf_swevent_enabled[event_id]);
4755         swevent_hlist_put(event);
4756 }
4757
4758 static const struct pmu *sw_perf_event_init(struct perf_event *event)
4759 {
4760         const struct pmu *pmu = NULL;
4761         u64 event_id = event->attr.config;
4762
4763         /*
4764          * Software events (currently) can't in general distinguish
4765          * between user, kernel and hypervisor events.
4766          * However, context switches and cpu migrations are considered
4767          * to be kernel events, and page faults are never hypervisor
4768          * events.
4769          */
4770         switch (event_id) {
4771         case PERF_COUNT_SW_CPU_CLOCK:
4772                 pmu = &perf_ops_cpu_clock;
4773
4774                 break;
4775         case PERF_COUNT_SW_TASK_CLOCK:
4776                 /*
4777                  * If the user instantiates this as a per-cpu event,
4778                  * use the cpu_clock event instead.
4779                  */
4780                 if (event->ctx->task)
4781                         pmu = &perf_ops_task_clock;
4782                 else
4783                         pmu = &perf_ops_cpu_clock;
4784
4785                 break;
4786         case PERF_COUNT_SW_PAGE_FAULTS:
4787         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4788         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4789         case PERF_COUNT_SW_CONTEXT_SWITCHES:
4790         case PERF_COUNT_SW_CPU_MIGRATIONS:
4791         case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4792         case PERF_COUNT_SW_EMULATION_FAULTS:
4793                 if (!event->parent) {
4794                         int err;
4795
4796                         err = swevent_hlist_get(event);
4797                         if (err)
4798                                 return ERR_PTR(err);
4799
4800                         atomic_inc(&perf_swevent_enabled[event_id]);
4801                         event->destroy = sw_perf_event_destroy;
4802                 }
4803                 pmu = &perf_ops_generic;
4804                 break;
4805         }
4806
4807         return pmu;
4808 }
4809
/*
 * Allocate and initialize an event structure.
 *
 * @attr:             attributes, copied into the new event
 * @cpu:              stored in event->cpu
 * @ctx:              context stored in event->ctx
 * @group_leader:     leader of the group to join, or NULL to make the
 *                    event its own group leader
 * @parent_event:     event this one is inherited from, or NULL
 * @overflow_handler: overflow callback; if NULL and @parent_event is
 *                    set, the parent's handler is used
 * @gfpflags:         allocation flags handed to kzalloc()
 *
 * Returns the new event or an ERR_PTR(): -ENOMEM if the allocation
 * fails, -EINVAL if no pmu was selected for the event, otherwise the
 * error returned by the type-specific init routine.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr,
                   int cpu,
                   struct perf_event_context *ctx,
                   struct perf_event *group_leader,
                   struct perf_event *parent_event,
                   perf_overflow_handler_t overflow_handler,
                   gfp_t gfpflags)
{
        const struct pmu *pmu;
        struct perf_event *event;
        struct hw_perf_event *hwc;
        long err;

        event = kzalloc(sizeof(*event), gfpflags);
        if (!event)
                return ERR_PTR(-ENOMEM);

        /*
         * Single events are their own group leaders, with an
         * empty sibling list:
         */
        if (!group_leader)
                group_leader = event;

        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->group_entry);
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        init_waitqueue_head(&event->waitq);

        mutex_init(&event->mmap_mutex);

        event->cpu              = cpu;
        event->attr             = *attr;
        event->group_leader     = group_leader;
        event->pmu              = NULL;
        event->ctx              = ctx;
        event->oncpu            = -1;

        event->parent           = parent_event;

        /* Reference on the caller's pid namespace; dropped on the error path below. */
        event->ns               = get_pid_ns(current->nsproxy->pid_ns);
        event->id               = atomic64_inc_return(&perf_event_id);

        event->state            = PERF_EVENT_STATE_INACTIVE;

        /* Inherited events fall back to the parent's overflow handler. */
        if (!overflow_handler && parent_event)
                overflow_handler = parent_event->overflow_handler;

        event->overflow_handler = overflow_handler;

        if (attr->disabled)
                event->state = PERF_EVENT_STATE_OFF;

        pmu = NULL;

        hwc = &event->hw;
        hwc->sample_period = attr->sample_period;
        /* In frequency mode start with a period of 1. */
        if (attr->freq && attr->sample_freq)
                hwc->sample_period = 1;
        hwc->last_period = hwc->sample_period;

        atomic64_set(&hwc->period_left, hwc->sample_period);

        /*
         * we currently do not support PERF_FORMAT_GROUP on inherited events
         */
        /* pmu is still NULL here, so this ends up as -EINVAL below. */
        if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                goto done;

        switch (attr->type) {
        case PERF_TYPE_RAW:
        case PERF_TYPE_HARDWARE:
        case PERF_TYPE_HW_CACHE:
                pmu = hw_perf_event_init(event);
                break;

        case PERF_TYPE_SOFTWARE:
                pmu = sw_perf_event_init(event);
                break;

        case PERF_TYPE_TRACEPOINT:
                pmu = tp_perf_event_init(event);
                break;

        case PERF_TYPE_BREAKPOINT:
                pmu = bp_perf_event_init(event);
                break;


        default:
                break;
        }
done:
        /* A NULL pmu means "not supported"; an ERR_PTR carries a specific error. */
        err = 0;
        if (!pmu)
                err = -EINVAL;
        else if (IS_ERR(pmu))
                err = PTR_ERR(pmu);

        if (err) {
                if (event->ns)
                        put_pid_ns(event->ns);
                kfree(event);
                return ERR_PTR(err);
        }

        event->pmu = pmu;

        /* Only events without a parent are counted in the global counters. */
        if (!event->parent) {
                atomic_inc(&nr_events);
                if (event->attr.mmap)
                        atomic_inc(&nr_mmap_events);
                if (event->attr.comm)
                        atomic_inc(&nr_comm_events);
                if (event->attr.task)
                        atomic_inc(&nr_task_events);
        }

        return event;
}
4937
4938 static int perf_copy_attr(struct perf_event_attr __user *uattr,
4939                           struct perf_event_attr *attr)
4940 {
4941         u32 size;
4942         int ret;
4943
4944         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4945                 return -EFAULT;
4946
4947         /*
4948          * zero the full structure, so that a short copy will be nice.
4949          */
4950         memset(attr, 0, sizeof(*attr));
4951
4952         ret = get_user(size, &uattr->size);
4953         if (ret)
4954                 return ret;
4955
4956         if (size > PAGE_SIZE)   /* silly large */
4957                 goto err_size;
4958
4959         if (!size)              /* abi compat */
4960                 size = PERF_ATTR_SIZE_VER0;
4961
4962         if (size < PERF_ATTR_SIZE_VER0)
4963                 goto err_size;
4964
4965         /*
4966          * If we're handed a bigger struct than we know of,
4967          * ensure all the unknown bits are 0 - i.e. new
4968          * user-space does not rely on any kernel feature
4969          * extensions we dont know about yet.
4970          */
4971         if (size > sizeof(*attr)) {
4972                 unsigned char __user *addr;
4973                 unsigned char __user *end;
4974                 unsigned char val;
4975
4976                 addr = (void __user *)uattr + sizeof(*attr);
4977                 end  = (void __user *)uattr + size;
4978
4979                 for (; addr < end; addr++) {
4980                         ret = get_user(val, addr);
4981                         if (ret)
4982                                 return ret;
4983                         if (val)
4984                                 goto err_size;
4985                 }
4986                 size = sizeof(*attr);
4987         }
4988
4989         ret = copy_from_user(attr, uattr, size);
4990         if (ret)
4991                 return -EFAULT;
4992
4993         /*
4994          * If the type exists, the corresponding creation will verify
4995          * the attr->config.
4996          */
4997         if (attr->type >= PERF_TYPE_MAX)
4998                 return -EINVAL;
4999
5000         if (attr->__reserved_1)
5001                 return -EINVAL;
5002
5003         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
5004                 return -EINVAL;
5005
5006         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
5007                 return -EINVAL;
5008
5009 out:
5010         return ret;
5011
5012 err_size:
5013         put_user(sizeof(*attr), &uattr->size);
5014         ret = -E2BIG;
5015         goto out;
5016 }
5017
5018 static int
5019 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
5020 {
5021         struct perf_mmap_data *data = NULL, *old_data = NULL;
5022         int ret = -EINVAL;
5023
5024         if (!output_event)
5025                 goto set;
5026
5027         /* don't allow circular references */
5028         if (event == output_event)
5029                 goto out;
5030
5031         /*
5032          * Don't allow cross-cpu buffers
5033          */
5034         if (output_event->cpu != event->cpu)
5035                 goto out;
5036
5037         /*
5038          * If its not a per-cpu buffer, it must be the same task.
5039          */
5040         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
5041                 goto out;
5042
5043 set:
5044         mutex_lock(&event->mmap_mutex);
5045         /* Can't redirect output if we've got an active mmap() */
5046         if (atomic_read(&event->mmap_count))
5047                 goto unlock;
5048
5049         if (output_event) {
5050                 /* get the buffer we want to redirect to */
5051                 data = perf_mmap_data_get(output_event);
5052                 if (!data)
5053                         goto unlock;
5054         }
5055
5056         old_data = event->data;
5057         rcu_assign_pointer(event->data, data);
5058         ret = 0;
5059 unlock:
5060         mutex_unlock(&event->mmap_mutex);
5061
5062         if (old_data)
5063                 perf_mmap_data_put(old_data);
5064 out:
5065         return ret;
5066 }
5067
/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:  event_id type attributes for monitoring/sampling
 * @pid:                target pid
 * @cpu:                target cpu
 * @group_fd:           group leader event fd
 * @flags:              PERF_FLAG_FD_NO_GROUP and/or PERF_FLAG_FD_OUTPUT;
 *                      any other bit is rejected with -EINVAL
 *
 * Returns the new event file descriptor on success or a negative errno.
 */
SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_attr __user *, attr_uptr,
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
        struct perf_event *event, *group_leader = NULL, *output_event = NULL;
        struct perf_event_attr attr;
        struct perf_event_context *ctx;
        struct file *event_file = NULL;
        struct file *group_file = NULL;
        int event_fd;
        int fput_needed = 0;
        int err;

        /* for future expandability... */
        if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
                return -EINVAL;

        err = perf_copy_attr(attr_uptr, &attr);
        if (err)
                return err;

        /* Events that include the kernel may need CAP_SYS_ADMIN. */
        if (!attr.exclude_kernel) {
                if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                        return -EACCES;
        }

        /* In frequency mode the requested rate is capped by the sysctl. */
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
        }

        /* Reserve the fd now; it is only installed once everything succeeded. */
        event_fd = get_unused_fd_flags(O_RDWR);
        if (event_fd < 0)
                return event_fd;

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(pid, cpu);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_fd;
        }

        if (group_fd != -1) {
                group_leader = perf_fget_light(group_fd, &fput_needed);
                if (IS_ERR(group_leader)) {
                        err = PTR_ERR(group_leader);
                        goto err_put_context;
                }
                group_file = group_leader->filp;
                /*
                 * The event behind group_fd can serve as output target,
                 * as group leader, or both, depending on the flags.
                 */
                if (flags & PERF_FLAG_FD_OUTPUT)
                        output_event = group_leader;
                if (flags & PERF_FLAG_FD_NO_GROUP)
                        group_leader = NULL;
        }

        /*
         * Look up the group leader (we will attach this event to it):
         */
        if (group_leader) {
                err = -EINVAL;

                /*
                 * Do not allow a recursive hierarchy (this new sibling
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
                        goto err_put_context;
                /*
                 * Do not allow to attach to a group in a different
                 * task or CPU context:
                 */
                if (group_leader->ctx != ctx)
                        goto err_put_context;
                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
                        goto err_put_context;
        }

        event = perf_event_alloc(&attr, cpu, ctx, group_leader,
                                     NULL, NULL, GFP_KERNEL);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_put_context;
        }

        if (output_event) {
                err = perf_event_set_output(event, output_event);
                if (err)
                        goto err_free_put_context;
        }

        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
        if (IS_ERR(event_file)) {
                err = PTR_ERR(event_file);
                goto err_free_put_context;
        }

        event->filp = event_file;
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        perf_install_in_context(ctx, event, cpu);
        ++ctx->generation;
        mutex_unlock(&ctx->mutex);

        /* Record the creating task as owner and link the event on its list. */
        event->owner = current;
        get_task_struct(current);
        mutex_lock(&current->perf_event_mutex);
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);

        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
         * of the group leader will find the pointer to itself in
         * perf_group_detach().
         */
        fput_light(group_file, fput_needed);
        fd_install(event_fd, event_file);
        return event_fd;

        /* Error unwinding, in reverse order of acquisition. */
err_free_put_context:
        free_event(event);
err_put_context:
        fput_light(group_file, fput_needed);
        put_ctx(ctx);
err_fd:
        put_unused_fd(event_fd);
        return err;
}
5209
5210 /**
5211  * perf_event_create_kernel_counter
5212  *
5213  * @attr: attributes of the counter to create
5214  * @cpu: cpu in which the counter is bound
5215  * @pid: task to profile
5216  */
5217 struct perf_event *
5218 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5219                                  pid_t pid,
5220                                  perf_overflow_handler_t overflow_handler)
5221 {
5222         struct perf_event *event;
5223         struct perf_event_context *ctx;
5224         int err;
5225
5226         /*
5227          * Get the target context (task or percpu):
5228          */
5229
5230         ctx = find_get_context(pid, cpu);
5231         if (IS_ERR(ctx)) {
5232                 err = PTR_ERR(ctx);
5233                 goto err_exit;
5234         }
5235
5236         event = perf_event_alloc(attr, cpu, ctx, NULL,
5237                                  NULL, overflow_handler, GFP_KERNEL);
5238         if (IS_ERR(event)) {
5239                 err = PTR_ERR(event);
5240                 goto err_put_context;
5241         }
5242
5243         event->filp = NULL;
5244         WARN_ON_ONCE(ctx->parent_ctx);
5245         mutex_lock(&ctx->mutex);
5246         perf_install_in_context(ctx, event, cpu);
5247         ++ctx->generation;
5248         mutex_unlock(&ctx->mutex);
5249
5250         event->owner = current;
5251         get_task_struct(current);
5252         mutex_lock(&current->perf_event_mutex);
5253         list_add_tail(&event->owner_entry, &current->perf_event_list);
5254         mutex_unlock(&current->perf_event_mutex);
5255
5256         return event;
5257
5258  err_put_context:
5259         put_ctx(ctx);
5260  err_exit:
5261         return ERR_PTR(err);
5262 }
5263 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
5264
5265 /*
5266  * inherit a event from parent task to child task:
5267  */
5268 static struct perf_event *
5269 inherit_event(struct perf_event *parent_event,
5270               struct task_struct *parent,
5271               struct perf_event_context *parent_ctx,
5272               struct task_struct *child,
5273               struct perf_event *group_leader,
5274               struct perf_event_context *child_ctx)
5275 {
5276         struct perf_event *child_event;
5277
5278         /*
5279          * Instead of creating recursive hierarchies of events,
5280          * we link inherited events back to the original parent,
5281          * which has a filp for sure, which we use as the reference
5282          * count:
5283          */
5284         if (parent_event->parent)
5285                 parent_event = parent_event->parent;
5286
5287         child_event = perf_event_alloc(&parent_event->attr,
5288                                            parent_event->cpu, child_ctx,
5289                                            group_leader, parent_event,
5290                                            NULL, GFP_KERNEL);
5291         if (IS_ERR(child_event))
5292                 return child_event;
5293         get_ctx(child_ctx);
5294
5295         /*
5296          * Make the child state follow the state of the parent event,
5297          * not its attr.disabled bit.  We hold the parent's mutex,
5298          * so we won't race with perf_event_{en, dis}able_family.
5299          */
5300         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5301                 child_event->state = PERF_EVENT_STATE_INACTIVE;
5302         else
5303                 child_event->state = PERF_EVENT_STATE_OFF;
5304
5305         if (parent_event->attr.freq) {
5306                 u64 sample_period = parent_event->hw.sample_period;
5307                 struct hw_perf_event *hwc = &child_event->hw;
5308
5309                 hwc->sample_period = sample_period;
5310                 hwc->last_period   = sample_period;
5311
5312                 atomic64_set(&hwc->period_left, sample_period);
5313         }
5314
5315         child_event->overflow_handler = parent_event->overflow_handler;
5316
5317         /*
5318          * Link it up in the child's context:
5319          */
5320         add_event_to_ctx(child_event, child_ctx);
5321
5322         /*
5323          * Get a reference to the parent filp - we will fput it
5324          * when the child event exits. This is safe to do because
5325          * we are in the parent and we know that the filp still
5326          * exists and has a nonzero count:
5327          */
5328         atomic_long_inc(&parent_event->filp->f_count);
5329
5330         /*
5331          * Link this into the parent event's child list
5332          */
5333         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5334         mutex_lock(&parent_event->child_mutex);
5335         list_add_tail(&child_event->child_list, &parent_event->child_list);
5336         mutex_unlock(&parent_event->child_mutex);
5337
5338         return child_event;
5339 }
5340
5341 static int inherit_group(struct perf_event *parent_event,
5342               struct task_struct *parent,
5343               struct perf_event_context *parent_ctx,
5344               struct task_struct *child,
5345               struct perf_event_context *child_ctx)
5346 {
5347         struct perf_event *leader;
5348         struct perf_event *sub;
5349         struct perf_event *child_ctr;
5350
5351         leader = inherit_event(parent_event, parent, parent_ctx,
5352                                  child, NULL, child_ctx);
5353         if (IS_ERR(leader))
5354                 return PTR_ERR(leader);
5355         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5356                 child_ctr = inherit_event(sub, parent, parent_ctx,
5357                                             child, leader, child_ctx);
5358                 if (IS_ERR(child_ctr))
5359                         return PTR_ERR(child_ctr);
5360         }
5361         return 0;
5362 }
5363
5364 static void sync_child_event(struct perf_event *child_event,
5365                                struct task_struct *child)
5366 {
5367         struct perf_event *parent_event = child_event->parent;
5368         u64 child_val;
5369
5370         if (child_event->attr.inherit_stat)
5371                 perf_event_read_event(child_event, child);
5372
5373         child_val = atomic64_read(&child_event->count);
5374
5375         /*
5376          * Add back the child's count to the parent's count:
5377          */
5378         atomic64_add(child_val, &parent_event->count);
5379         atomic64_add(child_event->total_time_enabled,
5380                      &parent_event->child_total_time_enabled);
5381         atomic64_add(child_event->total_time_running,
5382                      &parent_event->child_total_time_running);
5383
5384         /*
5385          * Remove this event from the parent's list
5386          */
5387         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5388         mutex_lock(&parent_event->child_mutex);
5389         list_del_init(&child_event->child_list);
5390         mutex_unlock(&parent_event->child_mutex);
5391
5392         /*
5393          * Release the parent event, if this was the last
5394          * reference to it.
5395          */
5396         fput(parent_event->filp);
5397 }
5398
5399 static void
5400 __perf_event_exit_task(struct perf_event *child_event,
5401                          struct perf_event_context *child_ctx,
5402                          struct task_struct *child)
5403 {
5404         if (child_event->parent) {
5405                 raw_spin_lock_irq(&child_ctx->lock);
5406                 perf_group_detach(child_event);
5407                 raw_spin_unlock_irq(&child_ctx->lock);
5408         }
5409
5410         perf_event_remove_from_context(child_event);
5411
5412         /*
5413          * It can happen that the parent exits first, and has events
5414          * that are still around due to the child reference. These
5415          * events need to be zapped.
5416          */
5417         if (child_event->parent) {
5418                 sync_child_event(child_event, child);
5419                 free_event(child_event);
5420         }
5421 }
5422
/*
 * When a child task exits, feed back event values to parent events.
 *
 * @child: the exiting task.  Its event context (if any) is detached
 * from the task, every event in it is torn down via
 * __perf_event_exit_task() and the task's reference on the context is
 * dropped.
 */
void perf_event_exit_task(struct task_struct *child)
{
        struct perf_event *child_event, *tmp;
        struct perf_event_context *child_ctx;
        unsigned long flags;

        /* Common case: no context, only report the task event. */
        if (likely(!child->perf_event_ctxp)) {
                perf_event_task(child, NULL, 0);
                return;
        }

        local_irq_save(flags);
        /*
         * We can't reschedule here because interrupts are disabled,
         * and either child is current or it is a task that can't be
         * scheduled, so we are now safe from rescheduling changing
         * our context.
         */
        child_ctx = child->perf_event_ctxp;
        __perf_event_task_sched_out(child_ctx);

        /*
         * Take the context lock here so that if find_get_context is
         * reading child->perf_event_ctxp, we wait until it has
         * incremented the context's refcount before we do put_ctx below.
         */
        raw_spin_lock(&child_ctx->lock);
        child->perf_event_ctxp = NULL;
        /*
         * If this context is a clone; unclone it so it can't get
         * swapped to another process while we're removing all
         * the events from it.
         */
        unclone_ctx(child_ctx);
        update_context_time(child_ctx);
        /* Releases the lock and restores the irq state saved above. */
        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

        /*
         * Report the task dead after unscheduling the events so that we
         * won't get any samples after PERF_RECORD_EXIT. We can however still
         * get a few PERF_RECORD_READ events.
         */
        perf_event_task(child, child_ctx, 0);

        /*
         * We can recurse on the same lock type through:
         *
         *   __perf_event_exit_task()
         *     sync_child_event()
         *       fput(parent_event->filp)
         *         perf_release()
         *           mutex_lock(&ctx->mutex)
         *
         * But since its the parent context it won't be the same instance.
         */
        mutex_lock(&child_ctx->mutex);

again:
        list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
                                 group_entry)
                __perf_event_exit_task(child_event, child_ctx, child);

        list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
                                 group_entry)
                __perf_event_exit_task(child_event, child_ctx, child);

        /*
         * If the last event was a group event, it will have appended all
         * its siblings to the list, but we obtained 'tmp' before that which
         * will still point to the list head terminating the iteration.
         */
        if (!list_empty(&child_ctx->pinned_groups) ||
            !list_empty(&child_ctx->flexible_groups))
                goto again;

        mutex_unlock(&child_ctx->mutex);

        put_ctx(child_ctx);
}
5505
5506 static void perf_free_event(struct perf_event *event,
5507                             struct perf_event_context *ctx)
5508 {
5509         struct perf_event *parent = event->parent;
5510
5511         if (WARN_ON_ONCE(!parent))
5512                 return;
5513
5514         mutex_lock(&parent->child_mutex);
5515         list_del_init(&event->child_list);
5516         mutex_unlock(&parent->child_mutex);
5517
5518         fput(parent->filp);
5519
5520         perf_group_detach(event);
5521         list_del_event(event, ctx);
5522         free_event(event);
5523 }
5524
5525 /*
5526  * free an unexposed, unused context as created by inheritance by
5527  * init_task below, used by fork() in case of fail.
5528  */
5529 void perf_event_free_task(struct task_struct *task)
5530 {
5531         struct perf_event_context *ctx = task->perf_event_ctxp;
5532         struct perf_event *event, *tmp;
5533
5534         if (!ctx)
5535                 return;
5536
5537         mutex_lock(&ctx->mutex);
5538 again:
5539         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5540                 perf_free_event(event, ctx);
5541
5542         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5543                                  group_entry)
5544                 perf_free_event(event, ctx);
5545
5546         if (!list_empty(&ctx->pinned_groups) ||
5547             !list_empty(&ctx->flexible_groups))
5548                 goto again;
5549
5550         mutex_unlock(&ctx->mutex);
5551
5552         put_ctx(ctx);
5553 }
5554
5555 static int
5556 inherit_task_group(struct perf_event *event, struct task_struct *parent,
5557                    struct perf_event_context *parent_ctx,
5558                    struct task_struct *child,
5559                    int *inherited_all)
5560 {
5561         int ret;
5562         struct perf_event_context *child_ctx = child->perf_event_ctxp;
5563
5564         if (!event->attr.inherit) {
5565                 *inherited_all = 0;
5566                 return 0;
5567         }
5568
5569         if (!child_ctx) {
5570                 /*
5571                  * This is executed from the parent task context, so
5572                  * inherit events that have been marked for cloning.
5573                  * First allocate and initialize a context for the
5574                  * child.
5575                  */
5576
5577                 child_ctx = kzalloc(sizeof(struct perf_event_context),
5578                                     GFP_KERNEL);
5579                 if (!child_ctx)
5580                         return -ENOMEM;
5581
5582                 __perf_event_init_context(child_ctx, child);
5583                 child->perf_event_ctxp = child_ctx;
5584                 get_task_struct(child);
5585         }
5586
5587         ret = inherit_group(event, parent, parent_ctx,
5588                             child, child_ctx);
5589
5590         if (ret)
5591                 *inherited_all = 0;
5592
5593         return ret;
5594 }
5595
5596
5597 /*
5598  * Initialize the perf_event context in task_struct
5599  */
5600 int perf_event_init_task(struct task_struct *child)
5601 {
5602         struct perf_event_context *child_ctx, *parent_ctx;
5603         struct perf_event_context *cloned_ctx;
5604         struct perf_event *event;
5605         struct task_struct *parent = current;
5606         int inherited_all = 1;
5607         unsigned long flags;
5608         int ret = 0;
5609
5610         child->perf_event_ctxp = NULL;
5611
5612         mutex_init(&child->perf_event_mutex);
5613         INIT_LIST_HEAD(&child->perf_event_list);
5614
5615         if (likely(!parent->perf_event_ctxp))
5616                 return 0;
5617
5618         /*
5619          * If the parent's context is a clone, pin it so it won't get
5620          * swapped under us.
5621          */
5622         parent_ctx = perf_pin_task_context(parent);
5623
5624         /*
5625          * No need to check if parent_ctx != NULL here; since we saw
5626          * it non-NULL earlier, the only reason for it to become NULL
5627          * is if we exit, and since we're currently in the middle of
5628          * a fork we can't be exiting at the same time.
5629          */
5630
5631         /*
5632          * Lock the parent list. No need to lock the child - not PID
5633          * hashed yet and not running, so nobody can access it.
5634          */
5635         mutex_lock(&parent_ctx->mutex);
5636
5637         /*
5638          * We dont have to disable NMIs - we are only looking at
5639          * the list, not manipulating it:
5640          */
5641         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5642                 ret = inherit_task_group(event, parent, parent_ctx, child,
5643                                          &inherited_all);
5644                 if (ret)
5645                         break;
5646         }
5647
5648         /*
5649          * We can't hold ctx->lock when iterating the ->flexible_group list due
5650          * to allocations, but we need to prevent rotation because
5651          * rotate_ctx() will change the list from interrupt context.
5652          */
5653         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
5654         parent_ctx->rotate_disable = 1;
5655         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
5656
5657         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5658                 ret = inherit_task_group(event, parent, parent_ctx, child,
5659                                          &inherited_all);
5660                 if (ret)
5661                         break;
5662         }
5663
5664         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
5665         parent_ctx->rotate_disable = 0;
5666         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
5667
5668         child_ctx = child->perf_event_ctxp;
5669
5670         if (child_ctx && inherited_all) {
5671                 /*
5672                  * Mark the child context as a clone of the parent
5673                  * context, or of whatever the parent is a clone of.
5674                  * Note that if the parent is a clone, it could get
5675                  * uncloned at any point, but that doesn't matter
5676                  * because the list of events and the generation
5677                  * count can't have changed since we took the mutex.
5678                  */
5679                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
5680                 if (cloned_ctx) {
5681                         child_ctx->parent_ctx = cloned_ctx;
5682                         child_ctx->parent_gen = parent_ctx->parent_gen;
5683                 } else {
5684                         child_ctx->parent_ctx = parent_ctx;
5685                         child_ctx->parent_gen = parent_ctx->generation;
5686                 }
5687                 get_ctx(child_ctx->parent_ctx);
5688         }
5689
5690         mutex_unlock(&parent_ctx->mutex);
5691
5692         perf_unpin_context(parent_ctx);
5693
5694         return ret;
5695 }
5696
5697 static void __init perf_event_init_all_cpus(void)
5698 {
5699         int cpu;
5700         struct perf_cpu_context *cpuctx;
5701
5702         for_each_possible_cpu(cpu) {
5703                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5704                 mutex_init(&cpuctx->hlist_mutex);
5705                 __perf_event_init_context(&cpuctx->ctx, NULL);
5706         }
5707 }
5708
5709 static void __cpuinit perf_event_init_cpu(int cpu)
5710 {
5711         struct perf_cpu_context *cpuctx;
5712
5713         cpuctx = &per_cpu(perf_cpu_context, cpu);
5714
5715         spin_lock(&perf_resource_lock);
5716         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5717         spin_unlock(&perf_resource_lock);
5718
5719         mutex_lock(&cpuctx->hlist_mutex);
5720         if (cpuctx->hlist_refcount > 0) {
5721                 struct swevent_hlist *hlist;
5722
5723                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5724                 WARN_ON_ONCE(!hlist);
5725                 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5726         }
5727         mutex_unlock(&cpuctx->hlist_mutex);
5728 }
5729
5730 #ifdef CONFIG_HOTPLUG_CPU
5731 static void __perf_event_exit_cpu(void *info)
5732 {
5733         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5734         struct perf_event_context *ctx = &cpuctx->ctx;
5735         struct perf_event *event, *tmp;
5736
5737         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5738                 __perf_event_remove_from_context(event);
5739         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5740                 __perf_event_remove_from_context(event);
5741 }
5742 static void perf_event_exit_cpu(int cpu)
5743 {
5744         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5745         struct perf_event_context *ctx = &cpuctx->ctx;
5746
5747         mutex_lock(&cpuctx->hlist_mutex);
5748         swevent_hlist_release(cpuctx);
5749         mutex_unlock(&cpuctx->hlist_mutex);
5750
5751         mutex_lock(&ctx->mutex);
5752         smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5753         mutex_unlock(&ctx->mutex);
5754 }
5755 #else
/* Without CONFIG_HOTPLUG_CPU there is nothing to tear down. */
static inline void perf_event_exit_cpu(int cpu)
{
}
5757 #endif
5758
5759 static int __cpuinit
5760 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5761 {
5762         unsigned int cpu = (long)hcpu;
5763
5764         switch (action) {
5765
5766         case CPU_UP_PREPARE:
5767         case CPU_UP_PREPARE_FROZEN:
5768                 perf_event_init_cpu(cpu);
5769                 break;
5770
5771         case CPU_DOWN_PREPARE:
5772         case CPU_DOWN_PREPARE_FROZEN:
5773                 perf_event_exit_cpu(cpu);
5774                 break;
5775
5776         default:
5777                 break;
5778         }
5779
5780         return NOTIFY_OK;
5781 }
5782
5783 /*
5784  * This has to have a higher priority than migration_notifier in sched.c.
5785  */
5786 static struct notifier_block __cpuinitdata perf_cpu_nb = {
5787         .notifier_call          = perf_cpu_notify,
5788         .priority               = 20,
5789 };
5790
5791 void __init perf_event_init(void)
5792 {
5793         perf_event_init_all_cpus();
5794         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5795                         (void *)(long)smp_processor_id());
5796         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5797                         (void *)(long)smp_processor_id());
5798         register_cpu_notifier(&perf_cpu_nb);
5799 }
5800
5801 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5802                                         struct sysdev_class_attribute *attr,
5803                                         char *buf)
5804 {
5805         return sprintf(buf, "%d\n", perf_reserved_percpu);
5806 }
5807
5808 static ssize_t
5809 perf_set_reserve_percpu(struct sysdev_class *class,
5810                         struct sysdev_class_attribute *attr,
5811                         const char *buf,
5812                         size_t count)
5813 {
5814         struct perf_cpu_context *cpuctx;
5815         unsigned long val;
5816         int err, cpu, mpt;
5817
5818         err = strict_strtoul(buf, 10, &val);
5819         if (err)
5820                 return err;
5821         if (val > perf_max_events)
5822                 return -EINVAL;
5823
5824         spin_lock(&perf_resource_lock);
5825         perf_reserved_percpu = val;
5826         for_each_online_cpu(cpu) {
5827                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5828                 raw_spin_lock_irq(&cpuctx->ctx.lock);
5829                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5830                           perf_max_events - perf_reserved_percpu);
5831                 cpuctx->max_pertask = mpt;
5832                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5833         }
5834         spin_unlock(&perf_resource_lock);
5835
5836         return count;
5837 }
5838
5839 static ssize_t perf_show_overcommit(struct sysdev_class *class,
5840                                     struct sysdev_class_attribute *attr,
5841                                     char *buf)
5842 {
5843         return sprintf(buf, "%d\n", perf_overcommit);
5844 }
5845
5846 static ssize_t
5847 perf_set_overcommit(struct sysdev_class *class,
5848                     struct sysdev_class_attribute *attr,
5849                     const char *buf, size_t count)
5850 {
5851         unsigned long val;
5852         int err;
5853
5854         err = strict_strtoul(buf, 10, &val);
5855         if (err)
5856                 return err;
5857         if (val > 1)
5858                 return -EINVAL;
5859
5860         spin_lock(&perf_resource_lock);
5861         perf_overcommit = val;
5862         spin_unlock(&perf_resource_lock);
5863
5864         return count;
5865 }
5866
5867 static SYSDEV_CLASS_ATTR(
5868                                 reserve_percpu,
5869                                 0644,
5870                                 perf_show_reserve_percpu,
5871                                 perf_set_reserve_percpu
5872                         );
5873
5874 static SYSDEV_CLASS_ATTR(
5875                                 overcommit,
5876                                 0644,
5877                                 perf_show_overcommit,
5878                                 perf_set_overcommit
5879                         );
5880
5881 static struct attribute *perfclass_attrs[] = {
5882         &attr_reserve_percpu.attr,
5883         &attr_overcommit.attr,
5884         NULL
5885 };
5886
5887 static struct attribute_group perfclass_attr_group = {
5888         .attrs                  = perfclass_attrs,
5889         .name                   = "perf_events",
5890 };
5891
5892 static int __init perf_event_sysfs_init(void)
5893 {
5894         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5895                                   &perfclass_attr_group);
5896 }
5897 device_initcall(perf_event_sysfs_init);