kernel/perf_event.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/sysfs.h>
  19 #include <linux/dcache.h>
  20 #include <linux/percpu.h>
  21 #include <linux/ptrace.h>
  22 #include <linux/vmstat.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/hardirq.h>
  25 #include <linux/rculist.h>
  26 #include <linux/uaccess.h>
  27 #include <linux/syscalls.h>
  28 #include <linux/anon_inodes.h>
  29 #include <linux/kernel_stat.h>
  30 #include <linux/perf_event.h>
  31 #include <linux/ftrace_event.h>
  32 #include <linux/hw_breakpoint.h>
  33
  34 #include <asm/irq_regs.h>
  35
  36 /*
  37  * Each CPU has a list of per CPU events:
  38  */
  39 static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  40
  41 int perf_max_events __read_mostly = 1;
  42 static int perf_reserved_percpu __read_mostly;
  43 static int perf_overcommit __read_mostly = 1;
  44
  45 static atomic_t nr_events __read_mostly;
  46 static atomic_t nr_mmap_events __read_mostly;
  47 static atomic_t nr_comm_events __read_mostly;
  48 static atomic_t nr_task_events __read_mostly;
  49
  50 /*
  51  * perf event paranoia level:
  52  *  -1 - not paranoid at all
  53  *   0 - disallow raw tracepoint access for unpriv
  54  *   1 - disallow cpu events for unpriv
  55  *   2 - disallow kernel profiling for unpriv
  56  */
  57 int sysctl_perf_event_paranoid __read_mostly = 1;
  58
  59 static inline bool perf_paranoid_tracepoint_raw(void)
  60 {
  61         return sysctl_perf_event_paranoid > -1;
  62 }
  63
  64 static inline bool perf_paranoid_cpu(void)
  65 {
  66         return sysctl_perf_event_paranoid > 0;
  67 }
  68
  69 static inline bool perf_paranoid_kernel(void)
  70 {
  71         return sysctl_perf_event_paranoid > 1;
  72 }
  73
  74 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
  75
  76 /*
  77  * max perf event sample rate
  78  */
  79 int sysctl_perf_event_sample_rate __read_mostly = 100000;
  80
  81 static atomic64_t perf_event_id;
  82
  83 /*
  84  * Lock for (sysadmin-configurable) event reservations:
  85  */
  86 static DEFINE_SPINLOCK(perf_resource_lock);
  87
  88 /*
  89  * Architecture provided APIs - weak aliases:
  90  */
  91 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
  92 {
  93         return NULL;
  94 }
  95
  96 void __weak hw_perf_disable(void)               { barrier(); }
  97 void __weak hw_perf_enable(void)                { barrier(); }
  98
  99 void __weak hw_perf_event_setup(int cpu)        { barrier(); }
 100 void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
 101 void __weak hw_perf_event_setup_offline(int cpu)        { barrier(); }
 102
 103 int __weak
 104 hw_perf_group_sched_in(struct perf_event *group_leader,
 105                struct perf_cpu_context *cpuctx,
 106                struct perf_event_context *ctx)
 107 {
 108         return 0;
 109 }
 110
 111 void __weak perf_event_print_debug(void)        { }
 112
 113 static DEFINE_PER_CPU(int, perf_disable_count);
 114
 115 void __perf_disable(void)
 116 {
 117         __get_cpu_var(perf_disable_count)++;
 118 }
 119
 120 bool __perf_enable(void)
 121 {
 122         return !--__get_cpu_var(perf_disable_count);
 123 }
 124
 125 void perf_disable(void)
 126 {
 127         __perf_disable();
 128         hw_perf_disable();
 129 }
 130
 131 void perf_enable(void)
 132 {
 133         if (__perf_enable())
 134                 hw_perf_enable();
 135 }
 136
 137 static void get_ctx(struct perf_event_context *ctx)
 138 {
 139         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 140 }
 141
 142 static void free_ctx(struct rcu_head *head)
 143 {
 144         struct perf_event_context *ctx;
 145
 146         ctx = container_of(head, struct perf_event_context, rcu_head);
 147         kfree(ctx);
 148 }
 149
 150 static void put_ctx(struct perf_event_context *ctx)
 151 {
 152         if (atomic_dec_and_test(&ctx->refcount)) {
 153                 if (ctx->parent_ctx)
 154                         put_ctx(ctx->parent_ctx);
 155                 if (ctx->task)
 156                         put_task_struct(ctx->task);
 157                 call_rcu(&ctx->rcu_head, free_ctx);
 158         }
 159 }
 160
 161 static void unclone_ctx(struct perf_event_context *ctx)
 162 {
 163         if (ctx->parent_ctx) {
 164                 put_ctx(ctx->parent_ctx);
 165                 ctx->parent_ctx = NULL;
 166         }
 167 }
 168
 169 /*
 170  * If we inherit events we want to return the parent event id
 171  * to userspace.
 172  */
 173 static u64 primary_event_id(struct perf_event *event)
 174 {
 175         u64 id = event->id;
 176
 177         if (event->parent)
 178                 id = event->parent->id;
 179
 180         return id;
 181 }
 182
 183 /*
 184  * Get the perf_event_context for a task and lock it.
 185  * This has to cope with with the fact that until it is locked,
 186  * the context could get moved to another task.
 187  */
 188 static struct perf_event_context *
 189 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 190 {
 191         struct perf_event_context *ctx;
 192
 193         rcu_read_lock();
 194  retry:
 195         ctx = rcu_dereference(task->perf_event_ctxp);
 196         if (ctx) {
 197                 /*
 198                  * If this context is a clone of another, it might
 199                  * get swapped for another underneath us by
 200                  * perf_event_task_sched_out, though the
 201                  * rcu_read_lock() protects us from any context
 202                  * getting freed.  Lock the context and check if it
 203                  * got swapped before we could get the lock, and retry
 204                  * if so.  If we locked the right context, then it
 205                  * can't get swapped on us any more.
 206                  */
 207                 raw_spin_lock_irqsave(&ctx->lock, *flags);
 208                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
 209                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 210                         goto retry;
 211                 }
 212
 213                 if (!atomic_inc_not_zero(&ctx->refcount)) {
 214                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 215                         ctx = NULL;
 216                 }
 217         }
 218         rcu_read_unlock();
 219         return ctx;
 220 }
 221
 222 /*
 223  * Get the context for a task and increment its pin_count so it
 224  * can't get swapped to another task.  This also increments its
 225  * reference count so that the context can't get freed.
 226  */
 227 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
 228 {
 229         struct perf_event_context *ctx;
 230         unsigned long flags;
 231
 232         ctx = perf_lock_task_context(task, &flags);
 233         if (ctx) {
 234                 ++ctx->pin_count;
 235                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
 236         }
 237         return ctx;
 238 }
 239
 240 static void perf_unpin_context(struct perf_event_context *ctx)
 241 {
 242         unsigned long flags;
 243
 244         raw_spin_lock_irqsave(&ctx->lock, flags);
 245         --ctx->pin_count;
 246         raw_spin_unlock_irqrestore(&ctx->lock, flags);
 247         put_ctx(ctx);
 248 }
 249
 250 static inline u64 perf_clock(void)
 251 {
 252         return cpu_clock(smp_processor_id());
 253 }
 254
 255 /*
 256  * Update the record of the current time in a context.
 257  */
 258 static void update_context_time(struct perf_event_context *ctx)
 259 {
 260         u64 now = perf_clock();
 261
 262         ctx->time += now - ctx->timestamp;
 263         ctx->timestamp = now;
 264 }
 265
 266 /*
 267  * Update the total_time_enabled and total_time_running fields for a event.
 268  */
 269 static void update_event_times(struct perf_event *event)
 270 {
 271         struct perf_event_context *ctx = event->ctx;
 272         u64 run_end;
 273
 274         if (event->state < PERF_EVENT_STATE_INACTIVE ||
 275             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 276                 return;
 277
 278         if (ctx->is_active)
 279                 run_end = ctx->time;
 280         else
 281                 run_end = event->tstamp_stopped;
 282
 283         event->total_time_enabled = run_end - event->tstamp_enabled;
 284
 285         if (event->state == PERF_EVENT_STATE_INACTIVE)
 286                 run_end = event->tstamp_stopped;
 287         else
 288                 run_end = ctx->time;
 289
 290         event->total_time_running = run_end - event->tstamp_running;
 291 }
 292
 293 static struct list_head *
 294 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 295 {
 296         if (event->attr.pinned)
 297                 return &ctx->pinned_groups;
 298         else
 299                 return &ctx->flexible_groups;
 300 }
 301
 302 /*
 303  * Add a event from the lists for its context.
 304  * Must be called with ctx->mutex and ctx->lock held.
 305  */
 306 static void
 307 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 308 {
 309         struct perf_event *group_leader = event->group_leader;
 310
 311         /*
 312          * Depending on whether it is a standalone or sibling event,
 313          * add it straight to the context's event list, or to the group
 314          * leader's sibling list:
 315          */
 316         if (group_leader == event) {
 317                 struct list_head *list;
 318
 319                 if (is_software_event(event))
 320                         event->group_flags |= PERF_GROUP_SOFTWARE;
 321
 322                 list = ctx_group_list(event, ctx);
 323                 list_add_tail(&event->group_entry, list);
 324         } else {
 325                 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
 326                     !is_software_event(event))
 327                         group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
 328
 329                 list_add_tail(&event->group_entry, &group_leader->sibling_list);
 330                 group_leader->nr_siblings++;
 331         }
 332
 333         list_add_rcu(&event->event_entry, &ctx->event_list);
 334         ctx->nr_events++;
 335         if (event->attr.inherit_stat)
 336                 ctx->nr_stat++;
 337 }
 338
 339 /*
 340  * Remove a event from the lists for its context.
 341  * Must be called with ctx->mutex and ctx->lock held.
 342  */
 343 static void
 344 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 345 {
 346         struct perf_event *sibling, *tmp;
 347
 348         if (list_empty(&event->group_entry))
 349                 return;
 350         ctx->nr_events--;
 351         if (event->attr.inherit_stat)
 352                 ctx->nr_stat--;
 353
 354         list_del_init(&event->group_entry);
 355         list_del_rcu(&event->event_entry);
 356
 357         if (event->group_leader != event)
 358                 event->group_leader->nr_siblings--;
 359
 360         update_event_times(event);
 361
 362         /*
 363          * If event was in error state, then keep it
 364          * that way, otherwise bogus counts will be
 365          * returned on read(). The only way to get out
 366          * of error state is by explicit re-enabling
 367          * of the event
 368          */
 369         if (event->state > PERF_EVENT_STATE_OFF)
 370                 event->state = PERF_EVENT_STATE_OFF;
 371
 372         /*
 373          * If this was a group event with sibling events then
 374          * upgrade the siblings to singleton events by adding them
 375          * to the context list directly:
 376          */
 377         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 378                 struct list_head *list;
 379
 380                 list = ctx_group_list(event, ctx);
 381                 list_move_tail(&sibling->group_entry, list);
 382                 sibling->group_leader = sibling;
 383
 384                 /* Inherit group flags from the previous leader */
 385                 sibling->group_flags = event->group_flags;
 386         }
 387 }
 388
 389 static void
 390 event_sched_out(struct perf_event *event,
 391                   struct perf_cpu_context *cpuctx,
 392                   struct perf_event_context *ctx)
 393 {
 394         if (event->state != PERF_EVENT_STATE_ACTIVE)
 395                 return;
 396
 397         event->state = PERF_EVENT_STATE_INACTIVE;
 398         if (event->pending_disable) {
 399                 event->pending_disable = 0;
 400                 event->state = PERF_EVENT_STATE_OFF;
 401         }
 402         event->tstamp_stopped = ctx->time;
 403         event->pmu->disable(event);
 404         event->oncpu = -1;
 405
 406         if (!is_software_event(event))
 407                 cpuctx->active_oncpu--;
 408         ctx->nr_active--;
 409         if (event->attr.exclusive || !cpuctx->active_oncpu)
 410                 cpuctx->exclusive = 0;
 411 }
 412
 413 static void
 414 group_sched_out(struct perf_event *group_event,
 415                 struct perf_cpu_context *cpuctx,
 416                 struct perf_event_context *ctx)
 417 {
 418         struct perf_event *event;
 419
 420         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
 421                 return;
 422
 423         event_sched_out(group_event, cpuctx, ctx);
 424
 425         /*
 426          * Schedule out siblings (if any):
 427          */
 428         list_for_each_entry(event, &group_event->sibling_list, group_entry)
 429                 event_sched_out(event, cpuctx, ctx);
 430
 431         if (group_event->attr.exclusive)
 432                 cpuctx->exclusive = 0;
 433 }
 434
 435 /*
 436  * Cross CPU call to remove a performance event
 437  *
 438  * We disable the event on the hardware level first. After that we
 439  * remove it from the context list.
 440  */
 441 static void __perf_event_remove_from_context(void *info)
 442 {
 443         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 444         struct perf_event *event = info;
 445         struct perf_event_context *ctx = event->ctx;
 446
 447         /*
 448          * If this is a task context, we need to check whether it is
 449          * the current task context of this cpu. If not it has been
 450          * scheduled out before the smp call arrived.
 451          */
 452         if (ctx->task && cpuctx->task_ctx != ctx)
 453                 return;
 454
 455         raw_spin_lock(&ctx->lock);
 456         /*
 457          * Protect the list operation against NMI by disabling the
 458          * events on a global level.
 459          */
 460         perf_disable();
 461
 462         event_sched_out(event, cpuctx, ctx);
 463
 464         list_del_event(event, ctx);
 465
 466         if (!ctx->task) {
 467                 /*
 468                  * Allow more per task events with respect to the
 469                  * reservation:
 470                  */
 471                 cpuctx->max_pertask =
 472                         min(perf_max_events - ctx->nr_events,
 473                             perf_max_events - perf_reserved_percpu);
 474         }
 475
 476         perf_enable();
 477         raw_spin_unlock(&ctx->lock);
 478 }
 479
 480
 481 /*
 482  * Remove the event from a task's (or a CPU's) list of events.
 483  *
 484  * Must be called with ctx->mutex held.
 485  *
 486  * CPU events are removed with a smp call. For task events we only
 487  * call when the task is on a CPU.
 488  *
 489  * If event->ctx is a cloned context, callers must make sure that
 490  * every task struct that event->ctx->task could possibly point to
 491  * remains valid.  This is OK when called from perf_release since
 492  * that only calls us on the top-level context, which can't be a clone.
 493  * When called from perf_event_exit_task, it's OK because the
 494  * context has been detached from its task.
 495  */
 496 static void perf_event_remove_from_context(struct perf_event *event)
 497 {
 498         struct perf_event_context *ctx = event->ctx;
 499         struct task_struct *task = ctx->task;
 500
 501         if (!task) {
 502                 /*
 503                  * Per cpu events are removed via an smp call and
 504                  * the removal is always successful.
 505                  */
 506                 smp_call_function_single(event->cpu,
 507                                          __perf_event_remove_from_context,
 508                                          event, 1);
 509                 return;
 510         }
 511
 512 retry:
 513         task_oncpu_function_call(task, __perf_event_remove_from_context,
 514                                  event);
 515
 516         raw_spin_lock_irq(&ctx->lock);
 517         /*
 518          * If the context is active we need to retry the smp call.
 519          */
 520         if (ctx->nr_active && !list_empty(&event->group_entry)) {
 521                 raw_spin_unlock_irq(&ctx->lock);
 522                 goto retry;
 523         }
 524
 525         /*
 526          * The lock prevents that this context is scheduled in so we
 527          * can remove the event safely, if the call above did not
 528          * succeed.
 529          */
 530         if (!list_empty(&event->group_entry))
 531                 list_del_event(event, ctx);
 532         raw_spin_unlock_irq(&ctx->lock);
 533 }
 534
 535 /*
 536  * Update total_time_enabled and total_time_running for all events in a group.
 537  */
 538 static void update_group_times(struct perf_event *leader)
 539 {
 540         struct perf_event *event;
 541
 542         update_event_times(leader);
 543         list_for_each_entry(event, &leader->sibling_list, group_entry)
 544                 update_event_times(event);
 545 }
 546
 547 /*
 548  * Cross CPU call to disable a performance event
 549  */
 550 static void __perf_event_disable(void *info)
 551 {
 552         struct perf_event *event = info;
 553         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 554         struct perf_event_context *ctx = event->ctx;
 555
 556         /*
 557          * If this is a per-task event, need to check whether this
 558          * event's task is the current task on this cpu.
 559          */
 560         if (ctx->task && cpuctx->task_ctx != ctx)
 561                 return;
 562
 563         raw_spin_lock(&ctx->lock);
 564
 565         /*
 566          * If the event is on, turn it off.
 567          * If it is in error state, leave it in error state.
 568          */
 569         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 570                 update_context_time(ctx);
 571                 update_group_times(event);
 572                 if (event == event->group_leader)
 573                         group_sched_out(event, cpuctx, ctx);
 574                 else
 575                         event_sched_out(event, cpuctx, ctx);
 576                 event->state = PERF_EVENT_STATE_OFF;
 577         }
 578
 579         raw_spin_unlock(&ctx->lock);
 580 }
 581
 582 /*
 583  * Disable a event.
 584  *
 585  * If event->ctx is a cloned context, callers must make sure that
 586  * every task struct that event->ctx->task could possibly point to
 587  * remains valid.  This condition is satisifed when called through
 588  * perf_event_for_each_child or perf_event_for_each because they
 589  * hold the top-level event's child_mutex, so any descendant that
 590  * goes to exit will block in sync_child_event.
 591  * When called from perf_pending_event it's OK because event->ctx
 592  * is the current context on this CPU and preemption is disabled,
 593  * hence we can't get into perf_event_task_sched_out for this context.
 594  */
 595 void perf_event_disable(struct perf_event *event)
 596 {
 597         struct perf_event_context *ctx = event->ctx;
 598         struct task_struct *task = ctx->task;
 599
 600         if (!task) {
 601                 /*
 602                  * Disable the event on the cpu that it's on
 603                  */
 604                 smp_call_function_single(event->cpu, __perf_event_disable,
 605                                          event, 1);
 606                 return;
 607         }
 608
 609  retry:
 610         task_oncpu_function_call(task, __perf_event_disable, event);
 611
 612         raw_spin_lock_irq(&ctx->lock);
 613         /*
 614          * If the event is still active, we need to retry the cross-call.
 615          */
 616         if (event->state == PERF_EVENT_STATE_ACTIVE) {
 617                 raw_spin_unlock_irq(&ctx->lock);
 618                 goto retry;
 619         }
 620
 621         /*
 622          * Since we have the lock this context can't be scheduled
 623          * in, so we can change the state safely.
 624          */
 625         if (event->state == PERF_EVENT_STATE_INACTIVE) {
 626                 update_group_times(event);
 627                 event->state = PERF_EVENT_STATE_OFF;
 628         }
 629
 630         raw_spin_unlock_irq(&ctx->lock);
 631 }
 632
 633 static int
 634 event_sched_in(struct perf_event *event,
 635                  struct perf_cpu_context *cpuctx,
 636                  struct perf_event_context *ctx)
 637 {
 638         if (event->state <= PERF_EVENT_STATE_OFF)
 639                 return 0;
 640
 641         event->state = PERF_EVENT_STATE_ACTIVE;
 642         event->oncpu = smp_processor_id();
 643         /*
 644          * The new state must be visible before we turn it on in the hardware:
 645          */
 646         smp_wmb();
 647
 648         if (event->pmu->enable(event)) {
 649                 event->state = PERF_EVENT_STATE_INACTIVE;
 650                 event->oncpu = -1;
 651                 return -EAGAIN;
 652         }
 653
 654         event->tstamp_running += ctx->time - event->tstamp_stopped;
 655
 656         if (!is_software_event(event))
 657                 cpuctx->active_oncpu++;
 658         ctx->nr_active++;
 659
 660         if (event->attr.exclusive)
 661                 cpuctx->exclusive = 1;
 662
 663         return 0;
 664 }
 665
 666 static int
 667 group_sched_in(struct perf_event *group_event,
 668                struct perf_cpu_context *cpuctx,
 669                struct perf_event_context *ctx)
 670 {
 671         struct perf_event *event, *partial_group;
 672         int ret;
 673
 674         if (group_event->state == PERF_EVENT_STATE_OFF)
 675                 return 0;
 676
 677         ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
 678         if (ret)
 679                 return ret < 0 ? ret : 0;
 680
 681         if (event_sched_in(group_event, cpuctx, ctx))
 682                 return -EAGAIN;
 683
 684         /*
 685          * Schedule in siblings as one group (if any):
 686          */
 687         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 688                 if (event_sched_in(event, cpuctx, ctx)) {
 689                         partial_group = event;
 690                         goto group_error;
 691                 }
 692         }
 693
 694         return 0;
 695
 696 group_error:
 697         /*
 698          * Groups can be scheduled in as one unit only, so undo any
 699          * partial group before returning:
 700          */
 701         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 702                 if (event == partial_group)
 703                         break;
 704                 event_sched_out(event, cpuctx, ctx);
 705         }
 706         event_sched_out(group_event, cpuctx, ctx);
 707
 708         return -EAGAIN;
 709 }
 710
 711 /*
 712  * Work out whether we can put this event group on the CPU now.
 713  */
 714 static int group_can_go_on(struct perf_event *event,
 715                            struct perf_cpu_context *cpuctx,
 716                            int can_add_hw)
 717 {
 718         /*
 719          * Groups consisting entirely of software events can always go on.
 720          */
 721         if (event->group_flags & PERF_GROUP_SOFTWARE)
 722                 return 1;
 723         /*
 724          * If an exclusive group is already on, no other hardware
 725          * events can go on.
 726          */
 727         if (cpuctx->exclusive)
 728                 return 0;
 729         /*
 730          * If this group is exclusive and there are already
 731          * events on the CPU, it can't go on.
 732          */
 733         if (event->attr.exclusive && cpuctx->active_oncpu)
 734                 return 0;
 735         /*
 736          * Otherwise, try to add it if all previous groups were able
 737          * to go on.
 738          */
 739         return can_add_hw;
 740 }
 741
 742 static void add_event_to_ctx(struct perf_event *event,
 743                                struct perf_event_context *ctx)
 744 {
 745         list_add_event(event, ctx);
 746         event->tstamp_enabled = ctx->time;
 747         event->tstamp_running = ctx->time;
 748         event->tstamp_stopped = ctx->time;
 749 }
 750
 751 /*
 752  * Cross CPU call to install and enable a performance event
 753  *
 754  * Must be called with ctx->mutex held
 755  */
 756 static void __perf_install_in_context(void *info)
 757 {
 758         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 759         struct perf_event *event = info;
 760         struct perf_event_context *ctx = event->ctx;
 761         struct perf_event *leader = event->group_leader;
 762         int err;
 763
 764         /*
 765          * If this is a task context, we need to check whether it is
 766          * the current task context of this cpu. If not it has been
 767          * scheduled out before the smp call arrived.
 768          * Or possibly this is the right context but it isn't
 769          * on this cpu because it had no events.
 770          */
 771         if (ctx->task && cpuctx->task_ctx != ctx) {
 772                 if (cpuctx->task_ctx || ctx->task != current)
 773                         return;
 774                 cpuctx->task_ctx = ctx;
 775         }
 776
 777         raw_spin_lock(&ctx->lock);
 778         ctx->is_active = 1;
 779         update_context_time(ctx);
 780
 781         /*
 782          * Protect the list operation against NMI by disabling the
 783          * events on a global level. NOP for non NMI based events.
 784          */
 785         perf_disable();
 786
 787         add_event_to_ctx(event, ctx);
 788
 789         if (event->cpu != -1 && event->cpu != smp_processor_id())
 790                 goto unlock;
 791
 792         /*
 793          * Don't put the event on if it is disabled or if
 794          * it is in a group and the group isn't on.
 795          */
 796         if (event->state != PERF_EVENT_STATE_INACTIVE ||
 797             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
 798                 goto unlock;
 799
 800         /*
 801          * An exclusive event can't go on if there are already active
 802          * hardware events, and no hardware event can go on if there
 803          * is already an exclusive event on.
 804          */
 805         if (!group_can_go_on(event, cpuctx, 1))
 806                 err = -EEXIST;
 807         else
 808                 err = event_sched_in(event, cpuctx, ctx);
 809
 810         if (err) {
 811                 /*
 812                  * This event couldn't go on.  If it is in a group
 813                  * then we have to pull the whole group off.
 814                  * If the event group is pinned then put it in error state.
 815                  */
 816                 if (leader != event)
 817                         group_sched_out(leader, cpuctx, ctx);
 818                 if (leader->attr.pinned) {
 819                         update_group_times(leader);
 820                         leader->state = PERF_EVENT_STATE_ERROR;
 821                 }
 822         }
 823
 824         if (!err && !ctx->task && cpuctx->max_pertask)
 825                 cpuctx->max_pertask--;
 826
 827  unlock:
 828         perf_enable();
 829
 830         raw_spin_unlock(&ctx->lock);
 831 }
 832
 833 /*
 834  * Attach a performance event to a context
 835  *
 836  * First we add the event to the list with the hardware enable bit
 837  * in event->hw_config cleared.
 838  *
 839  * If the event is attached to a task which is on a CPU we use a smp
 840  * call to enable it in the task context. The task might have been
 841  * scheduled away, but we check this in the smp call again.
 842  *
 843  * Must be called with ctx->mutex held.
 844  */
 845 static void
 846 perf_install_in_context(struct perf_event_context *ctx,
 847                         struct perf_event *event,
 848                         int cpu)
 849 {
 850         struct task_struct *task = ctx->task;
 851
 852         if (!task) {
 853                 /*
 854                  * Per cpu events are installed via an smp call and
 855                  * the install is always successful.
 856                  */
 857                 smp_call_function_single(cpu, __perf_install_in_context,
 858                                          event, 1);
 859                 return;
 860         }
 861
 862 retry:
 863         task_oncpu_function_call(task, __perf_install_in_context,
 864                                  event);
 865
 866         raw_spin_lock_irq(&ctx->lock);
 867         /*
 868          * we need to retry the smp call.
 869          */
 870         if (ctx->is_active && list_empty(&event->group_entry)) {
 871                 raw_spin_unlock_irq(&ctx->lock);
 872                 goto retry;
 873         }
 874
 875         /*
 876          * The lock prevents that this context is scheduled in so we
 877          * can add the event safely, if it the call above did not
 878          * succeed.
 879          */
 880         if (list_empty(&event->group_entry))
 881                 add_event_to_ctx(event, ctx);
 882         raw_spin_unlock_irq(&ctx->lock);
 883 }
 884
 885 /*
 886  * Put a event into inactive state and update time fields.
 887  * Enabling the leader of a group effectively enables all
 888  * the group members that aren't explicitly disabled, so we
 889  * have to update their ->tstamp_enabled also.
 890  * Note: this works for group members as well as group leaders
 891  * since the non-leader members' sibling_lists will be empty.
 892  */
 893 static void __perf_event_mark_enabled(struct perf_event *event,
 894                                         struct perf_event_context *ctx)
 895 {
 896         struct perf_event *sub;
 897
 898         event->state = PERF_EVENT_STATE_INACTIVE;
 899         event->tstamp_enabled = ctx->time - event->total_time_enabled;
 900         list_for_each_entry(sub, &event->sibling_list, group_entry)
 901                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 902                         sub->tstamp_enabled =
 903                                 ctx->time - sub->total_time_enabled;
 904 }
 905
 906 /*
 907  * Cross CPU call to enable a performance event
 908  */
 909 static void __perf_event_enable(void *info)
 910 {
 911         struct perf_event *event = info;
 912         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 913         struct perf_event_context *ctx = event->ctx;
 914         struct perf_event *leader = event->group_leader;
 915         int err;
 916
 917         /*
 918          * If this is a per-task event, need to check whether this
 919          * event's task is the current task on this cpu.
 920          */
 921         if (ctx->task && cpuctx->task_ctx != ctx) {
 922                 if (cpuctx->task_ctx || ctx->task != current)
 923                         return;
 924                 cpuctx->task_ctx = ctx;
 925         }
 926
 927         raw_spin_lock(&ctx->lock);
 928         ctx->is_active = 1;
 929         update_context_time(ctx);
 930
 931         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 932                 goto unlock;
 933         __perf_event_mark_enabled(event, ctx);
 934
 935         if (event->cpu != -1 && event->cpu != smp_processor_id())
 936                 goto unlock;
 937
 938         /*
 939          * If the event is in a group and isn't the group leader,
 940          * then don't put it on unless the group is on.
 941          */
 942         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 943                 goto unlock;
 944
 945         if (!group_can_go_on(event, cpuctx, 1)) {
 946                 err = -EEXIST;
 947         } else {
 948                 perf_disable();
 949                 if (event == leader)
 950                         err = group_sched_in(event, cpuctx, ctx);
 951                 else
 952                         err = event_sched_in(event, cpuctx, ctx);
 953                 perf_enable();
 954         }
 955
 956         if (err) {
 957                 /*
 958                  * If this event can't go on and it's part of a
 959                  * group, then the whole group has to come off.
 960                  */
 961                 if (leader != event)
 962                         group_sched_out(leader, cpuctx, ctx);
 963                 if (leader->attr.pinned) {
 964                         update_group_times(leader);
 965                         leader->state = PERF_EVENT_STATE_ERROR;
 966                 }
 967         }
 968
 969  unlock:
 970         raw_spin_unlock(&ctx->lock);
 971 }
 972
 973 /*
 974  * Enable a event.
 975  *
 976  * If event->ctx is a cloned context, callers must make sure that
 977  * every task struct that event->ctx->task could possibly point to
 978  * remains valid.  This condition is satisfied when called through
 979  * perf_event_for_each_child or perf_event_for_each as described
 980  * for perf_event_disable.
 981  */
 982 void perf_event_enable(struct perf_event *event)
 983 {
 984         struct perf_event_context *ctx = event->ctx;
 985         struct task_struct *task = ctx->task;
 986
 987         if (!task) {
 988                 /*
 989                  * Enable the event on the cpu that it's on
 990                  */
 991                 smp_call_function_single(event->cpu, __perf_event_enable,
 992                                          event, 1);
 993                 return;
 994         }
 995
 996         raw_spin_lock_irq(&ctx->lock);
 997         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 998                 goto out;
 999
1000         /*
1001          * If the event is in error state, clear that first.
1002          * That way, if we see the event in error state below, we
1003          * know that it has gone back into error state, as distinct
1004          * from the task having been scheduled away before the
1005          * cross-call arrived.
1006          */
1007         if (event->state == PERF_EVENT_STATE_ERROR)
1008                 event->state = PERF_EVENT_STATE_OFF;
1009
1010  retry:
1011         raw_spin_unlock_irq(&ctx->lock);
1012         task_oncpu_function_call(task, __perf_event_enable, event);
1013
1014         raw_spin_lock_irq(&ctx->lock);
1015
1016         /*
1017          * If the context is active and the event is still off,
1018          * we need to retry the cross-call.
1019          */
1020         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
1021                 goto retry;
1022
1023         /*
1024          * Since we have the lock this context can't be scheduled
1025          * in, so we can change the state safely.
1026          */
1027         if (event->state == PERF_EVENT_STATE_OFF)
1028                 __perf_event_mark_enabled(event, ctx);
1029
1030  out:
1031         raw_spin_unlock_irq(&ctx->lock);
1032 }
1033
1034 static int perf_event_refresh(struct perf_event *event, int refresh)
1035 {
1036         /*
1037          * not supported on inherited events
1038          */
1039         if (event->attr.inherit)
1040                 return -EINVAL;
1041
1042         atomic_add(refresh, &event->event_limit);
1043         perf_event_enable(event);
1044
1045         return 0;
1046 }
1047
1048 enum event_type_t {
1049         EVENT_FLEXIBLE = 0x1,
1050         EVENT_PINNED = 0x2,
1051         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1052 };
1053
1054 static void ctx_sched_out(struct perf_event_context *ctx,
1055                           struct perf_cpu_context *cpuctx,
1056                           enum event_type_t event_type)
1057 {
1058         struct perf_event *event;
1059
1060         raw_spin_lock(&ctx->lock);
1061         ctx->is_active = 0;
1062         if (likely(!ctx->nr_events))
1063                 goto out;
1064         update_context_time(ctx);
1065
1066         perf_disable();
1067         if (!ctx->nr_active)
1068                 goto out_enable;
1069
1070         if (event_type & EVENT_PINNED)
1071                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1072                         group_sched_out(event, cpuctx, ctx);
1073
1074         if (event_type & EVENT_FLEXIBLE)
1075                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1076                         group_sched_out(event, cpuctx, ctx);
1077
1078  out_enable:
1079         perf_enable();
1080  out:
1081         raw_spin_unlock(&ctx->lock);
1082 }
1083
1084 /*
1085  * Test whether two contexts are equivalent, i.e. whether they
1086  * have both been cloned from the same version of the same context
1087  * and they both have the same number of enabled events.
1088  * If the number of enabled events is the same, then the set
1089  * of enabled events should be the same, because these are both
1090  * inherited contexts, therefore we can't access individual events
1091  * in them directly with an fd; we can only enable/disable all
1092  * events via prctl, or enable/disable all events in a family
1093  * via ioctl, which will have the same effect on both contexts.
1094  */
1095 static int context_equiv(struct perf_event_context *ctx1,
1096                          struct perf_event_context *ctx2)
1097 {
1098         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1099                 && ctx1->parent_gen == ctx2->parent_gen
1100                 && !ctx1->pin_count && !ctx2->pin_count;
1101 }
1102
1103 static void __perf_event_sync_stat(struct perf_event *event,
1104                                      struct perf_event *next_event)
1105 {
1106         u64 value;
1107
1108         if (!event->attr.inherit_stat)
1109                 return;
1110
1111         /*
1112          * Update the event value, we cannot use perf_event_read()
1113          * because we're in the middle of a context switch and have IRQs
1114          * disabled, which upsets smp_call_function_single(), however
1115          * we know the event must be on the current CPU, therefore we
1116          * don't need to use it.
1117          */
1118         switch (event->state) {
1119         case PERF_EVENT_STATE_ACTIVE:
1120                 event->pmu->read(event);
1121                 /* fall-through */
1122
1123         case PERF_EVENT_STATE_INACTIVE:
1124                 update_event_times(event);
1125                 break;
1126
1127         default:
1128                 break;
1129         }
1130
1131         /*
1132          * In order to keep per-task stats reliable we need to flip the event
1133          * values when we flip the contexts.
1134          */
1135         value = atomic64_read(&next_event->count);
1136         value = atomic64_xchg(&event->count, value);
1137         atomic64_set(&next_event->count, value);
1138
1139         swap(event->total_time_enabled, next_event->total_time_enabled);
1140         swap(event->total_time_running, next_event->total_time_running);
1141
1142         /*
1143          * Since we swizzled the values, update the user visible data too.
1144          */
1145         perf_event_update_userpage(event);
1146         perf_event_update_userpage(next_event);
1147 }
1148
1149 #define list_next_entry(pos, member) \
1150         list_entry(pos->member.next, typeof(*pos), member)
1151
1152 static void perf_event_sync_stat(struct perf_event_context *ctx,
1153                                    struct perf_event_context *next_ctx)
1154 {
1155         struct perf_event *event, *next_event;
1156
1157         if (!ctx->nr_stat)
1158                 return;
1159
1160         update_context_time(ctx);
1161
1162         event = list_first_entry(&ctx->event_list,
1163                                    struct perf_event, event_entry);
1164
1165         next_event = list_first_entry(&next_ctx->event_list,
1166                                         struct perf_event, event_entry);
1167
1168         while (&event->event_entry != &ctx->event_list &&
1169                &next_event->event_entry != &next_ctx->event_list) {
1170
1171                 __perf_event_sync_stat(event, next_event);
1172
1173                 event = list_next_entry(event, event_entry);
1174                 next_event = list_next_entry(next_event, event_entry);
1175         }
1176 }
1177
1178 /*
1179  * Called from scheduler to remove the events of the current task,
1180  * with interrupts disabled.
1181  *
1182  * We stop each event and update the event value in event->count.
1183  *
1184  * This does not protect us against NMI, but disable()
1185  * sets the disabled bit in the control field of event _before_
1186  * accessing the event control register. If a NMI hits, then it will
1187  * not restart the event.
1188  */
1189 void perf_event_task_sched_out(struct task_struct *task,
1190                                  struct task_struct *next)
1191 {
1192         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1193         struct perf_event_context *ctx = task->perf_event_ctxp;
1194         struct perf_event_context *next_ctx;
1195         struct perf_event_context *parent;
1196         struct pt_regs *regs;
1197         int do_switch = 1;
1198
1199         regs = task_pt_regs(task);
1200         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1201
1202         if (likely(!ctx || !cpuctx->task_ctx))
1203                 return;
1204
1205         rcu_read_lock();
1206         parent = rcu_dereference(ctx->parent_ctx);
1207         next_ctx = next->perf_event_ctxp;
1208         if (parent && next_ctx &&
1209             rcu_dereference(next_ctx->parent_ctx) == parent) {
1210                 /*
1211                  * Looks like the two contexts are clones, so we might be
1212                  * able to optimize the context switch.  We lock both
1213                  * contexts and check that they are clones under the
1214                  * lock (including re-checking that neither has been
1215                  * uncloned in the meantime).  It doesn't matter which
1216                  * order we take the locks because no other cpu could
1217                  * be trying to lock both of these tasks.
1218                  */
1219                 raw_spin_lock(&ctx->lock);
1220                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1221                 if (context_equiv(ctx, next_ctx)) {
1222                         /*
1223                          * XXX do we need a memory barrier of sorts
1224                          * wrt to rcu_dereference() of perf_event_ctxp
1225                          */
1226                         task->perf_event_ctxp = next_ctx;
1227                         next->perf_event_ctxp = ctx;
1228                         ctx->task = next;
1229                         next_ctx->task = task;
1230                         do_switch = 0;
1231
1232                         perf_event_sync_stat(ctx, next_ctx);
1233                 }
1234                 raw_spin_unlock(&next_ctx->lock);
1235                 raw_spin_unlock(&ctx->lock);
1236         }
1237         rcu_read_unlock();
1238
1239         if (do_switch) {
1240                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1241                 cpuctx->task_ctx = NULL;
1242         }
1243 }
1244
1245 static void task_ctx_sched_out(struct perf_event_context *ctx,
1246                                enum event_type_t event_type)
1247 {
1248         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1249
1250         if (!cpuctx->task_ctx)
1251                 return;
1252
1253         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1254                 return;
1255
1256         ctx_sched_out(ctx, cpuctx, event_type);
1257         cpuctx->task_ctx = NULL;
1258 }
1259
1260 /*
1261  * Called with IRQs disabled
1262  */
1263 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1264 {
1265         task_ctx_sched_out(ctx, EVENT_ALL);
1266 }
1267
1268 /*
1269  * Called with IRQs disabled
1270  */
1271 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1272                               enum event_type_t event_type)
1273 {
1274         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1275 }
1276
1277 static void
1278 ctx_pinned_sched_in(struct perf_event_context *ctx,
1279                     struct perf_cpu_context *cpuctx)
1280 {
1281         struct perf_event *event;
1282
1283         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1284                 if (event->state <= PERF_EVENT_STATE_OFF)
1285                         continue;
1286                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1287                         continue;
1288
1289                 if (group_can_go_on(event, cpuctx, 1))
1290                         group_sched_in(event, cpuctx, ctx);
1291
1292                 /*
1293                  * If this pinned group hasn't been scheduled,
1294                  * put it in error state.
1295                  */
1296                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1297                         update_group_times(event);
1298                         event->state = PERF_EVENT_STATE_ERROR;
1299                 }
1300         }
1301 }
1302
1303 static void
1304 ctx_flexible_sched_in(struct perf_event_context *ctx,
1305                       struct perf_cpu_context *cpuctx)
1306 {
1307         struct perf_event *event;
1308         int can_add_hw = 1;
1309
1310         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1311                 /* Ignore events in OFF or ERROR state */
1312                 if (event->state <= PERF_EVENT_STATE_OFF)
1313                         continue;
1314                 /*
1315                  * Listen to the 'cpu' scheduling filter constraint
1316                  * of events:
1317                  */
1318                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1319                         continue;
1320
1321                 if (group_can_go_on(event, cpuctx, can_add_hw))
1322                         if (group_sched_in(event, cpuctx, ctx))
1323                                 can_add_hw = 0;
1324         }
1325 }
1326
1327 static void
1328 ctx_sched_in(struct perf_event_context *ctx,
1329              struct perf_cpu_context *cpuctx,
1330              enum event_type_t event_type)
1331 {
1332         raw_spin_lock(&ctx->lock);
1333         ctx->is_active = 1;
1334         if (likely(!ctx->nr_events))
1335                 goto out;
1336
1337         ctx->timestamp = perf_clock();
1338
1339         perf_disable();
1340
1341         /*
1342          * First go through the list and put on any pinned groups
1343          * in order to give them the best chance of going on.
1344          */
1345         if (event_type & EVENT_PINNED)
1346                 ctx_pinned_sched_in(ctx, cpuctx);
1347
1348         /* Then walk through the lower prio flexible groups */
1349         if (event_type & EVENT_FLEXIBLE)
1350                 ctx_flexible_sched_in(ctx, cpuctx);
1351
1352         perf_enable();
1353  out:
1354         raw_spin_unlock(&ctx->lock);
1355 }
1356
1357 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1358                              enum event_type_t event_type)
1359 {
1360         struct perf_event_context *ctx = &cpuctx->ctx;
1361
1362         ctx_sched_in(ctx, cpuctx, event_type);
1363 }
1364
1365 static void task_ctx_sched_in(struct task_struct *task,
1366                               enum event_type_t event_type)
1367 {
1368         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1369         struct perf_event_context *ctx = task->perf_event_ctxp;
1370
1371         if (likely(!ctx))
1372                 return;
1373         if (cpuctx->task_ctx == ctx)
1374                 return;
1375         ctx_sched_in(ctx, cpuctx, event_type);
1376         cpuctx->task_ctx = ctx;
1377 }
1378 /*
1379  * Called from scheduler to add the events of the current task
1380  * with interrupts disabled.
1381  *
1382  * We restore the event value and then enable it.
1383  *
1384  * This does not protect us against NMI, but enable()
1385  * sets the enabled bit in the control field of event _before_
1386  * accessing the event control register. If a NMI hits, then it will
1387  * keep the event running.
1388  */
1389 void perf_event_task_sched_in(struct task_struct *task)
1390 {
1391         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1392         struct perf_event_context *ctx = task->perf_event_ctxp;
1393
1394         if (likely(!ctx))
1395                 return;
1396
1397         if (cpuctx->task_ctx == ctx)
1398                 return;
1399
1400         /*
1401          * We want to keep the following priority order:
1402          * cpu pinned (that don't need to move), task pinned,
1403          * cpu flexible, task flexible.
1404          */
1405         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1406
1407         ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1408         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1409         ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1410
1411         cpuctx->task_ctx = ctx;
1412 }
1413
1414 #define MAX_INTERRUPTS (~0ULL)
1415
1416 static void perf_log_throttle(struct perf_event *event, int enable);
1417
1418 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1419 {
1420         u64 frequency = event->attr.sample_freq;
1421         u64 sec = NSEC_PER_SEC;
1422         u64 divisor, dividend;
1423
1424         int count_fls, nsec_fls, frequency_fls, sec_fls;
1425
1426         count_fls = fls64(count);
1427         nsec_fls = fls64(nsec);
1428         frequency_fls = fls64(frequency);
1429         sec_fls = 30;
1430
1431         /*
1432          * We got @count in @nsec, with a target of sample_freq HZ
1433          * the target period becomes:
1434          *
1435          *             @count * 10^9
1436          * period = -------------------
1437          *          @nsec * sample_freq
1438          *
1439          */
1440
1441         /*
1442          * Reduce accuracy by one bit such that @a and @b converge
1443          * to a similar magnitude.
1444          */
1445 #define REDUCE_FLS(a, b)                \
1446 do {                                    \
1447         if (a##_fls > b##_fls) {        \
1448                 a >>= 1;                \
1449                 a##_fls--;              \
1450         } else {                        \
1451                 b >>= 1;                \
1452                 b##_fls--;              \
1453         }                               \
1454 } while (0)
1455
1456         /*
1457          * Reduce accuracy until either term fits in a u64, then proceed with
1458          * the other, so that finally we can do a u64/u64 division.
1459          */
1460         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1461                 REDUCE_FLS(nsec, frequency);
1462                 REDUCE_FLS(sec, count);
1463         }
1464
1465         if (count_fls + sec_fls > 64) {
1466                 divisor = nsec * frequency;
1467
1468                 while (count_fls + sec_fls > 64) {
1469                         REDUCE_FLS(count, sec);
1470                         divisor >>= 1;
1471                 }
1472
1473                 dividend = count * sec;
1474         } else {
1475                 dividend = count * sec;
1476
1477                 while (nsec_fls + frequency_fls > 64) {
1478                         REDUCE_FLS(nsec, frequency);
1479                         dividend >>= 1;
1480                 }
1481
1482                 divisor = nsec * frequency;
1483         }
1484
1485         return div64_u64(dividend, divisor);
1486 }
1487
1488 static void perf_event_stop(struct perf_event *event)
1489 {
1490         if (!event->pmu->stop)
1491                 return event->pmu->disable(event);
1492
1493         return event->pmu->stop(event);
1494 }
1495
1496 static int perf_event_start(struct perf_event *event)
1497 {
1498         if (!event->pmu->start)
1499                 return event->pmu->enable(event);
1500
1501         return event->pmu->start(event);
1502 }
1503
1504 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1505 {
1506         struct hw_perf_event *hwc = &event->hw;
1507         u64 period, sample_period;
1508         s64 delta;
1509
1510         period = perf_calculate_period(event, nsec, count);
1511
1512         delta = (s64)(period - hwc->sample_period);
1513         delta = (delta + 7) / 8; /* low pass filter */
1514
1515         sample_period = hwc->sample_period + delta;
1516
1517         if (!sample_period)
1518                 sample_period = 1;
1519
1520         hwc->sample_period = sample_period;
1521
1522         if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1523                 perf_disable();
1524                 perf_event_stop(event);
1525                 atomic64_set(&hwc->period_left, 0);
1526                 perf_event_start(event);
1527                 perf_enable();
1528         }
1529 }
1530
1531 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1532 {
1533         struct perf_event *event;
1534         struct hw_perf_event *hwc;
1535         u64 interrupts, now;
1536         s64 delta;
1537
1538         raw_spin_lock(&ctx->lock);
1539         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1540                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1541                         continue;
1542
1543                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1544                         continue;
1545
1546                 hwc = &event->hw;
1547
1548                 interrupts = hwc->interrupts;
1549                 hwc->interrupts = 0;
1550
1551                 /*
1552                  * unthrottle events on the tick
1553                  */
1554                 if (interrupts == MAX_INTERRUPTS) {
1555                         perf_log_throttle(event, 1);
1556                         event->pmu->unthrottle(event);
1557                 }
1558
1559                 if (!event->attr.freq || !event->attr.sample_freq)
1560                         continue;
1561
1562                 event->pmu->read(event);
1563                 now = atomic64_read(&event->count);
1564                 delta = now - hwc->freq_count_stamp;
1565                 hwc->freq_count_stamp = now;
1566
1567                 if (delta > 0)
1568                         perf_adjust_period(event, TICK_NSEC, delta);
1569         }
1570         raw_spin_unlock(&ctx->lock);
1571 }
1572
1573 /*
1574  * Round-robin a context's events:
1575  */
1576 static void rotate_ctx(struct perf_event_context *ctx)
1577 {
1578         if (!ctx->nr_events)
1579                 return;
1580
1581         raw_spin_lock(&ctx->lock);
1582
1583         /* Rotate the first entry last of non-pinned groups */
1584         list_rotate_left(&ctx->flexible_groups);
1585
1586         raw_spin_unlock(&ctx->lock);
1587 }
1588
1589 void perf_event_task_tick(struct task_struct *curr)
1590 {
1591         struct perf_cpu_context *cpuctx;
1592         struct perf_event_context *ctx;
1593
1594         if (!atomic_read(&nr_events))
1595                 return;
1596
1597         cpuctx = &__get_cpu_var(perf_cpu_context);
1598         ctx = curr->perf_event_ctxp;
1599
1600         perf_disable();
1601
1602         perf_ctx_adjust_freq(&cpuctx->ctx);
1603         if (ctx)
1604                 perf_ctx_adjust_freq(ctx);
1605
1606         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1607         if (ctx)
1608                 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1609
1610         rotate_ctx(&cpuctx->ctx);
1611         if (ctx)
1612                 rotate_ctx(ctx);
1613
1614         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1615         if (ctx)
1616                 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1617
1618         perf_enable();
1619 }
1620
1621 static int event_enable_on_exec(struct perf_event *event,
1622                                 struct perf_event_context *ctx)
1623 {
1624         if (!event->attr.enable_on_exec)
1625                 return 0;
1626
1627         event->attr.enable_on_exec = 0;
1628         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1629                 return 0;
1630
1631         __perf_event_mark_enabled(event, ctx);
1632
1633         return 1;
1634 }
1635
1636 /*
1637  * Enable all of a task's events that have been marked enable-on-exec.
1638  * This expects task == current.
1639  */
1640 static void perf_event_enable_on_exec(struct task_struct *task)
1641 {
1642         struct perf_event_context *ctx;
1643         struct perf_event *event;
1644         unsigned long flags;
1645         int enabled = 0;
1646         int ret;
1647
1648         local_irq_save(flags);
1649         ctx = task->perf_event_ctxp;
1650         if (!ctx || !ctx->nr_events)
1651                 goto out;
1652
1653         __perf_event_task_sched_out(ctx);
1654
1655         raw_spin_lock(&ctx->lock);
1656
1657         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1658                 ret = event_enable_on_exec(event, ctx);
1659                 if (ret)
1660                         enabled = 1;
1661         }
1662
1663         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1664                 ret = event_enable_on_exec(event, ctx);
1665                 if (ret)
1666                         enabled = 1;
1667         }
1668
1669         /*
1670          * Unclone this context if we enabled any event.
1671          */
1672         if (enabled)
1673                 unclone_ctx(ctx);
1674
1675         raw_spin_unlock(&ctx->lock);
1676
1677         perf_event_task_sched_in(task);
1678  out:
1679         local_irq_restore(flags);
1680 }
1681
1682 /*
1683  * Cross CPU call to read the hardware event
1684  */
1685 static void __perf_event_read(void *info)
1686 {
1687         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1688         struct perf_event *event = info;
1689         struct perf_event_context *ctx = event->ctx;
1690
1691         /*
1692          * If this is a task context, we need to check whether it is
1693          * the current task context of this cpu.  If not it has been
1694          * scheduled out before the smp call arrived.  In that case
1695          * event->count would have been updated to a recent sample
1696          * when the event was scheduled out.
1697          */
1698         if (ctx->task && cpuctx->task_ctx != ctx)
1699                 return;
1700
1701         raw_spin_lock(&ctx->lock);
1702         update_context_time(ctx);
1703         update_event_times(event);
1704         raw_spin_unlock(&ctx->lock);
1705
1706         event->pmu->read(event);
1707 }
1708
1709 static u64 perf_event_read(struct perf_event *event)
1710 {
1711         /*
1712          * If event is enabled and currently active on a CPU, update the
1713          * value in the event structure:
1714          */
1715         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1716                 smp_call_function_single(event->oncpu,
1717                                          __perf_event_read, event, 1);
1718         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1719                 struct perf_event_context *ctx = event->ctx;
1720                 unsigned long flags;
1721
1722                 raw_spin_lock_irqsave(&ctx->lock, flags);
1723                 update_context_time(ctx);
1724                 update_event_times(event);
1725                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1726         }
1727
1728         return atomic64_read(&event->count);
1729 }
1730
1731 /*
1732  * Initialize the perf_event context in a task_struct:
1733  */
1734 static void
1735 __perf_event_init_context(struct perf_event_context *ctx,
1736                             struct task_struct *task)
1737 {
1738         raw_spin_lock_init(&ctx->lock);
1739         mutex_init(&ctx->mutex);
1740         INIT_LIST_HEAD(&ctx->pinned_groups);
1741         INIT_LIST_HEAD(&ctx->flexible_groups);
1742         INIT_LIST_HEAD(&ctx->event_list);
1743         atomic_set(&ctx->refcount, 1);
1744         ctx->task = task;
1745 }
1746
1747 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1748 {
1749         struct perf_event_context *ctx;
1750         struct perf_cpu_context *cpuctx;
1751         struct task_struct *task;
1752         unsigned long flags;
1753         int err;
1754
1755         if (pid == -1 && cpu != -1) {
1756                 /* Must be root to operate on a CPU event: */
1757                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1758                         return ERR_PTR(-EACCES);
1759
1760                 if (cpu < 0 || cpu >= nr_cpumask_bits)
1761                         return ERR_PTR(-EINVAL);
1762
1763                 /*
1764                  * We could be clever and allow to attach a event to an
1765                  * offline CPU and activate it when the CPU comes up, but
1766                  * that's for later.
1767                  */
1768                 if (!cpu_online(cpu))
1769                         return ERR_PTR(-ENODEV);
1770
1771                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1772                 ctx = &cpuctx->ctx;
1773                 get_ctx(ctx);
1774
1775                 return ctx;
1776         }
1777
1778         rcu_read_lock();
1779         if (!pid)
1780                 task = current;
1781         else
1782                 task = find_task_by_vpid(pid);
1783         if (task)
1784                 get_task_struct(task);
1785         rcu_read_unlock();
1786
1787         if (!task)
1788                 return ERR_PTR(-ESRCH);
1789
1790         /*
1791          * Can't attach events to a dying task.
1792          */
1793         err = -ESRCH;
1794         if (task->flags & PF_EXITING)
1795                 goto errout;
1796
1797         /* Reuse ptrace permission checks for now. */
1798         err = -EACCES;
1799         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1800                 goto errout;
1801
1802  retry:
1803         ctx = perf_lock_task_context(task, &flags);
1804         if (ctx) {
1805                 unclone_ctx(ctx);
1806                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1807         }
1808
1809         if (!ctx) {
1810                 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1811                 err = -ENOMEM;
1812                 if (!ctx)
1813                         goto errout;
1814                 __perf_event_init_context(ctx, task);
1815                 get_ctx(ctx);
1816                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1817                         /*
1818                          * We raced with some other task; use
1819                          * the context they set.
1820                          */
1821                         kfree(ctx);
1822                         goto retry;
1823                 }
1824                 get_task_struct(task);
1825         }
1826
1827         put_task_struct(task);
1828         return ctx;
1829
1830  errout:
1831         put_task_struct(task);
1832         return ERR_PTR(err);
1833 }
1834
1835 static void perf_event_free_filter(struct perf_event *event);
1836
1837 static void free_event_rcu(struct rcu_head *head)
1838 {
1839         struct perf_event *event;
1840
1841         event = container_of(head, struct perf_event, rcu_head);
1842         if (event->ns)
1843                 put_pid_ns(event->ns);
1844         perf_event_free_filter(event);
1845         kfree(event);
1846 }
1847
1848 static void perf_pending_sync(struct perf_event *event);
1849
1850 static void free_event(struct perf_event *event)
1851 {
1852         perf_pending_sync(event);
1853
1854         if (!event->parent) {
1855                 atomic_dec(&nr_events);
1856                 if (event->attr.mmap)
1857                         atomic_dec(&nr_mmap_events);
1858                 if (event->attr.comm)
1859                         atomic_dec(&nr_comm_events);
1860                 if (event->attr.task)
1861                         atomic_dec(&nr_task_events);
1862         }
1863
1864         if (event->output) {
1865                 fput(event->output->filp);
1866                 event->output = NULL;
1867         }
1868
1869         if (event->destroy)
1870                 event->destroy(event);
1871
1872         put_ctx(event->ctx);
1873         call_rcu(&event->rcu_head, free_event_rcu);
1874 }
1875
1876 int perf_event_release_kernel(struct perf_event *event)
1877 {
1878         struct perf_event_context *ctx = event->ctx;
1879
1880         WARN_ON_ONCE(ctx->parent_ctx);
1881         mutex_lock(&ctx->mutex);
1882         perf_event_remove_from_context(event);
1883         mutex_unlock(&ctx->mutex);
1884
1885         mutex_lock(&event->owner->perf_event_mutex);
1886         list_del_init(&event->owner_entry);
1887         mutex_unlock(&event->owner->perf_event_mutex);
1888         put_task_struct(event->owner);
1889
1890         free_event(event);
1891
1892         return 0;
1893 }
1894 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1895
1896 /*
1897  * Called when the last reference to the file is gone.
1898  */
1899 static int perf_release(struct inode *inode, struct file *file)
1900 {
1901         struct perf_event *event = file->private_data;
1902
1903         file->private_data = NULL;
1904
1905         return perf_event_release_kernel(event);
1906 }
1907
1908 static int perf_event_read_size(struct perf_event *event)
1909 {
1910         int entry = sizeof(u64); /* value */
1911         int size = 0;
1912         int nr = 1;
1913
1914         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1915                 size += sizeof(u64);
1916
1917         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1918                 size += sizeof(u64);
1919
1920         if (event->attr.read_format & PERF_FORMAT_ID)
1921                 entry += sizeof(u64);
1922
1923         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1924                 nr += event->group_leader->nr_siblings;
1925                 size += sizeof(u64);
1926         }
1927
1928         size += entry * nr;
1929
1930         return size;
1931 }
1932
1933 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1934 {
1935         struct perf_event *child;
1936         u64 total = 0;
1937
1938         *enabled = 0;
1939         *running = 0;
1940
1941         mutex_lock(&event->child_mutex);
1942         total += perf_event_read(event);
1943         *enabled += event->total_time_enabled +
1944                         atomic64_read(&event->child_total_time_enabled);
1945         *running += event->total_time_running +
1946                         atomic64_read(&event->child_total_time_running);
1947
1948         list_for_each_entry(child, &event->child_list, child_list) {
1949                 total += perf_event_read(child);
1950                 *enabled += child->total_time_enabled;
1951                 *running += child->total_time_running;
1952         }
1953         mutex_unlock(&event->child_mutex);
1954
1955         return total;
1956 }
1957 EXPORT_SYMBOL_GPL(perf_event_read_value);
1958
1959 static int perf_event_read_group(struct perf_event *event,
1960                                    u64 read_format, char __user *buf)
1961 {
1962         struct perf_event *leader = event->group_leader, *sub;
1963         int n = 0, size = 0, ret = -EFAULT;
1964         struct perf_event_context *ctx = leader->ctx;
1965         u64 values[5];
1966         u64 count, enabled, running;
1967
1968         mutex_lock(&ctx->mutex);
1969         count = perf_event_read_value(leader, &enabled, &running);
1970
1971         values[n++] = 1 + leader->nr_siblings;
1972         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1973                 values[n++] = enabled;
1974         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1975                 values[n++] = running;
1976         values[n++] = count;
1977         if (read_format & PERF_FORMAT_ID)
1978                 values[n++] = primary_event_id(leader);
1979
1980         size = n * sizeof(u64);
1981
1982         if (copy_to_user(buf, values, size))
1983                 goto unlock;
1984
1985         ret = size;
1986
1987         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1988                 n = 0;
1989
1990                 values[n++] = perf_event_read_value(sub, &enabled, &running);
1991                 if (read_format & PERF_FORMAT_ID)
1992                         values[n++] = primary_event_id(sub);
1993
1994                 size = n * sizeof(u64);
1995
1996                 if (copy_to_user(buf + ret, values, size)) {
1997                         ret = -EFAULT;
1998                         goto unlock;
1999                 }
2000
2001                 ret += size;
2002         }
2003 unlock:
2004         mutex_unlock(&ctx->mutex);
2005
2006         return ret;
2007 }
2008
2009 static int perf_event_read_one(struct perf_event *event,
2010                                  u64 read_format, char __user *buf)
2011 {
2012         u64 enabled, running;
2013         u64 values[4];
2014         int n = 0;
2015
2016         values[n++] = perf_event_read_value(event, &enabled, &running);
2017         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2018                 values[n++] = enabled;
2019         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2020                 values[n++] = running;
2021         if (read_format & PERF_FORMAT_ID)
2022                 values[n++] = primary_event_id(event);
2023
2024         if (copy_to_user(buf, values, n * sizeof(u64)))
2025                 return -EFAULT;
2026
2027         return n * sizeof(u64);
2028 }
2029
2030 /*
2031  * Read the performance event - simple non blocking version for now
2032  */
2033 static ssize_t
2034 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2035 {
2036         u64 read_format = event->attr.read_format;
2037         int ret;
2038
2039         /*
2040          * Return end-of-file for a read on a event that is in
2041          * error state (i.e. because it was pinned but it couldn't be
2042          * scheduled on to the CPU at some point).
2043          */
2044         if (event->state == PERF_EVENT_STATE_ERROR)
2045                 return 0;
2046
2047         if (count < perf_event_read_size(event))
2048                 return -ENOSPC;
2049
2050         WARN_ON_ONCE(event->ctx->parent_ctx);
2051         if (read_format & PERF_FORMAT_GROUP)
2052                 ret = perf_event_read_group(event, read_format, buf);
2053         else
2054                 ret = perf_event_read_one(event, read_format, buf);
2055
2056         return ret;
2057 }
2058
2059 static ssize_t
2060 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2061 {
2062         struct perf_event *event = file->private_data;
2063
2064         return perf_read_hw(event, buf, count);
2065 }
2066
2067 static unsigned int perf_poll(struct file *file, poll_table *wait)
2068 {
2069         struct perf_event *event = file->private_data;
2070         struct perf_mmap_data *data;
2071         unsigned int events = POLL_HUP;
2072
2073         rcu_read_lock();
2074         data = rcu_dereference(event->data);
2075         if (data)
2076                 events = atomic_xchg(&data->poll, 0);
2077         rcu_read_unlock();
2078
2079         poll_wait(file, &event->waitq, wait);
2080
2081         return events;
2082 }
2083
2084 static void perf_event_reset(struct perf_event *event)
2085 {
2086         (void)perf_event_read(event);
2087         atomic64_set(&event->count, 0);
2088         perf_event_update_userpage(event);
2089 }
2090
2091 /*
2092  * Holding the top-level event's child_mutex means that any
2093  * descendant process that has inherited this event will block
2094  * in sync_child_event if it goes to exit, thus satisfying the
2095  * task existence requirements of perf_event_enable/disable.
2096  */
2097 static void perf_event_for_each_child(struct perf_event *event,
2098                                         void (*func)(struct perf_event *))
2099 {
2100         struct perf_event *child;
2101
2102         WARN_ON_ONCE(event->ctx->parent_ctx);
2103         mutex_lock(&event->child_mutex);
2104         func(event);
2105         list_for_each_entry(child, &event->child_list, child_list)
2106                 func(child);
2107         mutex_unlock(&event->child_mutex);
2108 }
2109
2110 static void perf_event_for_each(struct perf_event *event,
2111                                   void (*func)(struct perf_event *))
2112 {
2113         struct perf_event_context *ctx = event->ctx;
2114         struct perf_event *sibling;
2115
2116         WARN_ON_ONCE(ctx->parent_ctx);
2117         mutex_lock(&ctx->mutex);
2118         event = event->group_leader;
2119
2120         perf_event_for_each_child(event, func);
2121         func(event);
2122         list_for_each_entry(sibling, &event->sibling_list, group_entry)
2123                 perf_event_for_each_child(event, func);
2124         mutex_unlock(&ctx->mutex);
2125 }
2126
2127 static int perf_event_period(struct perf_event *event, u64 __user *arg)
2128 {
2129         struct perf_event_context *ctx = event->ctx;
2130         unsigned long size;
2131         int ret = 0;
2132         u64 value;
2133
2134         if (!event->attr.sample_period)
2135                 return -EINVAL;
2136
2137         size = copy_from_user(&value, arg, sizeof(value));
2138         if (size != sizeof(value))
2139                 return -EFAULT;
2140
2141         if (!value)
2142                 return -EINVAL;
2143
2144         raw_spin_lock_irq(&ctx->lock);
2145         if (event->attr.freq) {
2146                 if (value > sysctl_perf_event_sample_rate) {
2147                         ret = -EINVAL;
2148                         goto unlock;
2149                 }
2150
2151                 event->attr.sample_freq = value;
2152         } else {
2153                 event->attr.sample_period = value;
2154                 event->hw.sample_period = value;
2155         }
2156 unlock:
2157         raw_spin_unlock_irq(&ctx->lock);
2158
2159         return ret;
2160 }
2161
2162 static int perf_event_set_output(struct perf_event *event, int output_fd);
2163 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2164
2165 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2166 {
2167         struct perf_event *event = file->private_data;
2168         void (*func)(struct perf_event *);
2169         u32 flags = arg;
2170
2171         switch (cmd) {
2172         case PERF_EVENT_IOC_ENABLE:
2173                 func = perf_event_enable;
2174                 break;
2175         case PERF_EVENT_IOC_DISABLE:
2176                 func = perf_event_disable;
2177                 break;
2178         case PERF_EVENT_IOC_RESET:
2179                 func = perf_event_reset;
2180                 break;
2181
2182         case PERF_EVENT_IOC_REFRESH:
2183                 return perf_event_refresh(event, arg);
2184
2185         case PERF_EVENT_IOC_PERIOD:
2186                 return perf_event_period(event, (u64 __user *)arg);
2187
2188         case PERF_EVENT_IOC_SET_OUTPUT:
2189                 return perf_event_set_output(event, arg);
2190
2191         case PERF_EVENT_IOC_SET_FILTER:
2192                 return perf_event_set_filter(event, (void __user *)arg);
2193
2194         default:
2195                 return -ENOTTY;
2196         }
2197
2198         if (flags & PERF_IOC_FLAG_GROUP)
2199                 perf_event_for_each(event, func);
2200         else
2201                 perf_event_for_each_child(event, func);
2202
2203         return 0;
2204 }
2205
2206 int perf_event_task_enable(void)
2207 {
2208         struct perf_event *event;
2209
2210         mutex_lock(&current->perf_event_mutex);
2211         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2212                 perf_event_for_each_child(event, perf_event_enable);
2213         mutex_unlock(&current->perf_event_mutex);
2214
2215         return 0;
2216 }
2217
2218 int perf_event_task_disable(void)
2219 {
2220         struct perf_event *event;
2221
2222         mutex_lock(&current->perf_event_mutex);
2223         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2224                 perf_event_for_each_child(event, perf_event_disable);
2225         mutex_unlock(&current->perf_event_mutex);
2226
2227         return 0;
2228 }
2229
2230 #ifndef PERF_EVENT_INDEX_OFFSET
2231 # define PERF_EVENT_INDEX_OFFSET 0
2232 #endif
2233
2234 static int perf_event_index(struct perf_event *event)
2235 {
2236         if (event->state != PERF_EVENT_STATE_ACTIVE)
2237                 return 0;
2238
2239         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2240 }
2241
2242 /*
2243  * Callers need to ensure there can be no nesting of this function, otherwise
2244  * the seqlock logic goes bad. We can not serialize this because the arch
2245  * code calls this from NMI context.
2246  */
2247 void perf_event_update_userpage(struct perf_event *event)
2248 {
2249         struct perf_event_mmap_page *userpg;
2250         struct perf_mmap_data *data;
2251
2252         rcu_read_lock();
2253         data = rcu_dereference(event->data);
2254         if (!data)
2255                 goto unlock;
2256
2257         userpg = data->user_page;
2258
2259         /*
2260          * Disable preemption so as to not let the corresponding user-space
2261          * spin too long if we get preempted.
2262          */
2263         preempt_disable();
2264         ++userpg->lock;
2265         barrier();
2266         userpg->index = perf_event_index(event);
2267         userpg->offset = atomic64_read(&event->count);
2268         if (event->state == PERF_EVENT_STATE_ACTIVE)
2269                 userpg->offset -= atomic64_read(&event->hw.prev_count);
2270
2271         userpg->time_enabled = event->total_time_enabled +
2272                         atomic64_read(&event->child_total_time_enabled);
2273
2274         userpg->time_running = event->total_time_running +
2275                         atomic64_read(&event->child_total_time_running);
2276
2277         barrier();
2278         ++userpg->lock;
2279         preempt_enable();
2280 unlock:
2281         rcu_read_unlock();
2282 }
2283
2284 static unsigned long perf_data_size(struct perf_mmap_data *data)
2285 {
2286         return data->nr_pages << (PAGE_SHIFT + data->data_order);
2287 }
2288
2289 #ifndef CONFIG_PERF_USE_VMALLOC
2290
2291 /*
2292  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2293  */
2294
2295 static struct page *
2296 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2297 {
2298         if (pgoff > data->nr_pages)
2299                 return NULL;
2300
2301         if (pgoff == 0)
2302                 return virt_to_page(data->user_page);
2303
2304         return virt_to_page(data->data_pages[pgoff - 1]);
2305 }
2306
2307 static struct perf_mmap_data *
2308 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2309 {
2310         struct perf_mmap_data *data;
2311         unsigned long size;
2312         int i;
2313
2314         WARN_ON(atomic_read(&event->mmap_count));
2315
2316         size = sizeof(struct perf_mmap_data);
2317         size += nr_pages * sizeof(void *);
2318
2319         data = kzalloc(size, GFP_KERNEL);
2320         if (!data)
2321                 goto fail;
2322
2323         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2324         if (!data->user_page)
2325                 goto fail_user_page;
2326
2327         for (i = 0; i < nr_pages; i++) {
2328                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2329                 if (!data->data_pages[i])
2330                         goto fail_data_pages;
2331         }
2332
2333         data->data_order = 0;
2334         data->nr_pages = nr_pages;
2335
2336         return data;
2337
2338 fail_data_pages:
2339         for (i--; i >= 0; i--)
2340                 free_page((unsigned long)data->data_pages[i]);
2341
2342         free_page((unsigned long)data->user_page);
2343
2344 fail_user_page:
2345         kfree(data);
2346
2347 fail:
2348         return NULL;
2349 }
2350
2351 static void perf_mmap_free_page(unsigned long addr)
2352 {
2353         struct page *page = virt_to_page((void *)addr);
2354
2355         page->mapping = NULL;
2356         __free_page(page);
2357 }
2358
2359 static void perf_mmap_data_free(struct perf_mmap_data *data)
2360 {
2361         int i;
2362
2363         perf_mmap_free_page((unsigned long)data->user_page);
2364         for (i = 0; i < data->nr_pages; i++)
2365                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2366         kfree(data);
2367 }
2368
2369 #else
2370
2371 /*
2372  * Back perf_mmap() with vmalloc memory.
2373  *
2374  * Required for architectures that have d-cache aliasing issues.
2375  */
2376
2377 static struct page *
2378 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2379 {
2380         if (pgoff > (1UL << data->data_order))
2381                 return NULL;
2382
2383         return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2384 }
2385
2386 static void perf_mmap_unmark_page(void *addr)
2387 {
2388         struct page *page = vmalloc_to_page(addr);
2389
2390         page->mapping = NULL;
2391 }
2392
2393 static void perf_mmap_data_free_work(struct work_struct *work)
2394 {
2395         struct perf_mmap_data *data;
2396         void *base;
2397         int i, nr;
2398
2399         data = container_of(work, struct perf_mmap_data, work);
2400         nr = 1 << data->data_order;
2401
2402         base = data->user_page;
2403         for (i = 0; i < nr + 1; i++)
2404                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2405
2406         vfree(base);
2407         kfree(data);
2408 }
2409
2410 static void perf_mmap_data_free(struct perf_mmap_data *data)
2411 {
2412         schedule_work(&data->work);
2413 }
2414
2415 static struct perf_mmap_data *
2416 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2417 {
2418         struct perf_mmap_data *data;
2419         unsigned long size;
2420         void *all_buf;
2421
2422         WARN_ON(atomic_read(&event->mmap_count));
2423
2424         size = sizeof(struct perf_mmap_data);
2425         size += sizeof(void *);
2426
2427         data = kzalloc(size, GFP_KERNEL);
2428         if (!data)
2429                 goto fail;
2430
2431         INIT_WORK(&data->work, perf_mmap_data_free_work);
2432
2433         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2434         if (!all_buf)
2435                 goto fail_all_buf;
2436
2437         data->user_page = all_buf;
2438         data->data_pages[0] = all_buf + PAGE_SIZE;
2439         data->data_order = ilog2(nr_pages);
2440         data->nr_pages = 1;
2441
2442         return data;
2443
2444 fail_all_buf:
2445         kfree(data);
2446
2447 fail:
2448         return NULL;
2449 }
2450
2451 #endif
2452
2453 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2454 {
2455         struct perf_event *event = vma->vm_file->private_data;
2456         struct perf_mmap_data *data;
2457         int ret = VM_FAULT_SIGBUS;
2458
2459         if (vmf->flags & FAULT_FLAG_MKWRITE) {
2460                 if (vmf->pgoff == 0)
2461                         ret = 0;
2462                 return ret;
2463         }
2464
2465         rcu_read_lock();
2466         data = rcu_dereference(event->data);
2467         if (!data)
2468                 goto unlock;
2469
2470         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2471                 goto unlock;
2472
2473         vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2474         if (!vmf->page)
2475                 goto unlock;
2476
2477         get_page(vmf->page);
2478         vmf->page->mapping = vma->vm_file->f_mapping;
2479         vmf->page->index   = vmf->pgoff;
2480
2481         ret = 0;
2482 unlock:
2483         rcu_read_unlock();
2484
2485         return ret;
2486 }
2487
2488 static void
2489 perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2490 {
2491         long max_size = perf_data_size(data);
2492
2493         atomic_set(&data->lock, -1);
2494
2495         if (event->attr.watermark) {
2496                 data->watermark = min_t(long, max_size,
2497                                         event->attr.wakeup_watermark);
2498         }
2499
2500         if (!data->watermark)
2501                 data->watermark = max_size / 2;
2502
2503
2504         rcu_assign_pointer(event->data, data);
2505 }
2506
2507 static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2508 {
2509         struct perf_mmap_data *data;
2510
2511         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2512         perf_mmap_data_free(data);
2513 }
2514
2515 static void perf_mmap_data_release(struct perf_event *event)
2516 {
2517         struct perf_mmap_data *data = event->data;
2518
2519         WARN_ON(atomic_read(&event->mmap_count));
2520
2521         rcu_assign_pointer(event->data, NULL);
2522         call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2523 }
2524
2525 static void perf_mmap_open(struct vm_area_struct *vma)
2526 {
2527         struct perf_event *event = vma->vm_file->private_data;
2528
2529         atomic_inc(&event->mmap_count);
2530 }
2531
2532 static void perf_mmap_close(struct vm_area_struct *vma)
2533 {
2534         struct perf_event *event = vma->vm_file->private_data;
2535
2536         WARN_ON_ONCE(event->ctx->parent_ctx);
2537         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2538                 unsigned long size = perf_data_size(event->data);
2539                 struct user_struct *user = current_user();
2540
2541                 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2542                 vma->vm_mm->locked_vm -= event->data->nr_locked;
2543                 perf_mmap_data_release(event);
2544                 mutex_unlock(&event->mmap_mutex);
2545         }
2546 }
2547
2548 static const struct vm_operations_struct perf_mmap_vmops = {
2549         .open           = perf_mmap_open,
2550         .close          = perf_mmap_close,
2551         .fault          = perf_mmap_fault,
2552         .page_mkwrite   = perf_mmap_fault,
2553 };
2554
2555 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2556 {
2557         struct perf_event *event = file->private_data;
2558         unsigned long user_locked, user_lock_limit;
2559         struct user_struct *user = current_user();
2560         unsigned long locked, lock_limit;
2561         struct perf_mmap_data *data;
2562         unsigned long vma_size;
2563         unsigned long nr_pages;
2564         long user_extra, extra;
2565         int ret = 0;
2566
2567         if (!(vma->vm_flags & VM_SHARED))
2568                 return -EINVAL;
2569
2570         vma_size = vma->vm_end - vma->vm_start;
2571         nr_pages = (vma_size / PAGE_SIZE) - 1;
2572
2573         /*
2574          * If we have data pages ensure they're a power-of-two number, so we
2575          * can do bitmasks instead of modulo.
2576          */
2577         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2578                 return -EINVAL;
2579
2580         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2581                 return -EINVAL;
2582
2583         if (vma->vm_pgoff != 0)
2584                 return -EINVAL;
2585
2586         WARN_ON_ONCE(event->ctx->parent_ctx);
2587         mutex_lock(&event->mmap_mutex);
2588         if (event->output) {
2589                 ret = -EINVAL;
2590                 goto unlock;
2591         }
2592
2593         if (atomic_inc_not_zero(&event->mmap_count)) {
2594                 if (nr_pages != event->data->nr_pages)
2595                         ret = -EINVAL;
2596                 goto unlock;
2597         }
2598
2599         user_extra = nr_pages + 1;
2600         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2601
2602         /*
2603          * Increase the limit linearly with more CPUs:
2604          */
2605         user_lock_limit *= num_online_cpus();
2606
2607         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2608
2609         extra = 0;
2610         if (user_locked > user_lock_limit)
2611                 extra = user_locked - user_lock_limit;
2612
2613         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2614         lock_limit >>= PAGE_SHIFT;
2615         locked = vma->vm_mm->locked_vm + extra;
2616
2617         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2618                 !capable(CAP_IPC_LOCK)) {
2619                 ret = -EPERM;
2620                 goto unlock;
2621         }
2622
2623         WARN_ON(event->data);
2624
2625         data = perf_mmap_data_alloc(event, nr_pages);
2626         ret = -ENOMEM;
2627         if (!data)
2628                 goto unlock;
2629
2630         ret = 0;
2631         perf_mmap_data_init(event, data);
2632
2633         atomic_set(&event->mmap_count, 1);
2634         atomic_long_add(user_extra, &user->locked_vm);
2635         vma->vm_mm->locked_vm += extra;
2636         event->data->nr_locked = extra;
2637         if (vma->vm_flags & VM_WRITE)
2638                 event->data->writable = 1;
2639
2640 unlock:
2641         mutex_unlock(&event->mmap_mutex);
2642
2643         vma->vm_flags |= VM_RESERVED;
2644         vma->vm_ops = &perf_mmap_vmops;
2645
2646         return ret;
2647 }
2648
2649 static int perf_fasync(int fd, struct file *filp, int on)
2650 {
2651         struct inode *inode = filp->f_path.dentry->d_inode;
2652         struct perf_event *event = filp->private_data;
2653         int retval;
2654
2655         mutex_lock(&inode->i_mutex);
2656         retval = fasync_helper(fd, filp, on, &event->fasync);
2657         mutex_unlock(&inode->i_mutex);
2658
2659         if (retval < 0)
2660                 return retval;
2661
2662         return 0;
2663 }
2664
2665 static const struct file_operations perf_fops = {
2666         .release                = perf_release,
2667         .read                   = perf_read,
2668         .poll                   = perf_poll,
2669         .unlocked_ioctl         = perf_ioctl,
2670         .compat_ioctl           = perf_ioctl,
2671         .mmap                   = perf_mmap,
2672         .fasync                 = perf_fasync,
2673 };
2674
2675 /*
2676  * Perf event wakeup
2677  *
2678  * If there's data, ensure we set the poll() state and publish everything
2679  * to user-space before waking everybody up.
2680  */
2681
2682 void perf_event_wakeup(struct perf_event *event)
2683 {
2684         wake_up_all(&event->waitq);
2685
2686         if (event->pending_kill) {
2687                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2688                 event->pending_kill = 0;
2689         }
2690 }
2691
2692 /*
2693  * Pending wakeups
2694  *
2695  * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2696  *
2697  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2698  * single linked list and use cmpxchg() to add entries lockless.
2699  */
2700
2701 static void perf_pending_event(struct perf_pending_entry *entry)
2702 {
2703         struct perf_event *event = container_of(entry,
2704                         struct perf_event, pending);
2705
2706         if (event->pending_disable) {
2707                 event->pending_disable = 0;
2708                 __perf_event_disable(event);
2709         }
2710
2711         if (event->pending_wakeup) {
2712                 event->pending_wakeup = 0;
2713                 perf_event_wakeup(event);
2714         }
2715 }
2716
2717 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2718
2719 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2720         PENDING_TAIL,
2721 };
2722
2723 static void perf_pending_queue(struct perf_pending_entry *entry,
2724                                void (*func)(struct perf_pending_entry *))
2725 {
2726         struct perf_pending_entry **head;
2727
2728         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2729                 return;
2730
2731         entry->func = func;
2732
2733         head = &get_cpu_var(perf_pending_head);
2734
2735         do {
2736                 entry->next = *head;
2737         } while (cmpxchg(head, entry->next, entry) != entry->next);
2738
2739         set_perf_event_pending();
2740
2741         put_cpu_var(perf_pending_head);
2742 }
2743
2744 static int __perf_pending_run(void)
2745 {
2746         struct perf_pending_entry *list;
2747         int nr = 0;
2748
2749         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2750         while (list != PENDING_TAIL) {
2751                 void (*func)(struct perf_pending_entry *);
2752                 struct perf_pending_entry *entry = list;
2753
2754                 list = list->next;
2755
2756                 func = entry->func;
2757                 entry->next = NULL;
2758                 /*
2759                  * Ensure we observe the unqueue before we issue the wakeup,
2760                  * so that we won't be waiting forever.
2761                  * -- see perf_not_pending().
2762                  */
2763                 smp_wmb();
2764
2765                 func(entry);
2766                 nr++;
2767         }
2768
2769         return nr;
2770 }
2771
2772 static inline int perf_not_pending(struct perf_event *event)
2773 {
2774         /*
2775          * If we flush on whatever cpu we run, there is a chance we don't
2776          * need to wait.
2777          */
2778         get_cpu();
2779         __perf_pending_run();
2780         put_cpu();
2781
2782         /*
2783          * Ensure we see the proper queue state before going to sleep
2784          * so that we do not miss the wakeup. -- see perf_pending_handle()
2785          */
2786         smp_rmb();
2787         return event->pending.next == NULL;
2788 }
2789
2790 static void perf_pending_sync(struct perf_event *event)
2791 {
2792         wait_event(event->waitq, perf_not_pending(event));
2793 }
2794
2795 void perf_event_do_pending(void)
2796 {
2797         __perf_pending_run();
2798 }
2799
2800 /*
2801  * Callchain support -- arch specific
2802  */
2803
2804 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2805 {
2806         return NULL;
2807 }
2808
2809 /*
2810  * Output
2811  */
2812 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2813                               unsigned long offset, unsigned long head)
2814 {
2815         unsigned long mask;
2816
2817         if (!data->writable)
2818                 return true;
2819
2820         mask = perf_data_size(data) - 1;
2821
2822         offset = (offset - tail) & mask;
2823         head   = (head   - tail) & mask;
2824
2825         if ((int)(head - offset) < 0)
2826                 return false;
2827
2828         return true;
2829 }
2830
2831 static void perf_output_wakeup(struct perf_output_handle *handle)
2832 {
2833         atomic_set(&handle->data->poll, POLL_IN);
2834
2835         if (handle->nmi) {
2836                 handle->event->pending_wakeup = 1;
2837                 perf_pending_queue(&handle->event->pending,
2838                                    perf_pending_event);
2839         } else
2840                 perf_event_wakeup(handle->event);
2841 }
2842
2843 /*
2844  * Curious locking construct.
2845  *
2846  * We need to ensure a later event_id doesn't publish a head when a former
2847  * event_id isn't done writing. However since we need to deal with NMIs we
2848  * cannot fully serialize things.
2849  *
2850  * What we do is serialize between CPUs so we only have to deal with NMI
2851  * nesting on a single CPU.
2852  *
2853  * We only publish the head (and generate a wakeup) when the outer-most
2854  * event_id completes.
2855  */
2856 static void perf_output_lock(struct perf_output_handle *handle)
2857 {
2858         struct perf_mmap_data *data = handle->data;
2859         int cur, cpu = get_cpu();
2860
2861         handle->locked = 0;
2862
2863         for (;;) {
2864                 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2865                 if (cur == -1) {
2866                         handle->locked = 1;
2867                         break;
2868                 }
2869                 if (cur == cpu)
2870                         break;
2871
2872                 cpu_relax();
2873         }
2874 }
2875
2876 static void perf_output_unlock(struct perf_output_handle *handle)
2877 {
2878         struct perf_mmap_data *data = handle->data;
2879         unsigned long head;
2880         int cpu;
2881
2882         data->done_head = data->head;
2883
2884         if (!handle->locked)
2885                 goto out;
2886
2887 again:
2888         /*
2889          * The xchg implies a full barrier that ensures all writes are done
2890          * before we publish the new head, matched by a rmb() in userspace when
2891          * reading this position.
2892          */
2893         while ((head = atomic_long_xchg(&data->done_head, 0)))
2894                 data->user_page->data_head = head;
2895
2896         /*
2897          * NMI can happen here, which means we can miss a done_head update.
2898          */
2899
2900         cpu = atomic_xchg(&data->lock, -1);
2901         WARN_ON_ONCE(cpu != smp_processor_id());
2902
2903         /*
2904          * Therefore we have to validate we did not indeed do so.
2905          */
2906         if (unlikely(atomic_long_read(&data->done_head))) {
2907                 /*
2908                  * Since we had it locked, we can lock it again.
2909                  */
2910                 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2911                         cpu_relax();
2912
2913                 goto again;
2914         }
2915
2916         if (atomic_xchg(&data->wakeup, 0))
2917                 perf_output_wakeup(handle);
2918 out:
2919         put_cpu();
2920 }
2921
2922 void perf_output_copy(struct perf_output_handle *handle,
2923                       const void *buf, unsigned int len)
2924 {
2925         unsigned int pages_mask;
2926         unsigned long offset;
2927         unsigned int size;
2928         void **pages;
2929
2930         offset          = handle->offset;
2931         pages_mask      = handle->data->nr_pages - 1;
2932         pages           = handle->data->data_pages;
2933
2934         do {
2935                 unsigned long page_offset;
2936                 unsigned long page_size;
2937                 int nr;
2938
2939                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
2940                 page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
2941                 page_offset = offset & (page_size - 1);
2942                 size        = min_t(unsigned int, page_size - page_offset, len);
2943
2944                 memcpy(pages[nr] + page_offset, buf, size);
2945
2946                 len         -= size;
2947                 buf         += size;
2948                 offset      += size;
2949         } while (len);
2950
2951         handle->offset = offset;
2952
2953         /*
2954          * Check we didn't copy past our reservation window, taking the
2955          * possible unsigned int wrap into account.
2956          */
2957         WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2958 }
2959
2960 int perf_output_begin(struct perf_output_handle *handle,
2961                       struct perf_event *event, unsigned int size,
2962                       int nmi, int sample)
2963 {
2964         struct perf_event *output_event;
2965         struct perf_mmap_data *data;
2966         unsigned long tail, offset, head;
2967         int have_lost;
2968         struct {
2969                 struct perf_event_header header;
2970                 u64                      id;
2971                 u64                      lost;
2972         } lost_event;
2973
2974         rcu_read_lock();
2975         /*
2976          * For inherited events we send all the output towards the parent.
2977          */
2978         if (event->parent)
2979                 event = event->parent;
2980
2981         output_event = rcu_dereference(event->output);
2982         if (output_event)
2983                 event = output_event;
2984
2985         data = rcu_dereference(event->data);
2986         if (!data)
2987                 goto out;
2988
2989         handle->data    = data;
2990         handle->event   = event;
2991         handle->nmi     = nmi;
2992         handle->sample  = sample;
2993
2994         if (!data->nr_pages)
2995                 goto fail;
2996
2997         have_lost = atomic_read(&data->lost);
2998         if (have_lost)
2999                 size += sizeof(lost_event);
3000
3001         perf_output_lock(handle);
3002
3003         do {
3004                 /*
3005                  * Userspace could choose to issue a mb() before updating the
3006                  * tail pointer. So that all reads will be completed before the
3007                  * write is issued.
3008                  */
3009                 tail = ACCESS_ONCE(data->user_page->data_tail);
3010                 smp_rmb();
3011                 offset = head = atomic_long_read(&data->head);
3012                 head += size;
3013                 if (unlikely(!perf_output_space(data, tail, offset, head)))
3014                         goto fail;
3015         } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
3016
3017         handle->offset  = offset;
3018         handle->head    = head;
3019
3020         if (head - tail > data->watermark)
3021                 atomic_set(&data->wakeup, 1);
3022
3023         if (have_lost) {
3024                 lost_event.header.type = PERF_RECORD_LOST;
3025                 lost_event.header.misc = 0;
3026                 lost_event.header.size = sizeof(lost_event);
3027                 lost_event.id          = event->id;
3028                 lost_event.lost        = atomic_xchg(&data->lost, 0);
3029
3030                 perf_output_put(handle, lost_event);
3031         }
3032
3033         return 0;
3034
3035 fail:
3036         atomic_inc(&data->lost);
3037         perf_output_unlock(handle);
3038 out:
3039         rcu_read_unlock();
3040
3041         return -ENOSPC;
3042 }
3043
3044 void perf_output_end(struct perf_output_handle *handle)
3045 {
3046         struct perf_event *event = handle->event;
3047         struct perf_mmap_data *data = handle->data;
3048
3049         int wakeup_events = event->attr.wakeup_events;
3050
3051         if (handle->sample && wakeup_events) {
3052                 int events = atomic_inc_return(&data->events);
3053                 if (events >= wakeup_events) {
3054                         atomic_sub(wakeup_events, &data->events);
3055                         atomic_set(&data->wakeup, 1);
3056                 }
3057         }
3058
3059         perf_output_unlock(handle);
3060         rcu_read_unlock();
3061 }
3062
3063 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3064 {
3065         /*
3066          * only top level events have the pid namespace they were created in
3067          */
3068         if (event->parent)
3069                 event = event->parent;
3070
3071         return task_tgid_nr_ns(p, event->ns);
3072 }
3073
3074 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3075 {
3076         /*
3077          * only top level events have the pid namespace they were created in
3078          */
3079         if (event->parent)
3080                 event = event->parent;
3081
3082         return task_pid_nr_ns(p, event->ns);
3083 }
3084
3085 static void perf_output_read_one(struct perf_output_handle *handle,
3086                                  struct perf_event *event)
3087 {
3088         u64 read_format = event->attr.read_format;
3089         u64 values[4];
3090         int n = 0;
3091
3092         values[n++] = atomic64_read(&event->count);
3093         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3094                 values[n++] = event->total_time_enabled +
3095                         atomic64_read(&event->child_total_time_enabled);
3096         }
3097         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3098                 values[n++] = event->total_time_running +
3099                         atomic64_read(&event->child_total_time_running);
3100         }
3101         if (read_format & PERF_FORMAT_ID)
3102                 values[n++] = primary_event_id(event);
3103
3104         perf_output_copy(handle, values, n * sizeof(u64));
3105 }
3106
3107 /*
3108  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3109  */
3110 static void perf_output_read_group(struct perf_output_handle *handle,
3111                             struct perf_event *event)
3112 {
3113         struct perf_event *leader = event->group_leader, *sub;
3114         u64 read_format = event->attr.read_format;
3115         u64 values[5];
3116         int n = 0;
3117
3118         values[n++] = 1 + leader->nr_siblings;
3119
3120         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3121                 values[n++] = leader->total_time_enabled;
3122
3123         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3124                 values[n++] = leader->total_time_running;
3125
3126         if (leader != event)
3127                 leader->pmu->read(leader);
3128
3129         values[n++] = atomic64_read(&leader->count);
3130         if (read_format & PERF_FORMAT_ID)
3131                 values[n++] = primary_event_id(leader);
3132
3133         perf_output_copy(handle, values, n * sizeof(u64));
3134
3135         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3136                 n = 0;
3137
3138                 if (sub != event)
3139                         sub->pmu->read(sub);
3140
3141                 values[n++] = atomic64_read(&sub->count);
3142                 if (read_format & PERF_FORMAT_ID)
3143                         values[n++] = primary_event_id(sub);
3144
3145                 perf_output_copy(handle, values, n * sizeof(u64));
3146         }
3147 }
3148
3149 static void perf_output_read(struct perf_output_handle *handle,
3150                              struct perf_event *event)
3151 {
3152         if (event->attr.read_format & PERF_FORMAT_GROUP)
3153                 perf_output_read_group(handle, event);
3154         else
3155                 perf_output_read_one(handle, event);
3156 }
3157
3158 void perf_output_sample(struct perf_output_handle *handle,
3159                         struct perf_event_header *header,
3160                         struct perf_sample_data *data,
3161                         struct perf_event *event)
3162 {
3163         u64 sample_type = data->type;
3164
3165         perf_output_put(handle, *header);
3166
3167         if (sample_type & PERF_SAMPLE_IP)
3168                 perf_output_put(handle, data->ip);
3169
3170         if (sample_type & PERF_SAMPLE_TID)
3171                 perf_output_put(handle, data->tid_entry);
3172
3173         if (sample_type & PERF_SAMPLE_TIME)
3174                 perf_output_put(handle, data->time);
3175
3176         if (sample_type & PERF_SAMPLE_ADDR)
3177                 perf_output_put(handle, data->addr);
3178
3179         if (sample_type & PERF_SAMPLE_ID)
3180                 perf_output_put(handle, data->id);
3181
3182         if (sample_type & PERF_SAMPLE_STREAM_ID)
3183                 perf_output_put(handle, data->stream_id);
3184
3185         if (sample_type & PERF_SAMPLE_CPU)
3186                 perf_output_put(handle, data->cpu_entry);
3187
3188         if (sample_type & PERF_SAMPLE_PERIOD)
3189                 perf_output_put(handle, data->period);
3190
3191         if (sample_type & PERF_SAMPLE_READ)
3192                 perf_output_read(handle, event);
3193
3194         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3195                 if (data->callchain) {
3196                         int size = 1;
3197
3198                         if (data->callchain)
3199                                 size += data->callchain->nr;
3200
3201                         size *= sizeof(u64);
3202
3203                         perf_output_copy(handle, data->callchain, size);
3204                 } else {
3205                         u64 nr = 0;
3206                         perf_output_put(handle, nr);
3207                 }
3208         }
3209
3210         if (sample_type & PERF_SAMPLE_RAW) {
3211                 if (data->raw) {
3212                         perf_output_put(handle, data->raw->size);
3213                         perf_output_copy(handle, data->raw->data,
3214                                          data->raw->size);
3215                 } else {
3216                         struct {
3217                                 u32     size;
3218                                 u32     data;
3219                         } raw = {
3220                                 .size = sizeof(u32),
3221                                 .data = 0,
3222                         };
3223                         perf_output_put(handle, raw);
3224                 }
3225         }
3226 }
3227
3228 void perf_prepare_sample(struct perf_event_header *header,
3229                          struct perf_sample_data *data,
3230                          struct perf_event *event,
3231                          struct pt_regs *regs)
3232 {
3233         u64 sample_type = event->attr.sample_type;
3234
3235         data->type = sample_type;
3236
3237         header->type = PERF_RECORD_SAMPLE;
3238         header->size = sizeof(*header);
3239
3240         header->misc = 0;
3241         header->misc |= perf_misc_flags(regs);
3242
3243         if (sample_type & PERF_SAMPLE_IP) {
3244                 data->ip = perf_instruction_pointer(regs);
3245
3246                 header->size += sizeof(data->ip);
3247         }
3248
3249         if (sample_type & PERF_SAMPLE_TID) {
3250                 /* namespace issues */
3251                 data->tid_entry.pid = perf_event_pid(event, current);
3252                 data->tid_entry.tid = perf_event_tid(event, current);
3253
3254                 header->size += sizeof(data->tid_entry);
3255         }
3256
3257         if (sample_type & PERF_SAMPLE_TIME) {
3258                 data->time = perf_clock();
3259
3260                 header->size += sizeof(data->time);
3261         }
3262
3263         if (sample_type & PERF_SAMPLE_ADDR)
3264                 header->size += sizeof(data->addr);
3265
3266         if (sample_type & PERF_SAMPLE_ID) {
3267                 data->id = primary_event_id(event);
3268
3269                 header->size += sizeof(data->id);
3270         }
3271
3272         if (sample_type & PERF_SAMPLE_STREAM_ID) {
3273                 data->stream_id = event->id;
3274
3275                 header->size += sizeof(data->stream_id);
3276         }
3277
3278         if (sample_type & PERF_SAMPLE_CPU) {
3279                 data->cpu_entry.cpu             = raw_smp_processor_id();
3280                 data->cpu_entry.reserved        = 0;
3281
3282                 header->size += sizeof(data->cpu_entry);
3283         }
3284
3285         if (sample_type & PERF_SAMPLE_PERIOD)
3286                 header->size += sizeof(data->period);
3287
3288         if (sample_type & PERF_SAMPLE_READ)
3289                 header->size += perf_event_read_size(event);
3290
3291         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3292                 int size = 1;
3293
3294                 data->callchain = perf_callchain(regs);
3295
3296                 if (data->callchain)
3297                         size += data->callchain->nr;
3298
3299                 header->size += size * sizeof(u64);
3300         }
3301
3302         if (sample_type & PERF_SAMPLE_RAW) {
3303                 int size = sizeof(u32);
3304
3305                 if (data->raw)
3306                         size += data->raw->size;
3307                 else
3308                         size += sizeof(u32);
3309
3310                 WARN_ON_ONCE(size & (sizeof(u64)-1));
3311                 header->size += size;
3312         }
3313 }
3314
3315 static void perf_event_output(struct perf_event *event, int nmi,
3316                                 struct perf_sample_data *data,
3317                                 struct pt_regs *regs)
3318 {
3319         struct perf_output_handle handle;
3320         struct perf_event_header header;
3321
3322         perf_prepare_sample(&header, data, event, regs);
3323
3324         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3325                 return;
3326
3327         perf_output_sample(&handle, &header, data, event);
3328
3329         perf_output_end(&handle);
3330 }
3331
3332 /*
3333  * read event_id
3334  */
3335
3336 struct perf_read_event {
3337         struct perf_event_header        header;
3338
3339         u32                             pid;
3340         u32                             tid;
3341 };
3342
3343 static void
3344 perf_event_read_event(struct perf_event *event,
3345                         struct task_struct *task)
3346 {
3347         struct perf_output_handle handle;
3348         struct perf_read_event read_event = {
3349                 .header = {
3350                         .type = PERF_RECORD_READ,
3351                         .misc = 0,
3352                         .size = sizeof(read_event) + perf_event_read_size(event),
3353                 },
3354                 .pid = perf_event_pid(event, task),
3355                 .tid = perf_event_tid(event, task),
3356         };
3357         int ret;
3358
3359         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3360         if (ret)
3361                 return;
3362
3363         perf_output_put(&handle, read_event);
3364         perf_output_read(&handle, event);
3365
3366         perf_output_end(&handle);
3367 }
3368
3369 /*
3370  * task tracking -- fork/exit
3371  *
3372  * enabled by: attr.comm | attr.mmap | attr.task
3373  */
3374
3375 struct perf_task_event {
3376         struct task_struct              *task;
3377         struct perf_event_context       *task_ctx;
3378
3379         struct {
3380                 struct perf_event_header        header;
3381
3382                 u32                             pid;
3383                 u32                             ppid;
3384                 u32                             tid;
3385                 u32                             ptid;
3386                 u64                             time;
3387         } event_id;
3388 };
3389
3390 static void perf_event_task_output(struct perf_event *event,
3391                                      struct perf_task_event *task_event)
3392 {
3393         struct perf_output_handle handle;
3394         int size;
3395         struct task_struct *task = task_event->task;
3396         int ret;
3397
3398         size  = task_event->event_id.header.size;
3399         ret = perf_output_begin(&handle, event, size, 0, 0);
3400
3401         if (ret)
3402                 return;
3403
3404         task_event->event_id.pid = perf_event_pid(event, task);
3405         task_event->event_id.ppid = perf_event_pid(event, current);
3406
3407         task_event->event_id.tid = perf_event_tid(event, task);
3408         task_event->event_id.ptid = perf_event_tid(event, current);
3409
3410         task_event->event_id.time = perf_clock();
3411
3412         perf_output_put(&handle, task_event->event_id);
3413
3414         perf_output_end(&handle);
3415 }
3416
3417 static int perf_event_task_match(struct perf_event *event)
3418 {
3419         if (event->state != PERF_EVENT_STATE_ACTIVE)
3420                 return 0;
3421
3422         if (event->cpu != -1 && event->cpu != smp_processor_id())
3423                 return 0;
3424
3425         if (event->attr.comm || event->attr.mmap || event->attr.task)
3426                 return 1;
3427
3428         return 0;
3429 }
3430
3431 static void perf_event_task_ctx(struct perf_event_context *ctx,
3432                                   struct perf_task_event *task_event)
3433 {
3434         struct perf_event *event;
3435
3436         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3437                 if (perf_event_task_match(event))
3438                         perf_event_task_output(event, task_event);
3439         }
3440 }
3441
3442 static void perf_event_task_event(struct perf_task_event *task_event)
3443 {
3444         struct perf_cpu_context *cpuctx;
3445         struct perf_event_context *ctx = task_event->task_ctx;
3446
3447         rcu_read_lock();
3448         cpuctx = &get_cpu_var(perf_cpu_context);
3449         perf_event_task_ctx(&cpuctx->ctx, task_event);
3450         if (!ctx)
3451                 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3452         if (ctx)
3453                 perf_event_task_ctx(ctx, task_event);
3454         put_cpu_var(perf_cpu_context);
3455         rcu_read_unlock();
3456 }
3457
3458 static void perf_event_task(struct task_struct *task,
3459                               struct perf_event_context *task_ctx,
3460                               int new)
3461 {
3462         struct perf_task_event task_event;
3463
3464         if (!atomic_read(&nr_comm_events) &&
3465             !atomic_read(&nr_mmap_events) &&
3466             !atomic_read(&nr_task_events))
3467                 return;
3468
3469         task_event = (struct perf_task_event){
3470                 .task     = task,
3471                 .task_ctx = task_ctx,
3472                 .event_id    = {
3473                         .header = {
3474                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3475                                 .misc = 0,
3476                                 .size = sizeof(task_event.event_id),
3477                         },
3478                         /* .pid  */
3479                         /* .ppid */
3480                         /* .tid  */
3481                         /* .ptid */
3482                 },
3483         };
3484
3485         perf_event_task_event(&task_event);
3486 }
3487
3488 void perf_event_fork(struct task_struct *task)
3489 {
3490         perf_event_task(task, NULL, 1);
3491 }
3492
3493 /*
3494  * comm tracking
3495  */
3496
3497 struct perf_comm_event {
3498         struct task_struct      *task;
3499         char                    *comm;
3500         int                     comm_size;
3501
3502         struct {
3503                 struct perf_event_header        header;
3504
3505                 u32                             pid;
3506                 u32                             tid;
3507         } event_id;
3508 };
3509
3510 static void perf_event_comm_output(struct perf_event *event,
3511                                      struct perf_comm_event *comm_event)
3512 {
3513         struct perf_output_handle handle;
3514         int size = comm_event->event_id.header.size;
3515         int ret = perf_output_begin(&handle, event, size, 0, 0);
3516
3517         if (ret)
3518                 return;
3519
3520         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3521         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3522
3523         perf_output_put(&handle, comm_event->event_id);
3524         perf_output_copy(&handle, comm_event->comm,
3525                                    comm_event->comm_size);
3526         perf_output_end(&handle);
3527 }
3528
3529 static int perf_event_comm_match(struct perf_event *event)
3530 {
3531         if (event->state != PERF_EVENT_STATE_ACTIVE)
3532                 return 0;
3533
3534         if (event->cpu != -1 && event->cpu != smp_processor_id())
3535                 return 0;
3536
3537         if (event->attr.comm)
3538                 return 1;
3539
3540         return 0;
3541 }
3542
3543 static void perf_event_comm_ctx(struct perf_event_context *ctx,
3544                                   struct perf_comm_event *comm_event)
3545 {
3546         struct perf_event *event;
3547
3548         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3549                 if (perf_event_comm_match(event))
3550                         perf_event_comm_output(event, comm_event);
3551         }
3552 }
3553
3554 static void perf_event_comm_event(struct perf_comm_event *comm_event)
3555 {
3556         struct perf_cpu_context *cpuctx;
3557         struct perf_event_context *ctx;
3558         unsigned int size;
3559         char comm[TASK_COMM_LEN];
3560
3561         memset(comm, 0, sizeof(comm));
3562         strlcpy(comm, comm_event->task->comm, sizeof(comm));
3563         size = ALIGN(strlen(comm)+1, sizeof(u64));
3564
3565         comm_event->comm = comm;
3566         comm_event->comm_size = size;
3567
3568         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3569
3570         rcu_read_lock();
3571         cpuctx = &get_cpu_var(perf_cpu_context);
3572         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3573         ctx = rcu_dereference(current->perf_event_ctxp);
3574         if (ctx)
3575                 perf_event_comm_ctx(ctx, comm_event);
3576         put_cpu_var(perf_cpu_context);
3577         rcu_read_unlock();
3578 }
3579
3580 void perf_event_comm(struct task_struct *task)
3581 {
3582         struct perf_comm_event comm_event;
3583
3584         if (task->perf_event_ctxp)
3585                 perf_event_enable_on_exec(task);
3586
3587         if (!atomic_read(&nr_comm_events))
3588                 return;
3589
3590         comm_event = (struct perf_comm_event){
3591                 .task   = task,
3592                 /* .comm      */
3593                 /* .comm_size */
3594                 .event_id  = {
3595                         .header = {
3596                                 .type = PERF_RECORD_COMM,
3597                                 .misc = 0,
3598                                 /* .size */
3599                         },
3600                         /* .pid */
3601                         /* .tid */
3602                 },
3603         };
3604
3605         perf_event_comm_event(&comm_event);
3606 }
3607
3608 /*
3609  * mmap tracking
3610  */
3611
3612 struct perf_mmap_event {
3613         struct vm_area_struct   *vma;
3614
3615         const char              *file_name;
3616         int                     file_size;
3617
3618         struct {
3619                 struct perf_event_header        header;
3620
3621                 u32                             pid;
3622                 u32                             tid;
3623                 u64                             start;
3624                 u64                             len;
3625                 u64                             pgoff;
3626         } event_id;
3627 };
3628
3629 static void perf_event_mmap_output(struct perf_event *event,
3630                                      struct perf_mmap_event *mmap_event)
3631 {
3632         struct perf_output_handle handle;
3633         int size = mmap_event->event_id.header.size;
3634         int ret = perf_output_begin(&handle, event, size, 0, 0);
3635
3636         if (ret)
3637                 return;
3638
3639         mmap_event->event_id.pid = perf_event_pid(event, current);
3640         mmap_event->event_id.tid = perf_event_tid(event, current);
3641
3642         perf_output_put(&handle, mmap_event->event_id);
3643         perf_output_copy(&handle, mmap_event->file_name,
3644                                    mmap_event->file_size);
3645         perf_output_end(&handle);
3646 }
3647
3648 static int perf_event_mmap_match(struct perf_event *event,
3649                                    struct perf_mmap_event *mmap_event)
3650 {
3651         if (event->state != PERF_EVENT_STATE_ACTIVE)
3652                 return 0;
3653
3654         if (event->cpu != -1 && event->cpu != smp_processor_id())
3655                 return 0;
3656
3657         if (event->attr.mmap)
3658                 return 1;
3659
3660         return 0;
3661 }
3662
3663 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3664                                   struct perf_mmap_event *mmap_event)
3665 {
3666         struct perf_event *event;
3667
3668         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3669                 if (perf_event_mmap_match(event, mmap_event))
3670                         perf_event_mmap_output(event, mmap_event);
3671         }
3672 }
3673
3674 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3675 {
3676         struct perf_cpu_context *cpuctx;
3677         struct perf_event_context *ctx;
3678         struct vm_area_struct *vma = mmap_event->vma;
3679         struct file *file = vma->vm_file;
3680         unsigned int size;
3681         char tmp[16];
3682         char *buf = NULL;
3683         const char *name;
3684
3685         memset(tmp, 0, sizeof(tmp));
3686
3687         if (file) {
3688                 /*
3689                  * d_path works from the end of the buffer backwards, so we
3690                  * need to add enough zero bytes after the string to handle
3691                  * the 64bit alignment we do later.
3692                  */
3693                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3694                 if (!buf) {
3695                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3696                         goto got_name;
3697                 }
3698                 name = d_path(&file->f_path, buf, PATH_MAX);
3699                 if (IS_ERR(name)) {
3700                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3701                         goto got_name;
3702                 }
3703         } else {
3704                 if (arch_vma_name(mmap_event->vma)) {
3705                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3706                                        sizeof(tmp));
3707                         goto got_name;
3708                 }
3709
3710                 if (!vma->vm_mm) {
3711                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3712                         goto got_name;
3713                 }
3714
3715                 name = strncpy(tmp, "//anon", sizeof(tmp));
3716                 goto got_name;
3717         }
3718
3719 got_name:
3720         size = ALIGN(strlen(name)+1, sizeof(u64));
3721
3722         mmap_event->file_name = name;
3723         mmap_event->file_size = size;
3724
3725         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3726
3727         rcu_read_lock();
3728         cpuctx = &get_cpu_var(perf_cpu_context);
3729         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3730         ctx = rcu_dereference(current->perf_event_ctxp);
3731         if (ctx)
3732                 perf_event_mmap_ctx(ctx, mmap_event);
3733         put_cpu_var(perf_cpu_context);
3734         rcu_read_unlock();
3735
3736         kfree(buf);
3737 }
3738
3739 void __perf_event_mmap(struct vm_area_struct *vma)
3740 {
3741         struct perf_mmap_event mmap_event;
3742
3743         if (!atomic_read(&nr_mmap_events))
3744                 return;
3745
3746         mmap_event = (struct perf_mmap_event){
3747                 .vma    = vma,
3748                 /* .file_name */
3749                 /* .file_size */
3750                 .event_id  = {
3751                         .header = {
3752                                 .type = PERF_RECORD_MMAP,
3753                                 .misc = 0,
3754                                 /* .size */
3755                         },
3756                         /* .pid */
3757                         /* .tid */
3758                         .start  = vma->vm_start,
3759                         .len    = vma->vm_end - vma->vm_start,
3760                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
3761                 },
3762         };
3763
3764         perf_event_mmap_event(&mmap_event);
3765 }
3766
3767 /*
3768  * IRQ throttle logging
3769  */
3770
3771 static void perf_log_throttle(struct perf_event *event, int enable)
3772 {
3773         struct perf_output_handle handle;
3774         int ret;
3775
3776         struct {
3777                 struct perf_event_header        header;
3778                 u64                             time;
3779                 u64                             id;
3780                 u64                             stream_id;
3781         } throttle_event = {
3782                 .header = {
3783                         .type = PERF_RECORD_THROTTLE,
3784                         .misc = 0,
3785                         .size = sizeof(throttle_event),
3786                 },
3787                 .time           = perf_clock(),
3788                 .id             = primary_event_id(event),
3789                 .stream_id      = event->id,
3790         };
3791
3792         if (enable)
3793                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3794
3795         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3796         if (ret)
3797                 return;
3798
3799         perf_output_put(&handle, throttle_event);
3800         perf_output_end(&handle);
3801 }
3802
3803 /*
3804  * Generic event overflow handling, sampling.
3805  */
3806
3807 static int __perf_event_overflow(struct perf_event *event, int nmi,
3808                                    int throttle, struct perf_sample_data *data,
3809                                    struct pt_regs *regs)
3810 {
3811         int events = atomic_read(&event->event_limit);
3812         struct hw_perf_event *hwc = &event->hw;
3813         int ret = 0;
3814
3815         throttle = (throttle && event->pmu->unthrottle != NULL);
3816
3817         if (!throttle) {
3818                 hwc->interrupts++;
3819         } else {
3820                 if (hwc->interrupts != MAX_INTERRUPTS) {
3821                         hwc->interrupts++;
3822                         if (HZ * hwc->interrupts >
3823                                         (u64)sysctl_perf_event_sample_rate) {
3824                                 hwc->interrupts = MAX_INTERRUPTS;
3825                                 perf_log_throttle(event, 0);
3826                                 ret = 1;
3827                         }
3828                 } else {
3829                         /*
3830                          * Keep re-disabling events even though on the previous
3831                          * pass we disabled it - just in case we raced with a
3832                          * sched-in and the event got enabled again:
3833                          */
3834                         ret = 1;
3835                 }
3836         }
3837
3838         if (event->attr.freq) {
3839                 u64 now = perf_clock();
3840                 s64 delta = now - hwc->freq_time_stamp;
3841
3842                 hwc->freq_time_stamp = now;
3843
3844                 if (delta > 0 && delta < 2*TICK_NSEC)
3845                         perf_adjust_period(event, delta, hwc->last_period);
3846         }
3847
3848         /*
3849          * XXX event_limit might not quite work as expected on inherited
3850          * events
3851          */
3852
3853         event->pending_kill = POLL_IN;
3854         if (events && atomic_dec_and_test(&event->event_limit)) {
3855                 ret = 1;
3856                 event->pending_kill = POLL_HUP;
3857                 if (nmi) {
3858                         event->pending_disable = 1;
3859                         perf_pending_queue(&event->pending,
3860                                            perf_pending_event);
3861                 } else
3862                         perf_event_disable(event);
3863         }
3864
3865         if (event->overflow_handler)
3866                 event->overflow_handler(event, nmi, data, regs);
3867         else
3868                 perf_event_output(event, nmi, data, regs);
3869
3870         return ret;
3871 }
3872
3873 int perf_event_overflow(struct perf_event *event, int nmi,
3874                           struct perf_sample_data *data,
3875                           struct pt_regs *regs)
3876 {
3877         return __perf_event_overflow(event, nmi, 1, data, regs);
3878 }
3879
3880 /*
3881  * Generic software event infrastructure
3882  */
3883
3884 /*
3885  * We directly increment event->count and keep a second value in
3886  * event->hw.period_left to count intervals. This period event
3887  * is kept in the range [-sample_period, 0] so that we can use the
3888  * sign as trigger.
3889  */
3890
3891 static u64 perf_swevent_set_period(struct perf_event *event)
3892 {
3893         struct hw_perf_event *hwc = &event->hw;
3894         u64 period = hwc->last_period;
3895         u64 nr, offset;
3896         s64 old, val;
3897
3898         hwc->last_period = hwc->sample_period;
3899
3900 again:
3901         old = val = atomic64_read(&hwc->period_left);
3902         if (val < 0)
3903                 return 0;
3904
3905         nr = div64_u64(period + val, period);
3906         offset = nr * period;
3907         val -= offset;
3908         if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3909                 goto again;
3910
3911         return nr;
3912 }
3913
3914 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3915                                     int nmi, struct perf_sample_data *data,
3916                                     struct pt_regs *regs)
3917 {
3918         struct hw_perf_event *hwc = &event->hw;
3919         int throttle = 0;
3920
3921         data->period = event->hw.last_period;
3922         if (!overflow)
3923                 overflow = perf_swevent_set_period(event);
3924
3925         if (hwc->interrupts == MAX_INTERRUPTS)
3926                 return;
3927
3928         for (; overflow; overflow--) {
3929                 if (__perf_event_overflow(event, nmi, throttle,
3930                                             data, regs)) {
3931                         /*
3932                          * We inhibit the overflow from happening when
3933                          * hwc->interrupts == MAX_INTERRUPTS.
3934                          */
3935                         break;
3936                 }
3937                 throttle = 1;
3938         }
3939 }
3940
3941 static void perf_swevent_unthrottle(struct perf_event *event)
3942 {
3943         /*
3944          * Nothing to do, we already reset hwc->interrupts.
3945          */
3946 }
3947
3948 static void perf_swevent_add(struct perf_event *event, u64 nr,
3949                                int nmi, struct perf_sample_data *data,
3950                                struct pt_regs *regs)
3951 {
3952         struct hw_perf_event *hwc = &event->hw;
3953
3954         atomic64_add(nr, &event->count);
3955
3956         if (!regs)
3957                 return;
3958
3959         if (!hwc->sample_period)
3960                 return;
3961
3962         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3963                 return perf_swevent_overflow(event, 1, nmi, data, regs);
3964
3965         if (atomic64_add_negative(nr, &hwc->period_left))
3966                 return;
3967
3968         perf_swevent_overflow(event, 0, nmi, data, regs);
3969 }
3970
3971 static int perf_swevent_is_counting(struct perf_event *event)
3972 {
3973         /*
3974          * The event is active, we're good!
3975          */
3976         if (event->state == PERF_EVENT_STATE_ACTIVE)
3977                 return 1;
3978
3979         /*
3980          * The event is off/error, not counting.
3981          */
3982         if (event->state != PERF_EVENT_STATE_INACTIVE)
3983                 return 0;
3984
3985         /*
3986          * The event is inactive, if the context is active
3987          * we're part of a group that didn't make it on the 'pmu',
3988          * not counting.
3989          */
3990         if (event->ctx->is_active)
3991                 return 0;
3992
3993         /*
3994          * We're inactive and the context is too, this means the
3995          * task is scheduled out, we're counting events that happen
3996          * to us, like migration events.
3997          */
3998         return 1;
3999 }
4000
4001 static int perf_tp_event_match(struct perf_event *event,
4002                                 struct perf_sample_data *data);
4003
4004 static int perf_exclude_event(struct perf_event *event,
4005                               struct pt_regs *regs)
4006 {
4007         if (regs) {
4008                 if (event->attr.exclude_user && user_mode(regs))
4009                         return 1;
4010
4011                 if (event->attr.exclude_kernel && !user_mode(regs))
4012                         return 1;
4013         }
4014
4015         return 0;
4016 }
4017
4018 static int perf_swevent_match(struct perf_event *event,
4019                                 enum perf_type_id type,
4020                                 u32 event_id,
4021                                 struct perf_sample_data *data,
4022                                 struct pt_regs *regs)
4023 {
4024         if (event->cpu != -1 && event->cpu != smp_processor_id())
4025                 return 0;
4026
4027         if (!perf_swevent_is_counting(event))
4028                 return 0;
4029
4030         if (event->attr.type != type)
4031                 return 0;
4032
4033         if (event->attr.config != event_id)
4034                 return 0;
4035
4036         if (perf_exclude_event(event, regs))
4037                 return 0;
4038
4039         if (event->attr.type == PERF_TYPE_TRACEPOINT &&
4040             !perf_tp_event_match(event, data))
4041                 return 0;
4042
4043         return 1;
4044 }
4045
4046 static void perf_swevent_ctx_event(struct perf_event_context *ctx,
4047                                      enum perf_type_id type,
4048                                      u32 event_id, u64 nr, int nmi,
4049                                      struct perf_sample_data *data,
4050                                      struct pt_regs *regs)
4051 {
4052         struct perf_event *event;
4053
4054         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4055                 if (perf_swevent_match(event, type, event_id, data, regs))
4056                         perf_swevent_add(event, nr, nmi, data, regs);
4057         }
4058 }
4059
4060 int perf_swevent_get_recursion_context(void)
4061 {
4062         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
4063         int rctx;
4064
4065         if (in_nmi())
4066                 rctx = 3;
4067         else if (in_irq())
4068                 rctx = 2;
4069         else if (in_softirq())
4070                 rctx = 1;
4071         else
4072                 rctx = 0;
4073
4074         if (cpuctx->recursion[rctx]) {
4075                 put_cpu_var(perf_cpu_context);
4076                 return -1;
4077         }
4078
4079         cpuctx->recursion[rctx]++;
4080         barrier();
4081
4082         return rctx;
4083 }
4084 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4085
4086 void perf_swevent_put_recursion_context(int rctx)
4087 {
4088         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4089         barrier();
4090         cpuctx->recursion[rctx]--;
4091         put_cpu_var(perf_cpu_context);
4092 }
4093 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4094
4095 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4096                                     u64 nr, int nmi,
4097                                     struct perf_sample_data *data,
4098                                     struct pt_regs *regs)
4099 {
4100         struct perf_cpu_context *cpuctx;
4101         struct perf_event_context *ctx;
4102
4103         cpuctx = &__get_cpu_var(perf_cpu_context);
4104         rcu_read_lock();
4105         perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4106                                  nr, nmi, data, regs);
4107         /*
4108          * doesn't really matter which of the child contexts the
4109          * events ends up in.
4110          */
4111         ctx = rcu_dereference(current->perf_event_ctxp);
4112         if (ctx)
4113                 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4114         rcu_read_unlock();
4115 }
4116
4117 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4118                             struct pt_regs *regs, u64 addr)
4119 {
4120         struct perf_sample_data data;
4121         int rctx;
4122
4123         rctx = perf_swevent_get_recursion_context();
4124         if (rctx < 0)
4125                 return;
4126
4127         data.addr = addr;
4128         data.raw  = NULL;
4129
4130         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4131
4132         perf_swevent_put_recursion_context(rctx);
4133 }
4134
4135 static void perf_swevent_read(struct perf_event *event)
4136 {
4137 }
4138
4139 static int perf_swevent_enable(struct perf_event *event)
4140 {
4141         struct hw_perf_event *hwc = &event->hw;
4142
4143         if (hwc->sample_period) {
4144                 hwc->last_period = hwc->sample_period;
4145                 perf_swevent_set_period(event);
4146         }
4147         return 0;
4148 }
4149
4150 static void perf_swevent_disable(struct perf_event *event)
4151 {
4152 }
4153
4154 static const struct pmu perf_ops_generic = {
4155         .enable         = perf_swevent_enable,
4156         .disable        = perf_swevent_disable,
4157         .read           = perf_swevent_read,
4158         .unthrottle     = perf_swevent_unthrottle,
4159 };
4160
4161 /*
4162  * hrtimer based swevent callback
4163  */
4164
4165 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4166 {
4167         enum hrtimer_restart ret = HRTIMER_RESTART;
4168         struct perf_sample_data data;
4169         struct pt_regs *regs;
4170         struct perf_event *event;
4171         u64 period;
4172
4173         event   = container_of(hrtimer, struct perf_event, hw.hrtimer);
4174         event->pmu->read(event);
4175
4176         data.addr = 0;
4177         data.raw = NULL;
4178         data.period = event->hw.last_period;
4179         regs = get_irq_regs();
4180         /*
4181          * In case we exclude kernel IPs or are somehow not in interrupt
4182          * context, provide the next best thing, the user IP.
4183          */
4184         if ((event->attr.exclude_kernel || !regs) &&
4185                         !event->attr.exclude_user)
4186                 regs = task_pt_regs(current);
4187
4188         if (regs) {
4189                 if (!(event->attr.exclude_idle && current->pid == 0))
4190                         if (perf_event_overflow(event, 0, &data, regs))
4191                                 ret = HRTIMER_NORESTART;
4192         }
4193
4194         period = max_t(u64, 10000, event->hw.sample_period);
4195         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4196
4197         return ret;
4198 }
4199
4200 static void perf_swevent_start_hrtimer(struct perf_event *event)
4201 {
4202         struct hw_perf_event *hwc = &event->hw;
4203
4204         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4205         hwc->hrtimer.function = perf_swevent_hrtimer;
4206         if (hwc->sample_period) {
4207                 u64 period;
4208
4209                 if (hwc->remaining) {
4210                         if (hwc->remaining < 0)
4211                                 period = 10000;
4212                         else
4213                                 period = hwc->remaining;
4214                         hwc->remaining = 0;
4215                 } else {
4216                         period = max_t(u64, 10000, hwc->sample_period);
4217                 }
4218                 __hrtimer_start_range_ns(&hwc->hrtimer,
4219                                 ns_to_ktime(period), 0,
4220                                 HRTIMER_MODE_REL, 0);
4221         }
4222 }
4223
4224 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4225 {
4226         struct hw_perf_event *hwc = &event->hw;
4227
4228         if (hwc->sample_period) {
4229                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4230                 hwc->remaining = ktime_to_ns(remaining);
4231
4232                 hrtimer_cancel(&hwc->hrtimer);
4233         }
4234 }
4235
4236 /*
4237  * Software event: cpu wall time clock
4238  */
4239
4240 static void cpu_clock_perf_event_update(struct perf_event *event)
4241 {
4242         int cpu = raw_smp_processor_id();
4243         s64 prev;
4244         u64 now;
4245
4246         now = cpu_clock(cpu);
4247         prev = atomic64_xchg(&event->hw.prev_count, now);
4248         atomic64_add(now - prev, &event->count);
4249 }
4250
4251 static int cpu_clock_perf_event_enable(struct perf_event *event)
4252 {
4253         struct hw_perf_event *hwc = &event->hw;
4254         int cpu = raw_smp_processor_id();
4255
4256         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4257         perf_swevent_start_hrtimer(event);
4258
4259         return 0;
4260 }
4261
4262 static void cpu_clock_perf_event_disable(struct perf_event *event)
4263 {
4264         perf_swevent_cancel_hrtimer(event);
4265         cpu_clock_perf_event_update(event);
4266 }
4267
4268 static void cpu_clock_perf_event_read(struct perf_event *event)
4269 {
4270         cpu_clock_perf_event_update(event);
4271 }
4272
4273 static const struct pmu perf_ops_cpu_clock = {
4274         .enable         = cpu_clock_perf_event_enable,
4275         .disable        = cpu_clock_perf_event_disable,
4276         .read           = cpu_clock_perf_event_read,
4277 };
4278
4279 /*
4280  * Software event: task time clock
4281  */
4282
4283 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4284 {
4285         u64 prev;
4286         s64 delta;
4287
4288         prev = atomic64_xchg(&event->hw.prev_count, now);
4289         delta = now - prev;
4290         atomic64_add(delta, &event->count);
4291 }
4292
4293 static int task_clock_perf_event_enable(struct perf_event *event)
4294 {
4295         struct hw_perf_event *hwc = &event->hw;
4296         u64 now;
4297
4298         now = event->ctx->time;
4299
4300         atomic64_set(&hwc->prev_count, now);
4301
4302         perf_swevent_start_hrtimer(event);
4303
4304         return 0;
4305 }
4306
4307 static void task_clock_perf_event_disable(struct perf_event *event)
4308 {
4309         perf_swevent_cancel_hrtimer(event);
4310         task_clock_perf_event_update(event, event->ctx->time);
4311
4312 }
4313
4314 static void task_clock_perf_event_read(struct perf_event *event)
4315 {
4316         u64 time;
4317
4318         if (!in_nmi()) {
4319                 update_context_time(event->ctx);
4320                 time = event->ctx->time;
4321         } else {
4322                 u64 now = perf_clock();
4323                 u64 delta = now - event->ctx->timestamp;
4324                 time = event->ctx->time + delta;
4325         }
4326
4327         task_clock_perf_event_update(event, time);
4328 }
4329
4330 static const struct pmu perf_ops_task_clock = {
4331         .enable         = task_clock_perf_event_enable,
4332         .disable        = task_clock_perf_event_disable,
4333         .read           = task_clock_perf_event_read,
4334 };
4335
4336 #ifdef CONFIG_EVENT_TRACING
4337
4338 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4339                           int entry_size)
4340 {
4341         struct perf_raw_record raw = {
4342                 .size = entry_size,
4343                 .data = record,
4344         };
4345
4346         struct perf_sample_data data = {
4347                 .addr = addr,
4348                 .raw = &raw,
4349         };
4350
4351         struct pt_regs *regs = get_irq_regs();
4352
4353         if (!regs)
4354                 regs = task_pt_regs(current);
4355
4356         /* Trace events already protected against recursion */
4357         do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4358                                 &data, regs);
4359 }
4360 EXPORT_SYMBOL_GPL(perf_tp_event);
4361
4362 static int perf_tp_event_match(struct perf_event *event,
4363                                 struct perf_sample_data *data)
4364 {
4365         void *record = data->raw->data;
4366
4367         if (likely(!event->filter) || filter_match_preds(event->filter, record))
4368                 return 1;
4369         return 0;
4370 }
4371
4372 static void tp_perf_event_destroy(struct perf_event *event)
4373 {
4374         ftrace_profile_disable(event->attr.config);
4375 }
4376
4377 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4378 {
4379         /*
4380          * Raw tracepoint data is a severe data leak, only allow root to
4381          * have these.
4382          */
4383         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4384                         perf_paranoid_tracepoint_raw() &&
4385                         !capable(CAP_SYS_ADMIN))
4386                 return ERR_PTR(-EPERM);
4387
4388         if (ftrace_profile_enable(event->attr.config))
4389                 return NULL;
4390
4391         event->destroy = tp_perf_event_destroy;
4392
4393         return &perf_ops_generic;
4394 }
4395
4396 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4397 {
4398         char *filter_str;
4399         int ret;
4400
4401         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4402                 return -EINVAL;
4403
4404         filter_str = strndup_user(arg, PAGE_SIZE);
4405         if (IS_ERR(filter_str))
4406                 return PTR_ERR(filter_str);
4407
4408         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4409
4410         kfree(filter_str);
4411         return ret;
4412 }
4413
4414 static void perf_event_free_filter(struct perf_event *event)
4415 {
4416         ftrace_profile_free_filter(event);
4417 }
4418
4419 #else
4420
4421 static int perf_tp_event_match(struct perf_event *event,
4422                                 struct perf_sample_data *data)
4423 {
4424         return 1;
4425 }
4426
4427 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4428 {
4429         return NULL;
4430 }
4431
4432 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4433 {
4434         return -ENOENT;
4435 }
4436
4437 static void perf_event_free_filter(struct perf_event *event)
4438 {
4439 }
4440
4441 #endif /* CONFIG_EVENT_TRACING */
4442
4443 #ifdef CONFIG_HAVE_HW_BREAKPOINT
4444 static void bp_perf_event_destroy(struct perf_event *event)
4445 {
4446         release_bp_slot(event);
4447 }
4448
4449 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4450 {
4451         int err;
4452
4453         err = register_perf_hw_breakpoint(bp);
4454         if (err)
4455                 return ERR_PTR(err);
4456
4457         bp->destroy = bp_perf_event_destroy;
4458
4459         return &perf_ops_bp;
4460 }
4461
4462 void perf_bp_event(struct perf_event *bp, void *data)
4463 {
4464         struct perf_sample_data sample;
4465         struct pt_regs *regs = data;
4466
4467         sample.raw = NULL;
4468         sample.addr = bp->attr.bp_addr;
4469
4470         if (!perf_exclude_event(bp, regs))
4471                 perf_swevent_add(bp, 1, 1, &sample, regs);
4472 }
4473 #else
4474 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4475 {
4476         return NULL;
4477 }
4478
4479 void perf_bp_event(struct perf_event *bp, void *regs)
4480 {
4481 }
4482 #endif
4483
4484 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4485
4486 static void sw_perf_event_destroy(struct perf_event *event)
4487 {
4488         u64 event_id = event->attr.config;
4489
4490         WARN_ON(event->parent);
4491
4492         atomic_dec(&perf_swevent_enabled[event_id]);
4493 }
4494
4495 static const struct pmu *sw_perf_event_init(struct perf_event *event)
4496 {
4497         const struct pmu *pmu = NULL;
4498         u64 event_id = event->attr.config;
4499
4500         /*
4501          * Software events (currently) can't in general distinguish
4502          * between user, kernel and hypervisor events.
4503          * However, context switches and cpu migrations are considered
4504          * to be kernel events, and page faults are never hypervisor
4505          * events.
4506          */
4507         switch (event_id) {
4508         case PERF_COUNT_SW_CPU_CLOCK:
4509                 pmu = &perf_ops_cpu_clock;
4510
4511                 break;
4512         case PERF_COUNT_SW_TASK_CLOCK:
4513                 /*
4514                  * If the user instantiates this as a per-cpu event,
4515                  * use the cpu_clock event instead.
4516                  */
4517                 if (event->ctx->task)
4518                         pmu = &perf_ops_task_clock;
4519                 else
4520                         pmu = &perf_ops_cpu_clock;
4521
4522                 break;
4523         case PERF_COUNT_SW_PAGE_FAULTS:
4524         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4525         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4526         case PERF_COUNT_SW_CONTEXT_SWITCHES:
4527         case PERF_COUNT_SW_CPU_MIGRATIONS:
4528         case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4529         case PERF_COUNT_SW_EMULATION_FAULTS:
4530                 if (!event->parent) {
4531                         atomic_inc(&perf_swevent_enabled[event_id]);
4532                         event->destroy = sw_perf_event_destroy;
4533                 }
4534                 pmu = &perf_ops_generic;
4535                 break;
4536         }
4537
4538         return pmu;
4539 }
4540
4541 /*
4542  * Allocate and initialize a event structure
4543  */
4544 static struct perf_event *
4545 perf_event_alloc(struct perf_event_attr *attr,
4546                    int cpu,
4547                    struct perf_event_context *ctx,
4548                    struct perf_event *group_leader,
4549                    struct perf_event *parent_event,
4550                    perf_overflow_handler_t overflow_handler,
4551                    gfp_t gfpflags)
4552 {
4553         const struct pmu *pmu;
4554         struct perf_event *event;
4555         struct hw_perf_event *hwc;
4556         long err;
4557
4558         event = kzalloc(sizeof(*event), gfpflags);
4559         if (!event)
4560                 return ERR_PTR(-ENOMEM);
4561
4562         /*
4563          * Single events are their own group leaders, with an
4564          * empty sibling list:
4565          */
4566         if (!group_leader)
4567                 group_leader = event;
4568
4569         mutex_init(&event->child_mutex);
4570         INIT_LIST_HEAD(&event->child_list);
4571
4572         INIT_LIST_HEAD(&event->group_entry);
4573         INIT_LIST_HEAD(&event->event_entry);
4574         INIT_LIST_HEAD(&event->sibling_list);
4575         init_waitqueue_head(&event->waitq);
4576
4577         mutex_init(&event->mmap_mutex);
4578
4579         event->cpu              = cpu;
4580         event->attr             = *attr;
4581         event->group_leader     = group_leader;
4582         event->pmu              = NULL;
4583         event->ctx              = ctx;
4584         event->oncpu            = -1;
4585
4586         event->parent           = parent_event;
4587
4588         event->ns               = get_pid_ns(current->nsproxy->pid_ns);
4589         event->id               = atomic64_inc_return(&perf_event_id);
4590
4591         event->state            = PERF_EVENT_STATE_INACTIVE;
4592
4593         if (!overflow_handler && parent_event)
4594                 overflow_handler = parent_event->overflow_handler;
4595
4596         event->overflow_handler = overflow_handler;
4597
4598         if (attr->disabled)
4599                 event->state = PERF_EVENT_STATE_OFF;
4600
4601         pmu = NULL;
4602
4603         hwc = &event->hw;
4604         hwc->sample_period = attr->sample_period;
4605         if (attr->freq && attr->sample_freq)
4606                 hwc->sample_period = 1;
4607         hwc->last_period = hwc->sample_period;
4608
4609         atomic64_set(&hwc->period_left, hwc->sample_period);
4610
4611         /*
4612          * we currently do not support PERF_FORMAT_GROUP on inherited events
4613          */
4614         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4615                 goto done;
4616
4617         switch (attr->type) {
4618         case PERF_TYPE_RAW:
4619         case PERF_TYPE_HARDWARE:
4620         case PERF_TYPE_HW_CACHE:
4621                 pmu = hw_perf_event_init(event);
4622                 break;
4623
4624         case PERF_TYPE_SOFTWARE:
4625                 pmu = sw_perf_event_init(event);
4626                 break;
4627
4628         case PERF_TYPE_TRACEPOINT:
4629                 pmu = tp_perf_event_init(event);
4630                 break;
4631
4632         case PERF_TYPE_BREAKPOINT:
4633                 pmu = bp_perf_event_init(event);
4634                 break;
4635
4636
4637         default:
4638                 break;
4639         }
4640 done:
4641         err = 0;
4642         if (!pmu)
4643                 err = -EINVAL;
4644         else if (IS_ERR(pmu))
4645                 err = PTR_ERR(pmu);
4646
4647         if (err) {
4648                 if (event->ns)
4649                         put_pid_ns(event->ns);
4650                 kfree(event);
4651                 return ERR_PTR(err);
4652         }
4653
4654         event->pmu = pmu;
4655
4656         if (!event->parent) {
4657                 atomic_inc(&nr_events);
4658                 if (event->attr.mmap)
4659                         atomic_inc(&nr_mmap_events);
4660                 if (event->attr.comm)
4661                         atomic_inc(&nr_comm_events);
4662                 if (event->attr.task)
4663                         atomic_inc(&nr_task_events);
4664         }
4665
4666         return event;
4667 }
4668
4669 static int perf_copy_attr(struct perf_event_attr __user *uattr,
4670                           struct perf_event_attr *attr)
4671 {
4672         u32 size;
4673         int ret;
4674
4675         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4676                 return -EFAULT;
4677
4678         /*
4679          * zero the full structure, so that a short copy will be nice.
4680          */
4681         memset(attr, 0, sizeof(*attr));
4682
4683         ret = get_user(size, &uattr->size);
4684         if (ret)
4685                 return ret;
4686
4687         if (size > PAGE_SIZE)   /* silly large */
4688                 goto err_size;
4689
4690         if (!size)              /* abi compat */
4691                 size = PERF_ATTR_SIZE_VER0;
4692
4693         if (size < PERF_ATTR_SIZE_VER0)
4694                 goto err_size;
4695
4696         /*
4697          * If we're handed a bigger struct than we know of,
4698          * ensure all the unknown bits are 0 - i.e. new
4699          * user-space does not rely on any kernel feature
4700          * extensions we dont know about yet.
4701          */
4702         if (size > sizeof(*attr)) {
4703                 unsigned char __user *addr;
4704                 unsigned char __user *end;
4705                 unsigned char val;
4706
4707                 addr = (void __user *)uattr + sizeof(*attr);
4708                 end  = (void __user *)uattr + size;
4709
4710                 for (; addr < end; addr++) {
4711                         ret = get_user(val, addr);
4712                         if (ret)
4713                                 return ret;
4714                         if (val)
4715                                 goto err_size;
4716                 }
4717                 size = sizeof(*attr);
4718         }
4719
4720         ret = copy_from_user(attr, uattr, size);
4721         if (ret)
4722                 return -EFAULT;
4723
4724         /*
4725          * If the type exists, the corresponding creation will verify
4726          * the attr->config.
4727          */
4728         if (attr->type >= PERF_TYPE_MAX)
4729                 return -EINVAL;
4730
4731         if (attr->__reserved_1 || attr->__reserved_2)
4732                 return -EINVAL;
4733
4734         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4735                 return -EINVAL;
4736
4737         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4738                 return -EINVAL;
4739
4740 out:
4741         return ret;
4742
4743 err_size:
4744         put_user(sizeof(*attr), &uattr->size);
4745         ret = -E2BIG;
4746         goto out;
4747 }
4748
4749 static int perf_event_set_output(struct perf_event *event, int output_fd)
4750 {
4751         struct perf_event *output_event = NULL;
4752         struct file *output_file = NULL;
4753         struct perf_event *old_output;
4754         int fput_needed = 0;
4755         int ret = -EINVAL;
4756
4757         if (!output_fd)
4758                 goto set;
4759
4760         output_file = fget_light(output_fd, &fput_needed);
4761         if (!output_file)
4762                 return -EBADF;
4763
4764         if (output_file->f_op != &perf_fops)
4765                 goto out;
4766
4767         output_event = output_file->private_data;
4768
4769         /* Don't chain output fds */
4770         if (output_event->output)
4771                 goto out;
4772
4773         /* Don't set an output fd when we already have an output channel */
4774         if (event->data)
4775                 goto out;
4776
4777         atomic_long_inc(&output_file->f_count);
4778
4779 set:
4780         mutex_lock(&event->mmap_mutex);
4781         old_output = event->output;
4782         rcu_assign_pointer(event->output, output_event);
4783         mutex_unlock(&event->mmap_mutex);
4784
4785         if (old_output) {
4786                 /*
4787                  * we need to make sure no existing perf_output_*()
4788                  * is still referencing this event.
4789                  */
4790                 synchronize_rcu();
4791                 fput(old_output->filp);
4792         }
4793
4794         ret = 0;
4795 out:
4796         fput_light(output_file, fput_needed);
4797         return ret;
4798 }
4799
4800 /**
4801  * sys_perf_event_open - open a performance event, associate it to a task/cpu
4802  *
4803  * @attr_uptr:  event_id type attributes for monitoring/sampling
4804  * @pid:                target pid
4805  * @cpu:                target cpu
4806  * @group_fd:           group leader event fd
4807  */
4808 SYSCALL_DEFINE5(perf_event_open,
4809                 struct perf_event_attr __user *, attr_uptr,
4810                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4811 {
4812         struct perf_event *event, *group_leader;
4813         struct perf_event_attr attr;
4814         struct perf_event_context *ctx;
4815         struct file *event_file = NULL;
4816         struct file *group_file = NULL;
4817         int fput_needed = 0;
4818         int fput_needed2 = 0;
4819         int err;
4820
4821         /* for future expandability... */
4822         if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4823                 return -EINVAL;
4824
4825         err = perf_copy_attr(attr_uptr, &attr);
4826         if (err)
4827                 return err;
4828
4829         if (!attr.exclude_kernel) {
4830                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4831                         return -EACCES;
4832         }
4833
4834         if (attr.freq) {
4835                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4836                         return -EINVAL;
4837         }
4838
4839         /*
4840          * Get the target context (task or percpu):
4841          */
4842         ctx = find_get_context(pid, cpu);
4843         if (IS_ERR(ctx))
4844                 return PTR_ERR(ctx);
4845
4846         /*
4847          * Look up the group leader (we will attach this event to it):
4848          */
4849         group_leader = NULL;
4850         if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4851                 err = -EINVAL;
4852                 group_file = fget_light(group_fd, &fput_needed);
4853                 if (!group_file)
4854                         goto err_put_context;
4855                 if (group_file->f_op != &perf_fops)
4856                         goto err_put_context;
4857
4858                 group_leader = group_file->private_data;
4859                 /*
4860                  * Do not allow a recursive hierarchy (this new sibling
4861                  * becoming part of another group-sibling):
4862                  */
4863                 if (group_leader->group_leader != group_leader)
4864                         goto err_put_context;
4865                 /*
4866                  * Do not allow to attach to a group in a different
4867                  * task or CPU context:
4868                  */
4869                 if (group_leader->ctx != ctx)
4870                         goto err_put_context;
4871                 /*
4872                  * Only a group leader can be exclusive or pinned
4873                  */
4874                 if (attr.exclusive || attr.pinned)
4875                         goto err_put_context;
4876         }
4877
4878         event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4879                                      NULL, NULL, GFP_KERNEL);
4880         err = PTR_ERR(event);
4881         if (IS_ERR(event))
4882                 goto err_put_context;
4883
4884         err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4885         if (err < 0)
4886                 goto err_free_put_context;
4887
4888         event_file = fget_light(err, &fput_needed2);
4889         if (!event_file)
4890                 goto err_free_put_context;
4891
4892         if (flags & PERF_FLAG_FD_OUTPUT) {
4893                 err = perf_event_set_output(event, group_fd);
4894                 if (err)
4895                         goto err_fput_free_put_context;
4896         }
4897
4898         event->filp = event_file;
4899         WARN_ON_ONCE(ctx->parent_ctx);
4900         mutex_lock(&ctx->mutex);
4901         perf_install_in_context(ctx, event, cpu);
4902         ++ctx->generation;
4903         mutex_unlock(&ctx->mutex);
4904
4905         event->owner = current;
4906         get_task_struct(current);
4907         mutex_lock(&current->perf_event_mutex);
4908         list_add_tail(&event->owner_entry, &current->perf_event_list);
4909         mutex_unlock(&current->perf_event_mutex);
4910
4911 err_fput_free_put_context:
4912         fput_light(event_file, fput_needed2);
4913
4914 err_free_put_context:
4915         if (err < 0)
4916                 kfree(event);
4917
4918 err_put_context:
4919         if (err < 0)
4920                 put_ctx(ctx);
4921
4922         fput_light(group_file, fput_needed);
4923
4924         return err;
4925 }
4926
4927 /**
4928  * perf_event_create_kernel_counter
4929  *
4930  * @attr: attributes of the counter to create
4931  * @cpu: cpu in which the counter is bound
4932  * @pid: task to profile
4933  */
4934 struct perf_event *
4935 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4936                                  pid_t pid,
4937                                  perf_overflow_handler_t overflow_handler)
4938 {
4939         struct perf_event *event;
4940         struct perf_event_context *ctx;
4941         int err;
4942
4943         /*
4944          * Get the target context (task or percpu):
4945          */
4946
4947         ctx = find_get_context(pid, cpu);
4948         if (IS_ERR(ctx)) {
4949                 err = PTR_ERR(ctx);
4950                 goto err_exit;
4951         }
4952
4953         event = perf_event_alloc(attr, cpu, ctx, NULL,
4954                                  NULL, overflow_handler, GFP_KERNEL);
4955         if (IS_ERR(event)) {
4956                 err = PTR_ERR(event);
4957                 goto err_put_context;
4958         }
4959
4960         event->filp = NULL;
4961         WARN_ON_ONCE(ctx->parent_ctx);
4962         mutex_lock(&ctx->mutex);
4963         perf_install_in_context(ctx, event, cpu);
4964         ++ctx->generation;
4965         mutex_unlock(&ctx->mutex);
4966
4967         event->owner = current;
4968         get_task_struct(current);
4969         mutex_lock(&current->perf_event_mutex);
4970         list_add_tail(&event->owner_entry, &current->perf_event_list);
4971         mutex_unlock(&current->perf_event_mutex);
4972
4973         return event;
4974
4975  err_put_context:
4976         put_ctx(ctx);
4977  err_exit:
4978         return ERR_PTR(err);
4979 }
4980 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4981
4982 /*
4983  * inherit a event from parent task to child task:
4984  */
4985 static struct perf_event *
4986 inherit_event(struct perf_event *parent_event,
4987               struct task_struct *parent,
4988               struct perf_event_context *parent_ctx,
4989               struct task_struct *child,
4990               struct perf_event *group_leader,
4991               struct perf_event_context *child_ctx)
4992 {
4993         struct perf_event *child_event;
4994
4995         /*
4996          * Instead of creating recursive hierarchies of events,
4997          * we link inherited events back to the original parent,
4998          * which has a filp for sure, which we use as the reference
4999          * count:
5000          */
5001         if (parent_event->parent)
5002                 parent_event = parent_event->parent;
5003
5004         child_event = perf_event_alloc(&parent_event->attr,
5005                                            parent_event->cpu, child_ctx,
5006                                            group_leader, parent_event,
5007                                            NULL, GFP_KERNEL);
5008         if (IS_ERR(child_event))
5009                 return child_event;
5010         get_ctx(child_ctx);
5011
5012         /*
5013          * Make the child state follow the state of the parent event,
5014          * not its attr.disabled bit.  We hold the parent's mutex,
5015          * so we won't race with perf_event_{en, dis}able_family.
5016          */
5017         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
5018                 child_event->state = PERF_EVENT_STATE_INACTIVE;
5019         else
5020                 child_event->state = PERF_EVENT_STATE_OFF;
5021
5022         if (parent_event->attr.freq) {
5023                 u64 sample_period = parent_event->hw.sample_period;
5024                 struct hw_perf_event *hwc = &child_event->hw;
5025
5026                 hwc->sample_period = sample_period;
5027                 hwc->last_period   = sample_period;
5028
5029                 atomic64_set(&hwc->period_left, sample_period);
5030         }
5031
5032         child_event->overflow_handler = parent_event->overflow_handler;
5033
5034         /*
5035          * Link it up in the child's context:
5036          */
5037         add_event_to_ctx(child_event, child_ctx);
5038
5039         /*
5040          * Get a reference to the parent filp - we will fput it
5041          * when the child event exits. This is safe to do because
5042          * we are in the parent and we know that the filp still
5043          * exists and has a nonzero count:
5044          */
5045         atomic_long_inc(&parent_event->filp->f_count);
5046
5047         /*
5048          * Link this into the parent event's child list
5049          */
5050         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5051         mutex_lock(&parent_event->child_mutex);
5052         list_add_tail(&child_event->child_list, &parent_event->child_list);
5053         mutex_unlock(&parent_event->child_mutex);
5054
5055         return child_event;
5056 }
5057
5058 static int inherit_group(struct perf_event *parent_event,
5059               struct task_struct *parent,
5060               struct perf_event_context *parent_ctx,
5061               struct task_struct *child,
5062               struct perf_event_context *child_ctx)
5063 {
5064         struct perf_event *leader;
5065         struct perf_event *sub;
5066         struct perf_event *child_ctr;
5067
5068         leader = inherit_event(parent_event, parent, parent_ctx,
5069                                  child, NULL, child_ctx);
5070         if (IS_ERR(leader))
5071                 return PTR_ERR(leader);
5072         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5073                 child_ctr = inherit_event(sub, parent, parent_ctx,
5074                                             child, leader, child_ctx);
5075                 if (IS_ERR(child_ctr))
5076                         return PTR_ERR(child_ctr);
5077         }
5078         return 0;
5079 }
5080
5081 static void sync_child_event(struct perf_event *child_event,
5082                                struct task_struct *child)
5083 {
5084         struct perf_event *parent_event = child_event->parent;
5085         u64 child_val;
5086
5087         if (child_event->attr.inherit_stat)
5088                 perf_event_read_event(child_event, child);
5089
5090         child_val = atomic64_read(&child_event->count);
5091
5092         /*
5093          * Add back the child's count to the parent's count:
5094          */
5095         atomic64_add(child_val, &parent_event->count);
5096         atomic64_add(child_event->total_time_enabled,
5097                      &parent_event->child_total_time_enabled);
5098         atomic64_add(child_event->total_time_running,
5099                      &parent_event->child_total_time_running);
5100
5101         /*
5102          * Remove this event from the parent's list
5103          */
5104         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5105         mutex_lock(&parent_event->child_mutex);
5106         list_del_init(&child_event->child_list);
5107         mutex_unlock(&parent_event->child_mutex);
5108
5109         /*
5110          * Release the parent event, if this was the last
5111          * reference to it.
5112          */
5113         fput(parent_event->filp);
5114 }
5115
5116 static void
5117 __perf_event_exit_task(struct perf_event *child_event,
5118                          struct perf_event_context *child_ctx,
5119                          struct task_struct *child)
5120 {
5121         struct perf_event *parent_event;
5122
5123         perf_event_remove_from_context(child_event);
5124
5125         parent_event = child_event->parent;
5126         /*
5127          * It can happen that parent exits first, and has events
5128          * that are still around due to the child reference. These
5129          * events need to be zapped - but otherwise linger.
5130          */
5131         if (parent_event) {
5132                 sync_child_event(child_event, child);
5133                 free_event(child_event);
5134         }
5135 }
5136
5137 /*
5138  * When a child task exits, feed back event values to parent events.
5139  */
5140 void perf_event_exit_task(struct task_struct *child)
5141 {
5142         struct perf_event *child_event, *tmp;
5143         struct perf_event_context *child_ctx;
5144         unsigned long flags;
5145
5146         if (likely(!child->perf_event_ctxp)) {
5147                 perf_event_task(child, NULL, 0);
5148                 return;
5149         }
5150
5151         local_irq_save(flags);
5152         /*
5153          * We can't reschedule here because interrupts are disabled,
5154          * and either child is current or it is a task that can't be
5155          * scheduled, so we are now safe from rescheduling changing
5156          * our context.
5157          */
5158         child_ctx = child->perf_event_ctxp;
5159         __perf_event_task_sched_out(child_ctx);
5160
5161         /*
5162          * Take the context lock here so that if find_get_context is
5163          * reading child->perf_event_ctxp, we wait until it has
5164          * incremented the context's refcount before we do put_ctx below.
5165          */
5166         raw_spin_lock(&child_ctx->lock);
5167         child->perf_event_ctxp = NULL;
5168         /*
5169          * If this context is a clone; unclone it so it can't get
5170          * swapped to another process while we're removing all
5171          * the events from it.
5172          */
5173         unclone_ctx(child_ctx);
5174         update_context_time(child_ctx);
5175         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
5176
5177         /*
5178          * Report the task dead after unscheduling the events so that we
5179          * won't get any samples after PERF_RECORD_EXIT. We can however still
5180          * get a few PERF_RECORD_READ events.
5181          */
5182         perf_event_task(child, child_ctx, 0);
5183
5184         /*
5185          * We can recurse on the same lock type through:
5186          *
5187          *   __perf_event_exit_task()
5188          *     sync_child_event()
5189          *       fput(parent_event->filp)
5190          *         perf_release()
5191          *           mutex_lock(&ctx->mutex)
5192          *
5193          * But since its the parent context it won't be the same instance.
5194          */
5195         mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5196
5197 again:
5198         list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5199                                  group_entry)
5200                 __perf_event_exit_task(child_event, child_ctx, child);
5201
5202         list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5203                                  group_entry)
5204                 __perf_event_exit_task(child_event, child_ctx, child);
5205
5206         /*
5207          * If the last event was a group event, it will have appended all
5208          * its siblings to the list, but we obtained 'tmp' before that which
5209          * will still point to the list head terminating the iteration.
5210          */
5211         if (!list_empty(&child_ctx->pinned_groups) ||
5212             !list_empty(&child_ctx->flexible_groups))
5213                 goto again;
5214
5215         mutex_unlock(&child_ctx->mutex);
5216
5217         put_ctx(child_ctx);
5218 }
5219
5220 static void perf_free_event(struct perf_event *event,
5221                             struct perf_event_context *ctx)
5222 {
5223         struct perf_event *parent = event->parent;
5224
5225         if (WARN_ON_ONCE(!parent))
5226                 return;
5227
5228         mutex_lock(&parent->child_mutex);
5229         list_del_init(&event->child_list);
5230         mutex_unlock(&parent->child_mutex);
5231
5232         fput(parent->filp);
5233
5234         list_del_event(event, ctx);
5235         free_event(event);
5236 }
5237
5238 /*
5239  * free an unexposed, unused context as created by inheritance by
5240  * init_task below, used by fork() in case of fail.
5241  */
5242 void perf_event_free_task(struct task_struct *task)
5243 {
5244         struct perf_event_context *ctx = task->perf_event_ctxp;
5245         struct perf_event *event, *tmp;
5246
5247         if (!ctx)
5248                 return;
5249
5250         mutex_lock(&ctx->mutex);
5251 again:
5252         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5253                 perf_free_event(event, ctx);
5254
5255         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5256                                  group_entry)
5257                 perf_free_event(event, ctx);
5258
5259         if (!list_empty(&ctx->pinned_groups) ||
5260             !list_empty(&ctx->flexible_groups))
5261                 goto again;
5262
5263         mutex_unlock(&ctx->mutex);
5264
5265         put_ctx(ctx);
5266 }
5267
5268 static int
5269 inherit_task_group(struct perf_event *event, struct task_struct *parent,
5270                    struct perf_event_context *parent_ctx,
5271                    struct task_struct *child,
5272                    int *inherited_all)
5273 {
5274         int ret;
5275         struct perf_event_context *child_ctx = child->perf_event_ctxp;
5276
5277         if (!event->attr.inherit) {
5278                 *inherited_all = 0;
5279                 return 0;
5280         }
5281
5282         if (!child_ctx) {
5283                 /*
5284                  * This is executed from the parent task context, so
5285                  * inherit events that have been marked for cloning.
5286                  * First allocate and initialize a context for the
5287                  * child.
5288                  */
5289
5290                 child_ctx = kzalloc(sizeof(struct perf_event_context),
5291                                     GFP_KERNEL);
5292                 if (!child_ctx)
5293                         return -ENOMEM;
5294
5295                 __perf_event_init_context(child_ctx, child);
5296                 child->perf_event_ctxp = child_ctx;
5297                 get_task_struct(child);
5298         }
5299
5300         ret = inherit_group(event, parent, parent_ctx,
5301                             child, child_ctx);
5302
5303         if (ret)
5304                 *inherited_all = 0;
5305
5306         return ret;
5307 }
5308
5309
5310 /*
5311  * Initialize the perf_event context in task_struct
5312  */
5313 int perf_event_init_task(struct task_struct *child)
5314 {
5315         struct perf_event_context *child_ctx, *parent_ctx;
5316         struct perf_event_context *cloned_ctx;
5317         struct perf_event *event;
5318         struct task_struct *parent = current;
5319         int inherited_all = 1;
5320         int ret = 0;
5321
5322         child->perf_event_ctxp = NULL;
5323
5324         mutex_init(&child->perf_event_mutex);
5325         INIT_LIST_HEAD(&child->perf_event_list);
5326
5327         if (likely(!parent->perf_event_ctxp))
5328                 return 0;
5329
5330         /*
5331          * If the parent's context is a clone, pin it so it won't get
5332          * swapped under us.
5333          */
5334         parent_ctx = perf_pin_task_context(parent);
5335
5336         /*
5337          * No need to check if parent_ctx != NULL here; since we saw
5338          * it non-NULL earlier, the only reason for it to become NULL
5339          * is if we exit, and since we're currently in the middle of
5340          * a fork we can't be exiting at the same time.
5341          */
5342
5343         /*
5344          * Lock the parent list. No need to lock the child - not PID
5345          * hashed yet and not running, so nobody can access it.
5346          */
5347         mutex_lock(&parent_ctx->mutex);
5348
5349         /*
5350          * We dont have to disable NMIs - we are only looking at
5351          * the list, not manipulating it:
5352          */
5353         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5354                 ret = inherit_task_group(event, parent, parent_ctx, child,
5355                                          &inherited_all);
5356                 if (ret)
5357                         break;
5358         }
5359
5360         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5361                 ret = inherit_task_group(event, parent, parent_ctx, child,
5362                                          &inherited_all);
5363                 if (ret)
5364                         break;
5365         }
5366
5367         child_ctx = child->perf_event_ctxp;
5368
5369         if (child_ctx && inherited_all) {
5370                 /*
5371                  * Mark the child context as a clone of the parent
5372                  * context, or of whatever the parent is a clone of.
5373                  * Note that if the parent is a clone, it could get
5374                  * uncloned at any point, but that doesn't matter
5375                  * because the list of events and the generation
5376                  * count can't have changed since we took the mutex.
5377                  */
5378                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
5379                 if (cloned_ctx) {
5380                         child_ctx->parent_ctx = cloned_ctx;
5381                         child_ctx->parent_gen = parent_ctx->parent_gen;
5382                 } else {
5383                         child_ctx->parent_ctx = parent_ctx;
5384                         child_ctx->parent_gen = parent_ctx->generation;
5385                 }
5386                 get_ctx(child_ctx->parent_ctx);
5387         }
5388
5389         mutex_unlock(&parent_ctx->mutex);
5390
5391         perf_unpin_context(parent_ctx);
5392
5393         return ret;
5394 }
5395
5396 static void __cpuinit perf_event_init_cpu(int cpu)
5397 {
5398         struct perf_cpu_context *cpuctx;
5399
5400         cpuctx = &per_cpu(perf_cpu_context, cpu);
5401         __perf_event_init_context(&cpuctx->ctx, NULL);
5402
5403         spin_lock(&perf_resource_lock);
5404         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5405         spin_unlock(&perf_resource_lock);
5406
5407         hw_perf_event_setup(cpu);
5408 }
5409
5410 #ifdef CONFIG_HOTPLUG_CPU
5411 static void __perf_event_exit_cpu(void *info)
5412 {
5413         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5414         struct perf_event_context *ctx = &cpuctx->ctx;
5415         struct perf_event *event, *tmp;
5416
5417         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5418                 __perf_event_remove_from_context(event);
5419         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5420                 __perf_event_remove_from_context(event);
5421 }
5422 static void perf_event_exit_cpu(int cpu)
5423 {
5424         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5425         struct perf_event_context *ctx = &cpuctx->ctx;
5426
5427         mutex_lock(&ctx->mutex);
5428         smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5429         mutex_unlock(&ctx->mutex);
5430 }
5431 #else
5432 static inline void perf_event_exit_cpu(int cpu) { }
5433 #endif
5434
5435 static int __cpuinit
5436 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5437 {
5438         unsigned int cpu = (long)hcpu;
5439
5440         switch (action) {
5441
5442         case CPU_UP_PREPARE:
5443         case CPU_UP_PREPARE_FROZEN:
5444                 perf_event_init_cpu(cpu);
5445                 break;
5446
5447         case CPU_ONLINE:
5448         case CPU_ONLINE_FROZEN:
5449                 hw_perf_event_setup_online(cpu);
5450                 break;
5451
5452         case CPU_DOWN_PREPARE:
5453         case CPU_DOWN_PREPARE_FROZEN:
5454                 perf_event_exit_cpu(cpu);
5455                 break;
5456
5457         case CPU_DEAD:
5458                 hw_perf_event_setup_offline(cpu);
5459                 break;
5460
5461         default:
5462                 break;
5463         }
5464
5465         return NOTIFY_OK;
5466 }
5467
5468 /*
5469  * This has to have a higher priority than migration_notifier in sched.c.
5470  */
5471 static struct notifier_block __cpuinitdata perf_cpu_nb = {
5472         .notifier_call          = perf_cpu_notify,
5473         .priority               = 20,
5474 };
5475
5476 void __init perf_event_init(void)
5477 {
5478         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5479                         (void *)(long)smp_processor_id());
5480         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5481                         (void *)(long)smp_processor_id());
5482         register_cpu_notifier(&perf_cpu_nb);
5483 }
5484
5485 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5486 {
5487         return sprintf(buf, "%d\n", perf_reserved_percpu);
5488 }
5489
5490 static ssize_t
5491 perf_set_reserve_percpu(struct sysdev_class *class,
5492                         const char *buf,
5493                         size_t count)
5494 {
5495         struct perf_cpu_context *cpuctx;
5496         unsigned long val;
5497         int err, cpu, mpt;
5498
5499         err = strict_strtoul(buf, 10, &val);
5500         if (err)
5501                 return err;
5502         if (val > perf_max_events)
5503                 return -EINVAL;
5504
5505         spin_lock(&perf_resource_lock);
5506         perf_reserved_percpu = val;
5507         for_each_online_cpu(cpu) {
5508                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5509                 raw_spin_lock_irq(&cpuctx->ctx.lock);
5510                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5511                           perf_max_events - perf_reserved_percpu);
5512                 cpuctx->max_pertask = mpt;
5513                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5514         }
5515         spin_unlock(&perf_resource_lock);
5516
5517         return count;
5518 }
5519
5520 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5521 {
5522         return sprintf(buf, "%d\n", perf_overcommit);
5523 }
5524
5525 static ssize_t
5526 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5527 {
5528         unsigned long val;
5529         int err;
5530
5531         err = strict_strtoul(buf, 10, &val);
5532         if (err)
5533                 return err;
5534         if (val > 1)
5535                 return -EINVAL;
5536
5537         spin_lock(&perf_resource_lock);
5538         perf_overcommit = val;
5539         spin_unlock(&perf_resource_lock);
5540
5541         return count;
5542 }
5543
5544 static SYSDEV_CLASS_ATTR(
5545                                 reserve_percpu,
5546                                 0644,
5547                                 perf_show_reserve_percpu,
5548                                 perf_set_reserve_percpu
5549                         );
5550
5551 static SYSDEV_CLASS_ATTR(
5552                                 overcommit,
5553                                 0644,
5554                                 perf_show_overcommit,
5555                                 perf_set_overcommit
5556                         );
5557
5558 static struct attribute *perfclass_attrs[] = {
5559         &attr_reserve_percpu.attr,
5560         &attr_overcommit.attr,
5561         NULL
5562 };
5563
5564 static struct attribute_group perfclass_attr_group = {
5565         .attrs                  = perfclass_attrs,
5566         .name                   = "perf_events",
5567 };
5568
5569 static int __init perf_event_sysfs_init(void)
5570 {
5571         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5572                                   &perfclass_attr_group);
5573 }
5574 device_initcall(perf_event_sysfs_init);